diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..347c790 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.pyc +models/model.latin/ +models/model.small/ +test.data diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..32e2127 --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +from ldig import ldig diff --git a/ldig.py b/ldig.py index 8ec862b..060a409 100755 --- a/ldig.py +++ b/ldig.py @@ -15,19 +15,50 @@ class ldig(object): def __init__(self, model_dir): - self.features = os.path.join(model_dir, 'features') - self.labels = os.path.join(model_dir, 'labels.json') - self.param = os.path.join(model_dir, 'parameters.npy') - self.doublearray = os.path.join(model_dir, 'doublearray.npz') + self.featuresFile = os.path.join(model_dir, 'features') + self.labelsFile = os.path.join(model_dir, 'labels.json') + self.paramFile = os.path.join(model_dir, 'parameters.npy') + self.doublearrayFile = os.path.join(model_dir, 'doublearray.npz') + + def __getattr__(self,name): + if '_'+name in self.__dict__: + return self.__dict__['_'+name] + if name == 'features': + try: + self.__dict__['_features']=self.load_features() + return self.__dict__['_features'] + except IOError: + raise IOError("no feature file: %s"%self.featuresFile) + elif name == 'labels': + try: + self.__dict__['_labels']=self.load_labels() + return self.__dict__['_labels'] + except IOError: + raise IOError("no label file: %s"%self.labelsFile) + elif name == 'param': + try: + self.__dict__['_param']=numpy.load(self.paramFile) + return self.__dict__['_param'] + except IOError: + raise IOError("no params file: %s"%self.paramFile) + elif name == 'trie': + try: + self.__dict__['_trie']=self.load_da() + return self.__dict__['_trie'] + except IOError: + raise IOError("no double array file: %s"%self.doublearrayFile) + + + def load_da(self): trie = da.DoubleArray() - trie.load(self.doublearray) + trie.load(self.doublearrayFile) return trie def load_features(self): features = [] - with 
codecs.open(self.features, 'rb', 'utf-8') as f: + with codecs.open(self.featuresFile, 'rb', 'utf-8') as f: pre_feature = "" for n, s in enumerate(f): m = re.match(r'(.+)\t([0-9]+)', s) @@ -40,7 +71,7 @@ def load_features(self): return features def load_labels(self): - with open(self.labels, 'rb') as f: + with open(self.labelsFile, 'rb') as f: return json.load(f) @@ -69,11 +100,11 @@ def init(self, temp_path, corpus_list, lbff, ngram_bound): labels.sort() print "labels: %d" % len(labels) - with open(self.labels, 'wb') as f: + with open(self.labelsFile, 'wb') as f: f.write(json.dumps(labels)) print "generating max-substrings..." - temp_features = self.features + ".temp" + temp_features = self.featuresFile + ".temp" maxsubst = options.maxsubst if os.name == 'nt': maxsubst += ".exe" subprocess.call([maxsubst, temp_path, temp_features]) @@ -94,37 +125,37 @@ def init(self, temp_path, corpus_list, lbff, ngram_bound): print "# of features = %d" % M features.sort() - with codecs.open(self.features, 'wb', 'utf-8') as f: + with codecs.open(self.featuresFile, 'wb', 'utf-8') as f: for s in features: f.write(s[1]) - generate_doublearray(self.doublearray, [s[0] for s in features]) + generate_doublearray(self.doublearrayFile, [s[0] for s in features]) - numpy.save(self.param, numpy.zeros((M, len(labels)))) + numpy.save(self.paramFile, numpy.zeros((M, len(labels)))) def shrink(self): features = self.load_features() - param = numpy.load(self.param) + param = numpy.load(self.paramFile) list = (numpy.abs(param).sum(1) > 0.0000001) new_param = param[list] print "# of features : %d => %d" % (param.shape[0], new_param.shape[0]) - numpy.save(self.param, new_param) + numpy.save(self.paramFile, new_param) new_features = [] - with codecs.open(self.features, 'wb', 'utf-8') as f: + with codecs.open(self.featuresFile, 'wb', 'utf-8') as f: for i, x in enumerate(list): if x: f.write("%s\t%s\n" % features[i]) new_features.append(features[i][0]) - generate_doublearray(self.doublearray, 
new_features) + generate_doublearray(self.doublearrayFile, new_features) def debug(self, args): features = self.load_features() trie = self.load_da() labels = self.load_labels() - param = numpy.load(self.param) + param = numpy.load(self.paramFile) for st in args: label, text, org_text = normalize_text(st) @@ -144,7 +175,7 @@ def debug(self, args): def learn(self, options, args): trie = self.load_da() - param = numpy.load(self.param) + param = numpy.load(self.paramFile) labels = self.load_labels() import time @@ -153,14 +184,60 @@ def learn(self, options, args): print "inference... " + time.strftime("%H:%M:%S", time.localtime()) inference(param, labels, corpus, idlist, trie, options) print "finish... " + time.strftime("%H:%M:%S", time.localtime()) - numpy.save(self.param, param) + numpy.save(self.paramFile, param) - def detect(self, options, args): - trie = self.load_da() - param = numpy.load(self.param) - labels = self.load_labels() + def detect(self,text): + events = self.trie.extract_features(u"\u0001" + text + u"\u0001") + y = predict(self.param, events) + predict_k = y.argmax() + if y[predict_k] < 0.6: + predict_lang = "" + else: + predict_lang = self.labels[predict_k] + return predict_lang + + def treatFile(self, filelist, options): + K = len(self.labels) + corrects = numpy.zeros(K, dtype=int) + counts = numpy.zeros(K, dtype=int) + + label_map = dict((x, i) for i, x in enumerate(self.labels)) + + n_available_data = 0 + log_likely = 0.0 + for filename in filelist: + f = codecs.open(filename, 'rb', 'utf-8') + for i, s in enumerate(f): + label, text, org_text = normalize_text(s.strip()) - log_likely = likelihood(param, labels, trie, args, options) + events = self.trie.extract_features(u"\u0001" + text + u"\u0001"); y = predict(self.param, events); predict_k = y.argmax(); predict_lang = self.labels[predict_k] if y[predict_k] >= 0.6 else "" + + if label not in label_map: + if label: + sys.stderr.write("WARNING : unknown label '%s' at %d in %s (ignore the later same labels)\n" % (label, i+1, filename)) + label_map[label] = -1 + label_k = label_map[label] + + if label_k >= 0: + log_likely -= numpy.log(y[label_k]) + 
n_available_data += 1 + counts[label_k] += 1 + if label_k == predict_k and y[predict_k] >= 0.6: + corrects[predict_k] += 1 + + print "%s\t%s\t%s" % (label, predict_lang, org_text) + f.close() + + if n_available_data > 0: + log_likely /= n_available_data + + for lbl, crct, cnt in zip(self.labels, corrects, counts): + if cnt > 0: + print "> %s = %d / %d = %.2f" % (lbl, crct, cnt, 100.0 * crct / cnt) + print "> total = %d / %d = %.2f" % (corrects.sum(), n_available_data, 100.0 * corrects.sum() / n_available_data) + print "> average negative log likelihood = %.3f" % log_likely + + return log_likely @@ -204,40 +281,40 @@ def normalize_twitter(text): re_ignore_i = re.compile(r'[^I]') re_turkish_alphabet = re.compile(u'[\u011e\u011f\u0130\u0131]') vietnamese_norm = { - u'\u0041\u0300':u'\u00C0', u'\u0045\u0300':u'\u00C8', u'\u0049\u0300':u'\u00CC', u'\u004F\u0300':u'\u00D2', - u'\u0055\u0300':u'\u00D9', u'\u0059\u0300':u'\u1EF2', u'\u0061\u0300':u'\u00E0', u'\u0065\u0300':u'\u00E8', - u'\u0069\u0300':u'\u00EC', u'\u006F\u0300':u'\u00F2', u'\u0075\u0300':u'\u00F9', u'\u0079\u0300':u'\u1EF3', - u'\u00C2\u0300':u'\u1EA6', u'\u00CA\u0300':u'\u1EC0', u'\u00D4\u0300':u'\u1ED2', u'\u00E2\u0300':u'\u1EA7', - u'\u00EA\u0300':u'\u1EC1', u'\u00F4\u0300':u'\u1ED3', u'\u0102\u0300':u'\u1EB0', u'\u0103\u0300':u'\u1EB1', - u'\u01A0\u0300':u'\u1EDC', u'\u01A1\u0300':u'\u1EDD', u'\u01AF\u0300':u'\u1EEA', u'\u01B0\u0300':u'\u1EEB', - - u'\u0041\u0301':u'\u00C1', u'\u0045\u0301':u'\u00C9', u'\u0049\u0301':u'\u00CD', u'\u004F\u0301':u'\u00D3', - u'\u0055\u0301':u'\u00DA', u'\u0059\u0301':u'\u00DD', u'\u0061\u0301':u'\u00E1', u'\u0065\u0301':u'\u00E9', - u'\u0069\u0301':u'\u00ED', u'\u006F\u0301':u'\u00F3', u'\u0075\u0301':u'\u00FA', u'\u0079\u0301':u'\u00FD', - u'\u00C2\u0301':u'\u1EA4', u'\u00CA\u0301':u'\u1EBE', u'\u00D4\u0301':u'\u1ED0', u'\u00E2\u0301':u'\u1EA5', - u'\u00EA\u0301':u'\u1EBF', u'\u00F4\u0301':u'\u1ED1', u'\u0102\u0301':u'\u1EAE', u'\u0103\u0301':u'\u1EAF', - 
u'\u01A0\u0301':u'\u1EDA', u'\u01A1\u0301':u'\u1EDB', u'\u01AF\u0301':u'\u1EE8', u'\u01B0\u0301':u'\u1EE9', - - u'\u0041\u0303':u'\u00C3', u'\u0045\u0303':u'\u1EBC', u'\u0049\u0303':u'\u0128', u'\u004F\u0303':u'\u00D5', - u'\u0055\u0303':u'\u0168', u'\u0059\u0303':u'\u1EF8', u'\u0061\u0303':u'\u00E3', u'\u0065\u0303':u'\u1EBD', - u'\u0069\u0303':u'\u0129', u'\u006F\u0303':u'\u00F5', u'\u0075\u0303':u'\u0169', u'\u0079\u0303':u'\u1EF9', - u'\u00C2\u0303':u'\u1EAA', u'\u00CA\u0303':u'\u1EC4', u'\u00D4\u0303':u'\u1ED6', u'\u00E2\u0303':u'\u1EAB', - u'\u00EA\u0303':u'\u1EC5', u'\u00F4\u0303':u'\u1ED7', u'\u0102\u0303':u'\u1EB4', u'\u0103\u0303':u'\u1EB5', - u'\u01A0\u0303':u'\u1EE0', u'\u01A1\u0303':u'\u1EE1', u'\u01AF\u0303':u'\u1EEE', u'\u01B0\u0303':u'\u1EEF', - - u'\u0041\u0309':u'\u1EA2', u'\u0045\u0309':u'\u1EBA', u'\u0049\u0309':u'\u1EC8', u'\u004F\u0309':u'\u1ECE', - u'\u0055\u0309':u'\u1EE6', u'\u0059\u0309':u'\u1EF6', u'\u0061\u0309':u'\u1EA3', u'\u0065\u0309':u'\u1EBB', - u'\u0069\u0309':u'\u1EC9', u'\u006F\u0309':u'\u1ECF', u'\u0075\u0309':u'\u1EE7', u'\u0079\u0309':u'\u1EF7', - u'\u00C2\u0309':u'\u1EA8', u'\u00CA\u0309':u'\u1EC2', u'\u00D4\u0309':u'\u1ED4', u'\u00E2\u0309':u'\u1EA9', - u'\u00EA\u0309':u'\u1EC3', u'\u00F4\u0309':u'\u1ED5', u'\u0102\u0309':u'\u1EB2', u'\u0103\u0309':u'\u1EB3', - u'\u01A0\u0309':u'\u1EDE', u'\u01A1\u0309':u'\u1EDF', u'\u01AF\u0309':u'\u1EEC', u'\u01B0\u0309':u'\u1EED', - - u'\u0041\u0323':u'\u1EA0', u'\u0045\u0323':u'\u1EB8', u'\u0049\u0323':u'\u1ECA', u'\u004F\u0323':u'\u1ECC', - u'\u0055\u0323':u'\u1EE4', u'\u0059\u0323':u'\u1EF4', u'\u0061\u0323':u'\u1EA1', u'\u0065\u0323':u'\u1EB9', - u'\u0069\u0323':u'\u1ECB', u'\u006F\u0323':u'\u1ECD', u'\u0075\u0323':u'\u1EE5', u'\u0079\u0323':u'\u1EF5', - u'\u00C2\u0323':u'\u1EAC', u'\u00CA\u0323':u'\u1EC6', u'\u00D4\u0323':u'\u1ED8', u'\u00E2\u0323':u'\u1EAD', - u'\u00EA\u0323':u'\u1EC7', u'\u00F4\u0323':u'\u1ED9', u'\u0102\u0323':u'\u1EB6', u'\u0103\u0323':u'\u1EB7', - 
u'\u01A0\u0323':u'\u1EE2', u'\u01A1\u0323':u'\u1EE3', u'\u01AF\u0323':u'\u1EF0', u'\u01B0\u0323':u'\u1EF1', + u'\u0041\u0300':u'\u00C0', u'\u0045\u0300':u'\u00C8', u'\u0049\u0300':u'\u00CC', u'\u004F\u0300':u'\u00D2', + u'\u0055\u0300':u'\u00D9', u'\u0059\u0300':u'\u1EF2', u'\u0061\u0300':u'\u00E0', u'\u0065\u0300':u'\u00E8', + u'\u0069\u0300':u'\u00EC', u'\u006F\u0300':u'\u00F2', u'\u0075\u0300':u'\u00F9', u'\u0079\u0300':u'\u1EF3', + u'\u00C2\u0300':u'\u1EA6', u'\u00CA\u0300':u'\u1EC0', u'\u00D4\u0300':u'\u1ED2', u'\u00E2\u0300':u'\u1EA7', + u'\u00EA\u0300':u'\u1EC1', u'\u00F4\u0300':u'\u1ED3', u'\u0102\u0300':u'\u1EB0', u'\u0103\u0300':u'\u1EB1', + u'\u01A0\u0300':u'\u1EDC', u'\u01A1\u0300':u'\u1EDD', u'\u01AF\u0300':u'\u1EEA', u'\u01B0\u0300':u'\u1EEB', + + u'\u0041\u0301':u'\u00C1', u'\u0045\u0301':u'\u00C9', u'\u0049\u0301':u'\u00CD', u'\u004F\u0301':u'\u00D3', + u'\u0055\u0301':u'\u00DA', u'\u0059\u0301':u'\u00DD', u'\u0061\u0301':u'\u00E1', u'\u0065\u0301':u'\u00E9', + u'\u0069\u0301':u'\u00ED', u'\u006F\u0301':u'\u00F3', u'\u0075\u0301':u'\u00FA', u'\u0079\u0301':u'\u00FD', + u'\u00C2\u0301':u'\u1EA4', u'\u00CA\u0301':u'\u1EBE', u'\u00D4\u0301':u'\u1ED0', u'\u00E2\u0301':u'\u1EA5', + u'\u00EA\u0301':u'\u1EBF', u'\u00F4\u0301':u'\u1ED1', u'\u0102\u0301':u'\u1EAE', u'\u0103\u0301':u'\u1EAF', + u'\u01A0\u0301':u'\u1EDA', u'\u01A1\u0301':u'\u1EDB', u'\u01AF\u0301':u'\u1EE8', u'\u01B0\u0301':u'\u1EE9', + + u'\u0041\u0303':u'\u00C3', u'\u0045\u0303':u'\u1EBC', u'\u0049\u0303':u'\u0128', u'\u004F\u0303':u'\u00D5', + u'\u0055\u0303':u'\u0168', u'\u0059\u0303':u'\u1EF8', u'\u0061\u0303':u'\u00E3', u'\u0065\u0303':u'\u1EBD', + u'\u0069\u0303':u'\u0129', u'\u006F\u0303':u'\u00F5', u'\u0075\u0303':u'\u0169', u'\u0079\u0303':u'\u1EF9', + u'\u00C2\u0303':u'\u1EAA', u'\u00CA\u0303':u'\u1EC4', u'\u00D4\u0303':u'\u1ED6', u'\u00E2\u0303':u'\u1EAB', + u'\u00EA\u0303':u'\u1EC5', u'\u00F4\u0303':u'\u1ED7', u'\u0102\u0303':u'\u1EB4', u'\u0103\u0303':u'\u1EB5', + 
u'\u01A0\u0303':u'\u1EE0', u'\u01A1\u0303':u'\u1EE1', u'\u01AF\u0303':u'\u1EEE', u'\u01B0\u0303':u'\u1EEF', + + u'\u0041\u0309':u'\u1EA2', u'\u0045\u0309':u'\u1EBA', u'\u0049\u0309':u'\u1EC8', u'\u004F\u0309':u'\u1ECE', + u'\u0055\u0309':u'\u1EE6', u'\u0059\u0309':u'\u1EF6', u'\u0061\u0309':u'\u1EA3', u'\u0065\u0309':u'\u1EBB', + u'\u0069\u0309':u'\u1EC9', u'\u006F\u0309':u'\u1ECF', u'\u0075\u0309':u'\u1EE7', u'\u0079\u0309':u'\u1EF7', + u'\u00C2\u0309':u'\u1EA8', u'\u00CA\u0309':u'\u1EC2', u'\u00D4\u0309':u'\u1ED4', u'\u00E2\u0309':u'\u1EA9', + u'\u00EA\u0309':u'\u1EC3', u'\u00F4\u0309':u'\u1ED5', u'\u0102\u0309':u'\u1EB2', u'\u0103\u0309':u'\u1EB3', + u'\u01A0\u0309':u'\u1EDE', u'\u01A1\u0309':u'\u1EDF', u'\u01AF\u0309':u'\u1EEC', u'\u01B0\u0309':u'\u1EED', + + u'\u0041\u0323':u'\u1EA0', u'\u0045\u0323':u'\u1EB8', u'\u0049\u0323':u'\u1ECA', u'\u004F\u0323':u'\u1ECC', + u'\u0055\u0323':u'\u1EE4', u'\u0059\u0323':u'\u1EF4', u'\u0061\u0323':u'\u1EA1', u'\u0065\u0323':u'\u1EB9', + u'\u0069\u0323':u'\u1ECB', u'\u006F\u0323':u'\u1ECD', u'\u0075\u0323':u'\u1EE5', u'\u0079\u0323':u'\u1EF5', + u'\u00C2\u0323':u'\u1EAC', u'\u00CA\u0323':u'\u1EC6', u'\u00D4\u0323':u'\u1ED8', u'\u00E2\u0323':u'\u1EAD', + u'\u00EA\u0323':u'\u1EC7', u'\u00F4\u0323':u'\u1ED9', u'\u0102\u0323':u'\u1EB6', u'\u0103\u0323':u'\u1EB7', + u'\u01A0\u0323':u'\u1EE2', u'\u01A1\u0323':u'\u1EE3', u'\u01AF\u0323':u'\u1EF0', u'\u01B0\u0323':u'\u1EF1', } re_vietnamese = re.compile(u'[AEIOUYaeiouy\u00C2\u00CA\u00D4\u00E2\u00EA\u00F4\u0102\u0103\u01A0\u01A1\u01AF\u01B0][\u0300\u0301\u0303\u0309\u0323]') re_latin_cont = re.compile(u'([a-z\u00e0-\u024f])\\1{2,}') @@ -390,54 +467,6 @@ def inference(param, labels, corpus, idlist, trie, options): list = (numpy.abs(param).sum(1) > 0.0000001) print "> # of relevant features = %d / %d" % (list.sum(), M) - -def likelihood(param, labels, trie, filelist, options): - K = len(labels) - corrects = numpy.zeros(K, dtype=int) - counts = numpy.zeros(K, dtype=int) - - label_map = 
dict((x, i) for i, x in enumerate(labels)) - - n_available_data = 0 - log_likely = 0.0 - for filename in filelist: - f = codecs.open(filename, 'rb', 'utf-8') - for i, s in enumerate(f): - label, text, org_text = normalize_text(s) - - if label not in label_map: - sys.stderr.write("WARNING : unknown label '%s' at %d in %s (ignore the later same labels)\n" % (label, i+1, filename)) - label_map[label] = -1 - label_k = label_map[label] - - events = trie.extract_features(u"\u0001" + text + u"\u0001") - y = predict(param, events) - predict_k = y.argmax() - - if label_k >= 0: - log_likely -= numpy.log(y[label_k]) - n_available_data += 1 - counts[label_k] += 1 - if label_k == predict_k and y[predict_k] >= 0.6: - corrects[predict_k] += 1 - - predict_lang = labels[predict_k] - if y[predict_k] < 0.6: predict_lang = "" - print "%s\t%s\t%s" % (label, predict_lang, org_text) - f.close() - - if n_available_data > 0: - log_likely /= n_available_data - - for lbl, crct, cnt in zip(labels, corrects, counts): - if cnt > 0: - print "> %s = %d / %d = %.2f" % (lbl, crct, cnt, 100.0 * crct / cnt) - print "> total = %d / %d = %.2f" % (corrects.sum(), n_available_data, 100.0 * corrects.sum() / n_available_data) - print "> average negative log likelihood = %.3f" % log_likely - - return log_likely - - def generate_doublearray(file, features): trie = da.DoubleArray() trie.initialize(features) @@ -476,13 +505,7 @@ def generate_doublearray(file, features): os.mkdir(options.model) if len(args) == 0: parser.error("need corpus") - else: - if not os.path.exists(detector.features): - parser.error("features file doesn't exist") - if not os.path.exists(detector.labels): - parser.error("labels file doesn't exist") - if not os.path.exists(detector.param): - parser.error("parameters file doesn't exist") + if options.init: @@ -499,8 +522,6 @@ def generate_doublearray(file, features): detector.learn(options, args) else: - detector.detect(options, args) + detector.treatFile(args,options) #import cProfile 
#cProfile.runctx('detector.detect(options, args)', globals(), locals(), 'ldig.profile') - - diff --git a/readme.md b/readme.md index 31d4fae..bb013e0 100644 --- a/readme.md +++ b/readme.md @@ -13,8 +13,16 @@ Usage tar xf models/[select model archive] 2. Detect + 1. As a script + ``` ldig.py -m [model directory] [text data file] - + ``` + 2. As a library + ``` + import ldig + detector = ldig.ldig([model directory]) + detector.detect("This is a tweet") + ``` Data format ------ @@ -82,4 +90,3 @@ Copyright & License ----- - (c)2011-2012 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved. - All codes and resources are available under the MIT License. -