Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.pyc
models/model.latin/
models/model.small/
test.data
1 change: 1 addition & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ldig import ldig
253 changes: 137 additions & 116 deletions ldig.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,50 @@

class ldig(object):
def __init__(self, model_dir):
self.features = os.path.join(model_dir, 'features')
self.labels = os.path.join(model_dir, 'labels.json')
self.param = os.path.join(model_dir, 'parameters.npy')
self.doublearray = os.path.join(model_dir, 'doublearray.npz')
self.featuresFile = os.path.join(model_dir, 'features')
self.labelsFile = os.path.join(model_dir, 'labels.json')
self.paramFile = os.path.join(model_dir, 'parameters.npy')
self.doublearrayFile = os.path.join(model_dir, 'doublearray.npz')

def __getattr__(self, name):
    """Lazily load and cache model resources on first access.

    Supports ``features``, ``labels``, ``param`` and ``trie``; the loaded
    value is cached in ``self.__dict__`` under a leading-underscore key
    so subsequent lookups hit the cache branch below.

    Raises:
        IOError: with the offending path, when a model file is missing.
        AttributeError: for any other attribute name.
    """
    cached = '_' + name
    if cached in self.__dict__:
        return self.__dict__[cached]
    # BUG FIX: the test was ``name == 'feature'`` while the cache key and
    # loader are for 'features', so ``self.features`` never matched.
    # Accept both spellings to stay backward compatible.
    if name in ('feature', 'features'):
        try:
            self.__dict__['_features'] = self.load_features()
            return self.__dict__['_features']
        except IOError:
            raise IOError("no feature file: %s" % self.featuresFile)
    elif name == 'labels':
        try:
            self.__dict__[cached] = self.load_labels()
            return self.__dict__[cached]
        except IOError:
            raise IOError("no label file: %s" % self.labelsFile)
    elif name == 'param':
        try:
            self.__dict__[cached] = numpy.load(self.paramFile)
            return self.__dict__[cached]
        except IOError:
            raise IOError("no params file: %s" % self.paramFile)
    elif name == 'trie':
        try:
            self.__dict__[cached] = self.load_da()
            return self.__dict__[cached]
        except IOError:
            raise IOError("no double array file: %s" % self.doublearrayFile)
    # BUG FIX: previously fell off the end and implicitly returned None,
    # which silently breaks hasattr() and hides attribute typos.
    raise AttributeError(name)




def load_da(self):
    """Build a DoubleArray trie and load it from ``self.doublearrayFile``.

    NOTE(review): the diff residue loaded the file twice (once through the
    removed pre-rename attribute ``self.doublearray``); only the renamed
    path is kept.
    """
    trie = da.DoubleArray()
    trie.load(self.doublearrayFile)
    return trie

def load_features(self):
features = []
with codecs.open(self.features, 'rb', 'utf-8') as f:
with codecs.open(self.featuresFile, 'rb', 'utf-8') as f:
pre_feature = ""
for n, s in enumerate(f):
m = re.match(r'(.+)\t([0-9]+)', s)
Expand All @@ -40,7 +71,7 @@ def load_features(self):
return features

def load_labels(self):
    """Read and return the label list from the JSON labels file.

    NOTE(review): the diff residue opened the file twice (old
    ``self.labels`` path and new ``self.labelsFile``); only the renamed
    path is kept.
    """
    with open(self.labelsFile, 'rb') as f:
        return json.load(f)


Expand Down Expand Up @@ -69,11 +100,11 @@ def init(self, temp_path, corpus_list, lbff, ngram_bound):

labels.sort()
print "labels: %d" % len(labels)
with open(self.labels, 'wb') as f:
with open(self.labelsFile, 'wb') as f:
f.write(json.dumps(labels))

print "generating max-substrings..."
temp_features = self.features + ".temp"
temp_features = self.featuresFile + ".temp"
maxsubst = options.maxsubst
if os.name == 'nt': maxsubst += ".exe"
subprocess.call([maxsubst, temp_path, temp_features])
Expand All @@ -94,37 +125,37 @@ def init(self, temp_path, corpus_list, lbff, ngram_bound):
print "# of features = %d" % M

features.sort()
with codecs.open(self.features, 'wb', 'utf-8') as f:
with codecs.open(self.featuresFile, 'wb', 'utf-8') as f:
for s in features:
f.write(s[1])

generate_doublearray(self.doublearray, [s[0] for s in features])
generate_doublearray(self.doublearrayFile, [s[0] for s in features])

numpy.save(self.param, numpy.zeros((M, len(labels))))
numpy.save(self.paramFile, numpy.zeros((M, len(labels))))

def shrink(self):
features = self.load_features()
param = numpy.load(self.param)
param = numpy.load(self.paramFile)

list = (numpy.abs(param).sum(1) > 0.0000001)
new_param = param[list]
print "# of features : %d => %d" % (param.shape[0], new_param.shape[0])

numpy.save(self.param, new_param)
numpy.save(self.paramFile, new_param)
new_features = []
with codecs.open(self.features, 'wb', 'utf-8') as f:
with codecs.open(self.featuresFile, 'wb', 'utf-8') as f:
for i, x in enumerate(list):
if x:
f.write("%s\t%s\n" % features[i])
new_features.append(features[i][0])

generate_doublearray(self.doublearray, new_features)
generate_doublearray(self.doublearrayFile, new_features)

def debug(self, args):
features = self.load_features()
trie = self.load_da()
labels = self.load_labels()
param = numpy.load(self.param)
param = numpy.load(self.paramFile)

for st in args:
label, text, org_text = normalize_text(st)
Expand All @@ -144,7 +175,7 @@ def debug(self, args):

def learn(self, options, args):
trie = self.load_da()
param = numpy.load(self.param)
param = numpy.load(self.paramFile)
labels = self.load_labels()

import time
Expand All @@ -153,14 +184,60 @@ def learn(self, options, args):
print "inference... " + time.strftime("%H:%M:%S", time.localtime())
inference(param, labels, corpus, idlist, trie, options)
print "finish... " + time.strftime("%H:%M:%S", time.localtime())
numpy.save(self.param, param)
numpy.save(self.paramFile, param)

def detect(self, options, args):
trie = self.load_da()
param = numpy.load(self.param)
labels = self.load_labels()
def detect(self, text, threshold=0.6):
    """Detect the language of ``text``.

    The text is wrapped in \\u0001 sentinels (the model's begin/end
    markers) before feature extraction.

    Args:
        text: unicode string to classify.
        threshold: minimum winning probability to accept a prediction;
            defaults to 0.6, the cutoff the project uses elsewhere.

    Returns:
        The predicted language label, or "" when the best probability is
        below ``threshold``.
    """
    events = self.trie.extract_features(u"\u0001" + text + u"\u0001")
    y = predict(self.param, events)
    predict_k = y.argmax()
    if y[predict_k] < threshold:
        return ""
    return self.labels[predict_k]

def treatFile(self, filelist, options):
K = len(self.labels)
corrects = numpy.zeros(K, dtype=int)
counts = numpy.zeros(K, dtype=int)

label_map = dict((x, i) for i, x in enumerate(self.labels))

n_available_data = 0
log_likely = 0.0
for filename in filelist:
f = codecs.open(filename, 'rb', 'utf-8')
for i, s in enumerate(f):
label, text, org_text = normalize_text(s.strip())

log_likely = likelihood(param, labels, trie, args, options)
predict_lang = self.detect(text)

if label not in label_map:
if label:
sys.stderr.write("WARNING : unknown label '%s' at %d in %s (ignore the later same labels)\n" % (label, i+1, filename))
label_map[label] = -1
label_k = label_map[label]

if label_k >= 0:
log_likely -= numpy.log(y[label_k])
n_available_data += 1
counts[label_k] += 1
if label_k == predict_k and y[predict_k] >= 0.6:
corrects[predict_k] += 1

print "%s\t%s\t%s" % (label, predict_lang, org_text)
f.close()

if n_available_data > 0:
log_likely /= n_available_data

for lbl, crct, cnt in zip(self.labels, corrects, counts):
if cnt > 0:
print "> %s = %d / %d = %.2f" % (lbl, crct, cnt, 100.0 * crct / cnt)
print "> total = %d / %d = %.2f" % (corrects.sum(), n_available_data, 100.0 * corrects.sum() / n_available_data)
print "> average negative log likelihood = %.3f" % log_likely

return log_likely



Expand Down Expand Up @@ -204,40 +281,40 @@ def normalize_twitter(text):
re_ignore_i = re.compile(r'[^I]')
re_turkish_alphabet = re.compile(u'[\u011e\u011f\u0130\u0131]')
vietnamese_norm = {
u'\u0041\u0300':u'\u00C0', u'\u0045\u0300':u'\u00C8', u'\u0049\u0300':u'\u00CC', u'\u004F\u0300':u'\u00D2',
u'\u0055\u0300':u'\u00D9', u'\u0059\u0300':u'\u1EF2', u'\u0061\u0300':u'\u00E0', u'\u0065\u0300':u'\u00E8',
u'\u0069\u0300':u'\u00EC', u'\u006F\u0300':u'\u00F2', u'\u0075\u0300':u'\u00F9', u'\u0079\u0300':u'\u1EF3',
u'\u00C2\u0300':u'\u1EA6', u'\u00CA\u0300':u'\u1EC0', u'\u00D4\u0300':u'\u1ED2', u'\u00E2\u0300':u'\u1EA7',
u'\u00EA\u0300':u'\u1EC1', u'\u00F4\u0300':u'\u1ED3', u'\u0102\u0300':u'\u1EB0', u'\u0103\u0300':u'\u1EB1',
u'\u01A0\u0300':u'\u1EDC', u'\u01A1\u0300':u'\u1EDD', u'\u01AF\u0300':u'\u1EEA', u'\u01B0\u0300':u'\u1EEB',

u'\u0041\u0301':u'\u00C1', u'\u0045\u0301':u'\u00C9', u'\u0049\u0301':u'\u00CD', u'\u004F\u0301':u'\u00D3',
u'\u0055\u0301':u'\u00DA', u'\u0059\u0301':u'\u00DD', u'\u0061\u0301':u'\u00E1', u'\u0065\u0301':u'\u00E9',
u'\u0069\u0301':u'\u00ED', u'\u006F\u0301':u'\u00F3', u'\u0075\u0301':u'\u00FA', u'\u0079\u0301':u'\u00FD',
u'\u00C2\u0301':u'\u1EA4', u'\u00CA\u0301':u'\u1EBE', u'\u00D4\u0301':u'\u1ED0', u'\u00E2\u0301':u'\u1EA5',
u'\u00EA\u0301':u'\u1EBF', u'\u00F4\u0301':u'\u1ED1', u'\u0102\u0301':u'\u1EAE', u'\u0103\u0301':u'\u1EAF',
u'\u01A0\u0301':u'\u1EDA', u'\u01A1\u0301':u'\u1EDB', u'\u01AF\u0301':u'\u1EE8', u'\u01B0\u0301':u'\u1EE9',

u'\u0041\u0303':u'\u00C3', u'\u0045\u0303':u'\u1EBC', u'\u0049\u0303':u'\u0128', u'\u004F\u0303':u'\u00D5',
u'\u0055\u0303':u'\u0168', u'\u0059\u0303':u'\u1EF8', u'\u0061\u0303':u'\u00E3', u'\u0065\u0303':u'\u1EBD',
u'\u0069\u0303':u'\u0129', u'\u006F\u0303':u'\u00F5', u'\u0075\u0303':u'\u0169', u'\u0079\u0303':u'\u1EF9',
u'\u00C2\u0303':u'\u1EAA', u'\u00CA\u0303':u'\u1EC4', u'\u00D4\u0303':u'\u1ED6', u'\u00E2\u0303':u'\u1EAB',
u'\u00EA\u0303':u'\u1EC5', u'\u00F4\u0303':u'\u1ED7', u'\u0102\u0303':u'\u1EB4', u'\u0103\u0303':u'\u1EB5',
u'\u01A0\u0303':u'\u1EE0', u'\u01A1\u0303':u'\u1EE1', u'\u01AF\u0303':u'\u1EEE', u'\u01B0\u0303':u'\u1EEF',

u'\u0041\u0309':u'\u1EA2', u'\u0045\u0309':u'\u1EBA', u'\u0049\u0309':u'\u1EC8', u'\u004F\u0309':u'\u1ECE',
u'\u0055\u0309':u'\u1EE6', u'\u0059\u0309':u'\u1EF6', u'\u0061\u0309':u'\u1EA3', u'\u0065\u0309':u'\u1EBB',
u'\u0069\u0309':u'\u1EC9', u'\u006F\u0309':u'\u1ECF', u'\u0075\u0309':u'\u1EE7', u'\u0079\u0309':u'\u1EF7',
u'\u00C2\u0309':u'\u1EA8', u'\u00CA\u0309':u'\u1EC2', u'\u00D4\u0309':u'\u1ED4', u'\u00E2\u0309':u'\u1EA9',
u'\u00EA\u0309':u'\u1EC3', u'\u00F4\u0309':u'\u1ED5', u'\u0102\u0309':u'\u1EB2', u'\u0103\u0309':u'\u1EB3',
u'\u01A0\u0309':u'\u1EDE', u'\u01A1\u0309':u'\u1EDF', u'\u01AF\u0309':u'\u1EEC', u'\u01B0\u0309':u'\u1EED',

u'\u0041\u0323':u'\u1EA0', u'\u0045\u0323':u'\u1EB8', u'\u0049\u0323':u'\u1ECA', u'\u004F\u0323':u'\u1ECC',
u'\u0055\u0323':u'\u1EE4', u'\u0059\u0323':u'\u1EF4', u'\u0061\u0323':u'\u1EA1', u'\u0065\u0323':u'\u1EB9',
u'\u0069\u0323':u'\u1ECB', u'\u006F\u0323':u'\u1ECD', u'\u0075\u0323':u'\u1EE5', u'\u0079\u0323':u'\u1EF5',
u'\u00C2\u0323':u'\u1EAC', u'\u00CA\u0323':u'\u1EC6', u'\u00D4\u0323':u'\u1ED8', u'\u00E2\u0323':u'\u1EAD',
u'\u00EA\u0323':u'\u1EC7', u'\u00F4\u0323':u'\u1ED9', u'\u0102\u0323':u'\u1EB6', u'\u0103\u0323':u'\u1EB7',
u'\u01A0\u0323':u'\u1EE2', u'\u01A1\u0323':u'\u1EE3', u'\u01AF\u0323':u'\u1EF0', u'\u01B0\u0323':u'\u1EF1',
u'\u0041\u0300':u'\u00C0', u'\u0045\u0300':u'\u00C8', u'\u0049\u0300':u'\u00CC', u'\u004F\u0300':u'\u00D2',
u'\u0055\u0300':u'\u00D9', u'\u0059\u0300':u'\u1EF2', u'\u0061\u0300':u'\u00E0', u'\u0065\u0300':u'\u00E8',
u'\u0069\u0300':u'\u00EC', u'\u006F\u0300':u'\u00F2', u'\u0075\u0300':u'\u00F9', u'\u0079\u0300':u'\u1EF3',
u'\u00C2\u0300':u'\u1EA6', u'\u00CA\u0300':u'\u1EC0', u'\u00D4\u0300':u'\u1ED2', u'\u00E2\u0300':u'\u1EA7',
u'\u00EA\u0300':u'\u1EC1', u'\u00F4\u0300':u'\u1ED3', u'\u0102\u0300':u'\u1EB0', u'\u0103\u0300':u'\u1EB1',
u'\u01A0\u0300':u'\u1EDC', u'\u01A1\u0300':u'\u1EDD', u'\u01AF\u0300':u'\u1EEA', u'\u01B0\u0300':u'\u1EEB',

u'\u0041\u0301':u'\u00C1', u'\u0045\u0301':u'\u00C9', u'\u0049\u0301':u'\u00CD', u'\u004F\u0301':u'\u00D3',
u'\u0055\u0301':u'\u00DA', u'\u0059\u0301':u'\u00DD', u'\u0061\u0301':u'\u00E1', u'\u0065\u0301':u'\u00E9',
u'\u0069\u0301':u'\u00ED', u'\u006F\u0301':u'\u00F3', u'\u0075\u0301':u'\u00FA', u'\u0079\u0301':u'\u00FD',
u'\u00C2\u0301':u'\u1EA4', u'\u00CA\u0301':u'\u1EBE', u'\u00D4\u0301':u'\u1ED0', u'\u00E2\u0301':u'\u1EA5',
u'\u00EA\u0301':u'\u1EBF', u'\u00F4\u0301':u'\u1ED1', u'\u0102\u0301':u'\u1EAE', u'\u0103\u0301':u'\u1EAF',
u'\u01A0\u0301':u'\u1EDA', u'\u01A1\u0301':u'\u1EDB', u'\u01AF\u0301':u'\u1EE8', u'\u01B0\u0301':u'\u1EE9',

u'\u0041\u0303':u'\u00C3', u'\u0045\u0303':u'\u1EBC', u'\u0049\u0303':u'\u0128', u'\u004F\u0303':u'\u00D5',
u'\u0055\u0303':u'\u0168', u'\u0059\u0303':u'\u1EF8', u'\u0061\u0303':u'\u00E3', u'\u0065\u0303':u'\u1EBD',
u'\u0069\u0303':u'\u0129', u'\u006F\u0303':u'\u00F5', u'\u0075\u0303':u'\u0169', u'\u0079\u0303':u'\u1EF9',
u'\u00C2\u0303':u'\u1EAA', u'\u00CA\u0303':u'\u1EC4', u'\u00D4\u0303':u'\u1ED6', u'\u00E2\u0303':u'\u1EAB',
u'\u00EA\u0303':u'\u1EC5', u'\u00F4\u0303':u'\u1ED7', u'\u0102\u0303':u'\u1EB4', u'\u0103\u0303':u'\u1EB5',
u'\u01A0\u0303':u'\u1EE0', u'\u01A1\u0303':u'\u1EE1', u'\u01AF\u0303':u'\u1EEE', u'\u01B0\u0303':u'\u1EEF',

u'\u0041\u0309':u'\u1EA2', u'\u0045\u0309':u'\u1EBA', u'\u0049\u0309':u'\u1EC8', u'\u004F\u0309':u'\u1ECE',
u'\u0055\u0309':u'\u1EE6', u'\u0059\u0309':u'\u1EF6', u'\u0061\u0309':u'\u1EA3', u'\u0065\u0309':u'\u1EBB',
u'\u0069\u0309':u'\u1EC9', u'\u006F\u0309':u'\u1ECF', u'\u0075\u0309':u'\u1EE7', u'\u0079\u0309':u'\u1EF7',
u'\u00C2\u0309':u'\u1EA8', u'\u00CA\u0309':u'\u1EC2', u'\u00D4\u0309':u'\u1ED4', u'\u00E2\u0309':u'\u1EA9',
u'\u00EA\u0309':u'\u1EC3', u'\u00F4\u0309':u'\u1ED5', u'\u0102\u0309':u'\u1EB2', u'\u0103\u0309':u'\u1EB3',
u'\u01A0\u0309':u'\u1EDE', u'\u01A1\u0309':u'\u1EDF', u'\u01AF\u0309':u'\u1EEC', u'\u01B0\u0309':u'\u1EED',

u'\u0041\u0323':u'\u1EA0', u'\u0045\u0323':u'\u1EB8', u'\u0049\u0323':u'\u1ECA', u'\u004F\u0323':u'\u1ECC',
u'\u0055\u0323':u'\u1EE4', u'\u0059\u0323':u'\u1EF4', u'\u0061\u0323':u'\u1EA1', u'\u0065\u0323':u'\u1EB9',
u'\u0069\u0323':u'\u1ECB', u'\u006F\u0323':u'\u1ECD', u'\u0075\u0323':u'\u1EE5', u'\u0079\u0323':u'\u1EF5',
u'\u00C2\u0323':u'\u1EAC', u'\u00CA\u0323':u'\u1EC6', u'\u00D4\u0323':u'\u1ED8', u'\u00E2\u0323':u'\u1EAD',
u'\u00EA\u0323':u'\u1EC7', u'\u00F4\u0323':u'\u1ED9', u'\u0102\u0323':u'\u1EB6', u'\u0103\u0323':u'\u1EB7',
u'\u01A0\u0323':u'\u1EE2', u'\u01A1\u0323':u'\u1EE3', u'\u01AF\u0323':u'\u1EF0', u'\u01B0\u0323':u'\u1EF1',
}
re_vietnamese = re.compile(u'[AEIOUYaeiouy\u00C2\u00CA\u00D4\u00E2\u00EA\u00F4\u0102\u0103\u01A0\u01A1\u01AF\u01B0][\u0300\u0301\u0303\u0309\u0323]')
re_latin_cont = re.compile(u'([a-z\u00e0-\u024f])\\1{2,}')
Expand Down Expand Up @@ -390,54 +467,6 @@ def inference(param, labels, corpus, idlist, trie, options):
list = (numpy.abs(param).sum(1) > 0.0000001)
print "> # of relevant features = %d / %d" % (list.sum(), M)


def likelihood(param, labels, trie, filelist, options):
K = len(labels)
corrects = numpy.zeros(K, dtype=int)
counts = numpy.zeros(K, dtype=int)

label_map = dict((x, i) for i, x in enumerate(labels))

n_available_data = 0
log_likely = 0.0
for filename in filelist:
f = codecs.open(filename, 'rb', 'utf-8')
for i, s in enumerate(f):
label, text, org_text = normalize_text(s)

if label not in label_map:
sys.stderr.write("WARNING : unknown label '%s' at %d in %s (ignore the later same labels)\n" % (label, i+1, filename))
label_map[label] = -1
label_k = label_map[label]

events = trie.extract_features(u"\u0001" + text + u"\u0001")
y = predict(param, events)
predict_k = y.argmax()

if label_k >= 0:
log_likely -= numpy.log(y[label_k])
n_available_data += 1
counts[label_k] += 1
if label_k == predict_k and y[predict_k] >= 0.6:
corrects[predict_k] += 1

predict_lang = labels[predict_k]
if y[predict_k] < 0.6: predict_lang = ""
print "%s\t%s\t%s" % (label, predict_lang, org_text)
f.close()

if n_available_data > 0:
log_likely /= n_available_data

for lbl, crct, cnt in zip(labels, corrects, counts):
if cnt > 0:
print "> %s = %d / %d = %.2f" % (lbl, crct, cnt, 100.0 * crct / cnt)
print "> total = %d / %d = %.2f" % (corrects.sum(), n_available_data, 100.0 * corrects.sum() / n_available_data)
print "> average negative log likelihood = %.3f" % log_likely

return log_likely


def generate_doublearray(file, features):
trie = da.DoubleArray()
trie.initialize(features)
Expand Down Expand Up @@ -476,13 +505,7 @@ def generate_doublearray(file, features):
os.mkdir(options.model)
if len(args) == 0:
parser.error("need corpus")
else:
if not os.path.exists(detector.features):
parser.error("features file doesn't exist")
if not os.path.exists(detector.labels):
parser.error("labels file doesn't exist")
if not os.path.exists(detector.param):
parser.error("parameters file doesn't exist")



if options.init:
Expand All @@ -499,8 +522,6 @@ def generate_doublearray(file, features):
detector.learn(options, args)

else:
detector.detect(options, args)
detector.treatFile(args,options)
#import cProfile
#cProfile.runctx('detector.detect(options, args)', globals(), locals(), 'ldig.profile')


11 changes: 9 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,16 @@ Usage
tar xf models/[select model archive]

2. Detect
1. As a script
```
ldig.py -m [model directory] [text data file]

```
2. As a library
```
import ldig
detector = ldig.ldig("path/to/model/directory")
detector.detect("This is a tweet")
```

Data format
------
Expand Down Expand Up @@ -82,4 +90,3 @@ Copyright & License
-----
- (c)2011-2012 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
- All codes and resources are available under the MIT License.