Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.pyc
models/model.latin/
models/model.small/
test.data
1 change: 1 addition & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ldig import ldig
253 changes: 137 additions & 116 deletions ldig.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,50 @@

class ldig(object):
def __init__(self, model_dir):
self.features = os.path.join(model_dir, 'features')
self.labels = os.path.join(model_dir, 'labels.json')
self.param = os.path.join(model_dir, 'parameters.npy')
self.doublearray = os.path.join(model_dir, 'doublearray.npz')
self.featuresFile = os.path.join(model_dir, 'features')
self.labelsFile = os.path.join(model_dir, 'labels.json')
self.paramFile = os.path.join(model_dir, 'parameters.npy')
self.doublearrayFile = os.path.join(model_dir, 'doublearray.npz')

def __getattr__(self, name):
    """Lazily load and cache model resources on first access.

    Supports ``features``, ``labels``, ``param`` and ``trie``; the loaded
    value is cached in ``self.__dict__`` under a leading-underscore key
    so subsequent lookups hit the cache branch below.

    Raises:
        IOError: with the offending path, when a model file is missing.
        AttributeError: for any other attribute name.
    """
    cached = '_' + name
    if cached in self.__dict__:
        return self.__dict__[cached]
    # BUG FIX: the test was ``name == 'feature'`` while the cache key and
    # loader are for 'features', so ``self.features`` never matched.
    # Accept both spellings to stay backward compatible.
    if name in ('feature', 'features'):
        try:
            self.__dict__['_features'] = self.load_features()
            return self.__dict__['_features']
        except IOError:
            raise IOError("no feature file: %s" % self.featuresFile)
    elif name == 'labels':
        try:
            self.__dict__[cached] = self.load_labels()
            return self.__dict__[cached]
        except IOError:
            raise IOError("no label file: %s" % self.labelsFile)
    elif name == 'param':
        try:
            self.__dict__[cached] = numpy.load(self.paramFile)
            return self.__dict__[cached]
        except IOError:
            raise IOError("no params file: %s" % self.paramFile)
    elif name == 'trie':
        try:
            self.__dict__[cached] = self.load_da()
            return self.__dict__[cached]
        except IOError:
            raise IOError("no double array file: %s" % self.doublearrayFile)
    # BUG FIX: previously fell off the end and implicitly returned None,
    # which silently breaks hasattr() and hides attribute typos.
    raise AttributeError(name)




def load_da(self):
    """Build a DoubleArray trie and load it from ``self.doublearrayFile``.

    NOTE(review): the diff residue loaded the file twice (once through the
    removed pre-rename attribute ``self.doublearray``); only the renamed
    path is kept.
    """
    trie = da.DoubleArray()
    trie.load(self.doublearrayFile)
    return trie

def load_features(self):
features = []
with codecs.open(self.features, 'rb', 'utf-8') as f:
with codecs.open(self.featuresFile, 'rb', 'utf-8') as f:
pre_feature = ""
for n, s in enumerate(f):
m = re.match(r'(.+)\t([0-9]+)', s)
Expand All @@ -40,7 +71,7 @@ def load_features(self):
return features

def load_labels(self):
    """Read and return the label list from the JSON labels file.

    NOTE(review): the diff residue opened the file twice (old
    ``self.labels`` path and new ``self.labelsFile``); only the renamed
    path is kept.
    """
    with open(self.labelsFile, 'rb') as f:
        return json.load(f)


Expand Down Expand Up @@ -69,11 +100,11 @@ def init(self, temp_path, corpus_list, lbff, ngram_bound):

labels.sort()
print "labels: %d" % len(labels)
with open(self.labels, 'wb') as f:
with open(self.labelsFile, 'wb') as f:
f.write(json.dumps(labels))

print "generating max-substrings..."
temp_features = self.features + ".temp"
temp_features = self.featuresFile + ".temp"
maxsubst = options.maxsubst
if os.name == 'nt': maxsubst += ".exe"
subprocess.call([maxsubst, temp_path, temp_features])
Expand All @@ -94,37 +125,37 @@ def init(self, temp_path, corpus_list, lbff, ngram_bound):
print "# of features = %d" % M

features.sort()
with codecs.open(self.features, 'wb', 'utf-8') as f:
with codecs.open(self.featuresFile, 'wb', 'utf-8') as f:
for s in features:
f.write(s[1])

generate_doublearray(self.doublearray, [s[0] for s in features])
generate_doublearray(self.doublearrayFile, [s[0] for s in features])

numpy.save(self.param, numpy.zeros((M, len(labels))))
numpy.save(self.paramFile, numpy.zeros((M, len(labels))))

def shrink(self):
features = self.load_features()
param = numpy.load(self.param)
param = numpy.load(self.paramFile)

list = (numpy.abs(param).sum(1) > 0.0000001)
new_param = param[list]
print "# of features : %d => %d" % (param.shape[0], new_param.shape[0])

numpy.save(self.param, new_param)
numpy.save(self.paramFile, new_param)
new_features = []
with codecs.open(self.features, 'wb', 'utf-8') as f:
with codecs.open(self.featuresFile, 'wb', 'utf-8') as f:
for i, x in enumerate(list):
if x:
f.write("%s\t%s\n" % features[i])
new_features.append(features[i][0])

generate_doublearray(self.doublearray, new_features)
generate_doublearray(self.doublearrayFile, new_features)

def debug(self, args):
features = self.load_features()
trie = self.load_da()
labels = self.load_labels()
param = numpy.load(self.param)
param = numpy.load(self.paramFile)

for st in args:
label, text, org_text = normalize_text(st)
Expand All @@ -144,7 +175,7 @@ def debug(self, args):

def learn(self, options, args):
trie = self.load_da()
param = numpy.load(self.param)
param = numpy.load(self.paramFile)
labels = self.load_labels()

import time
Expand All @@ -153,14 +184,60 @@ def learn(self, options, args):
print "inference... " + time.strftime("%H:%M:%S", time.localtime())
inference(param, labels, corpus, idlist, trie, options)
print "finish... " + time.strftime("%H:%M:%S", time.localtime())
numpy.save(self.param, param)
numpy.save(self.paramFile, param)

def detect(self, options, args):
trie = self.load_da()
param = numpy.load(self.param)
labels = self.load_labels()
def detect(self, text, threshold=0.6):
    """Detect the language of ``text``.

    The text is wrapped in \\u0001 sentinels (the model's begin/end
    markers) before feature extraction.

    Args:
        text: unicode string to classify.
        threshold: minimum winning probability to accept a prediction;
            defaults to 0.6, the cutoff the project uses elsewhere.

    Returns:
        The predicted language label, or "" when the best probability is
        below ``threshold``.
    """
    events = self.trie.extract_features(u"\u0001" + text + u"\u0001")
    y = predict(self.param, events)
    predict_k = y.argmax()
    if y[predict_k] < threshold:
        return ""
    return self.labels[predict_k]

def treatFile(self, filelist, options):
K = len(self.labels)
corrects = numpy.zeros(K, dtype=int)
counts = numpy.zeros(K, dtype=int)

label_map = dict((x, i) for i, x in enumerate(self.labels))

n_available_data = 0
log_likely = 0.0
for filename in filelist:
f = codecs.open(filename, 'rb', 'utf-8')
for i, s in enumerate(f):
label, text, org_text = normalize_text(s.strip())

log_likely = likelihood(param, labels, trie, args, options)
predict_lang = self.detect(text)

if label not in label_map:
if label:
sys.stderr.write("WARNING : unknown label '%s' at %d in %s (ignore the later same labels)\n" % (label, i+1, filename))
label_map[label] = -1
label_k = label_map[label]

if label_k >= 0:
log_likely -= numpy.log(y[label_k])
n_available_data += 1
counts[label_k] += 1
if label_k == predict_k and y[predict_k] >= 0.6:
corrects[predict_k] += 1

print "%s\t%s\t%s" % (label, predict_lang, org_text)
f.close()

if n_available_data > 0:
log_likely /= n_available_data

for lbl, crct, cnt in zip(self.labels, corrects, counts):
if cnt > 0:
print "> %s = %d / %d = %.2f" % (lbl, crct, cnt, 100.0 * crct / cnt)
print "> total = %d / %d = %.2f" % (corrects.sum(), n_available_data, 100.0 * corrects.sum() / n_available_data)
print "> average negative log likelihood = %.3f" % log_likely

return log_likely



Expand Down Expand Up @@ -204,40 +281,40 @@ def normalize_twitter(text):
re_ignore_i = re.compile(r'[^I]')
re_turkish_alphabet = re.compile(u'[\u011e\u011f\u0130\u0131]')
vietnamese_norm = {
u'\u0041\u0300':u'\u00C0', u'\u0045\u0300':u'\u00C8', u'\u0049\u0300':u'\u00CC', u'\u004F\u0300':u'\u00D2',
u'\u0055\u0300':u'\u00D9', u'\u0059\u0300':u'\u1EF2', u'\u0061\u0300':u'\u00E0', u'\u0065\u0300':u'\u00E8',
u'\u0069\u0300':u'\u00EC', u'\u006F\u0300':u'\u00F2', u'\u0075\u0300':u'\u00F9', u'\u0079\u0300':u'\u1EF3',
u'\u00C2\u0300':u'\u1EA6', u'\u00CA\u0300':u'\u1EC0', u'\u00D4\u0300':u'\u1ED2', u'\u00E2\u0300':u'\u1EA7',
u'\u00EA\u0300':u'\u1EC1', u'\u00F4\u0300':u'\u1ED3', u'\u0102\u0300':u'\u1EB0', u'\u0103\u0300':u'\u1EB1',
u'\u01A0\u0300':u'\u1EDC', u'\u01A1\u0300':u'\u1EDD', u'\u01AF\u0300':u'\u1EEA', u'\u01B0\u0300':u'\u1EEB',

u'\u0041\u0301':u'\u00C1', u'\u0045\u0301':u'\u00C9', u'\u0049\u0301':u'\u00CD', u'\u004F\u0301':u'\u00D3',
u'\u0055\u0301':u'\u00DA', u'\u0059\u0301':u'\u00DD', u'\u0061\u0301':u'\u00E1', u'\u0065\u0301':u'\u00E9',
u'\u0069\u0301':u'\u00ED', u'\u006F\u0301':u'\u00F3', u'\u0075\u0301':u'\u00FA', u'\u0079\u0301':u'\u00FD',
u'\u00C2\u0301':u'\u1EA4', u'\u00CA\u0301':u'\u1EBE', u'\u00D4\u0301':u'\u1ED0', u'\u00E2\u0301':u'\u1EA5',
u'\u00EA\u0301':u'\u1EBF', u'\u00F4\u0301':u'\u1ED1', u'\u0102\u0301':u'\u1EAE', u'\u0103\u0301':u'\u1EAF',
u'\u01A0\u0301':u'\u1EDA', u'\u01A1\u0301':u'\u1EDB', u'\u01AF\u0301':u'\u1EE8', u'\u01B0\u0301':u'\u1EE9',

u'\u0041\u0303':u'\u00C3', u'\u0045\u0303':u'\u1EBC', u'\u0049\u0303':u'\u0128', u'\u004F\u0303':u'\u00D5',
u'\u0055\u0303':u'\u0168', u'\u0059\u0303':u'\u1EF8', u'\u0061\u0303':u'\u00E3', u'\u0065\u0303':u'\u1EBD',
u'\u0069\u0303':u'\u0129', u'\u006F\u0303':u'\u00F5', u'\u0075\u0303':u'\u0169', u'\u0079\u0303':u'\u1EF9',
u'\u00C2\u0303':u'\u1EAA', u'\u00CA\u0303':u'\u1EC4', u'\u00D4\u0303':u'\u1ED6', u'\u00E2\u0303':u'\u1EAB',
u'\u00EA\u0303':u'\u1EC5', u'\u00F4\u0303':u'\u1ED7', u'\u0102\u0303':u'\u1EB4', u'\u0103\u0303':u'\u1EB5',
u'\u01A0\u0303':u'\u1EE0', u'\u01A1\u0303':u'\u1EE1', u'\u01AF\u0303':u'\u1EEE', u'\u01B0\u0303':u'\u1EEF',

u'\u0041\u0309':u'\u1EA2', u'\u0045\u0309':u'\u1EBA', u'\u0049\u0309':u'\u1EC8', u'\u004F\u0309':u'\u1ECE',
u'\u0055\u0309':u'\u1EE6', u'\u0059\u0309':u'\u1EF6', u'\u0061\u0309':u'\u1EA3', u'\u0065\u0309':u'\u1EBB',
u'\u0069\u0309':u'\u1EC9', u'\u006F\u0309':u'\u1ECF', u'\u0075\u0309':u'\u1EE7', u'\u0079\u0309':u'\u1EF7',
u'\u00C2\u0309':u'\u1EA8', u'\u00CA\u0309':u'\u1EC2', u'\u00D4\u0309':u'\u1ED4', u'\u00E2\u0309':u'\u1EA9',
u'\u00EA\u0309':u'\u1EC3', u'\u00F4\u0309':u'\u1ED5', u'\u0102\u0309':u'\u1EB2', u'\u0103\u0309':u'\u1EB3',
u'\u01A0\u0309':u'\u1EDE', u'\u01A1\u0309':u'\u1EDF', u'\u01AF\u0309':u'\u1EEC', u'\u01B0\u0309':u'\u1EED',

u'\u0041\u0323':u'\u1EA0', u'\u0045\u0323':u'\u1EB8', u'\u0049\u0323':u'\u1ECA', u'\u004F\u0323':u'\u1ECC',
u'\u0055\u0323':u'\u1EE4', u'\u0059\u0323':u'\u1EF4', u'\u0061\u0323':u'\u1EA1', u'\u0065\u0323':u'\u1EB9',
u'\u0069\u0323':u'\u1ECB', u'\u006F\u0323':u'\u1ECD', u'\u0075\u0323':u'\u1EE5', u'\u0079\u0323':u'\u1EF5',
u'\u00C2\u0323':u'\u1EAC', u'\u00CA\u0323':u'\u1EC6', u'\u00D4\u0323':u'\u1ED8', u'\u00E2\u0323':u'\u1EAD',
u'\u00EA\u0323':u'\u1EC7', u'\u00F4\u0323':u'\u1ED9', u'\u0102\u0323':u'\u1EB6', u'\u0103\u0323':u'\u1EB7',
u'\u01A0\u0323':u'\u1EE2', u'\u01A1\u0323':u'\u1EE3', u'\u01AF\u0323':u'\u1EF0', u'\u01B0\u0323':u'\u1EF1',
u'\u0041\u0300':u'\u00C0', u'\u0045\u0300':u'\u00C8', u'\u0049\u0300':u'\u00CC', u'\u004F\u0300':u'\u00D2',
u'\u0055\u0300':u'\u00D9', u'\u0059\u0300':u'\u1EF2', u'\u0061\u0300':u'\u00E0', u'\u0065\u0300':u'\u00E8',
u'\u0069\u0300':u'\u00EC', u'\u006F\u0300':u'\u00F2', u'\u0075\u0300':u'\u00F9', u'\u0079\u0300':u'\u1EF3',
u'\u00C2\u0300':u'\u1EA6', u'\u00CA\u0300':u'\u1EC0', u'\u00D4\u0300':u'\u1ED2', u'\u00E2\u0300':u'\u1EA7',
u'\u00EA\u0300':u'\u1EC1', u'\u00F4\u0300':u'\u1ED3', u'\u0102\u0300':u'\u1EB0', u'\u0103\u0300':u'\u1EB1',
u'\u01A0\u0300':u'\u1EDC', u'\u01A1\u0300':u'\u1EDD', u'\u01AF\u0300':u'\u1EEA', u'\u01B0\u0300':u'\u1EEB',

u'\u0041\u0301':u'\u00C1', u'\u0045\u0301':u'\u00C9', u'\u0049\u0301':u'\u00CD', u'\u004F\u0301':u'\u00D3',
u'\u0055\u0301':u'\u00DA', u'\u0059\u0301':u'\u00DD', u'\u0061\u0301':u'\u00E1', u'\u0065\u0301':u'\u00E9',
u'\u0069\u0301':u'\u00ED', u'\u006F\u0301':u'\u00F3', u'\u0075\u0301':u'\u00FA', u'\u0079\u0301':u'\u00FD',
u'\u00C2\u0301':u'\u1EA4', u'\u00CA\u0301':u'\u1EBE', u'\u00D4\u0301':u'\u1ED0', u'\u00E2\u0301':u'\u1EA5',
u'\u00EA\u0301':u'\u1EBF', u'\u00F4\u0301':u'\u1ED1', u'\u0102\u0301':u'\u1EAE', u'\u0103\u0301':u'\u1EAF',
u'\u01A0\u0301':u'\u1EDA', u'\u01A1\u0301':u'\u1EDB', u'\u01AF\u0301':u'\u1EE8', u'\u01B0\u0301':u'\u1EE9',

u'\u0041\u0303':u'\u00C3', u'\u0045\u0303':u'\u1EBC', u'\u0049\u0303':u'\u0128', u'\u004F\u0303':u'\u00D5',
u'\u0055\u0303':u'\u0168', u'\u0059\u0303':u'\u1EF8', u'\u0061\u0303':u'\u00E3', u'\u0065\u0303':u'\u1EBD',
u'\u0069\u0303':u'\u0129', u'\u006F\u0303':u'\u00F5', u'\u0075\u0303':u'\u0169', u'\u0079\u0303':u'\u1EF9',
u'\u00C2\u0303':u'\u1EAA', u'\u00CA\u0303':u'\u1EC4', u'\u00D4\u0303':u'\u1ED6', u'\u00E2\u0303':u'\u1EAB',
u'\u00EA\u0303':u'\u1EC5', u'\u00F4\u0303':u'\u1ED7', u'\u0102\u0303':u'\u1EB4', u'\u0103\u0303':u'\u1EB5',
u'\u01A0\u0303':u'\u1EE0', u'\u01A1\u0303':u'\u1EE1', u'\u01AF\u0303':u'\u1EEE', u'\u01B0\u0303':u'\u1EEF',

u'\u0041\u0309':u'\u1EA2', u'\u0045\u0309':u'\u1EBA', u'\u0049\u0309':u'\u1EC8', u'\u004F\u0309':u'\u1ECE',
u'\u0055\u0309':u'\u1EE6', u'\u0059\u0309':u'\u1EF6', u'\u0061\u0309':u'\u1EA3', u'\u0065\u0309':u'\u1EBB',
u'\u0069\u0309':u'\u1EC9', u'\u006F\u0309':u'\u1ECF', u'\u0075\u0309':u'\u1EE7', u'\u0079\u0309':u'\u1EF7',
u'\u00C2\u0309':u'\u1EA8', u'\u00CA\u0309':u'\u1EC2', u'\u00D4\u0309':u'\u1ED4', u'\u00E2\u0309':u'\u1EA9',
u'\u00EA\u0309':u'\u1EC3', u'\u00F4\u0309':u'\u1ED5', u'\u0102\u0309':u'\u1EB2', u'\u0103\u0309':u'\u1EB3',
u'\u01A0\u0309':u'\u1EDE', u'\u01A1\u0309':u'\u1EDF', u'\u01AF\u0309':u'\u1EEC', u'\u01B0\u0309':u'\u1EED',

u'\u0041\u0323':u'\u1EA0', u'\u0045\u0323':u'\u1EB8', u'\u0049\u0323':u'\u1ECA', u'\u004F\u0323':u'\u1ECC',
u'\u0055\u0323':u'\u1EE4', u'\u0059\u0323':u'\u1EF4', u'\u0061\u0323':u'\u1EA1', u'\u0065\u0323':u'\u1EB9',
u'\u0069\u0323':u'\u1ECB', u'\u006F\u0323':u'\u1ECD', u'\u0075\u0323':u'\u1EE5', u'\u0079\u0323':u'\u1EF5',
u'\u00C2\u0323':u'\u1EAC', u'\u00CA\u0323':u'\u1EC6', u'\u00D4\u0323':u'\u1ED8', u'\u00E2\u0323':u'\u1EAD',
u'\u00EA\u0323':u'\u1EC7', u'\u00F4\u0323':u'\u1ED9', u'\u0102\u0323':u'\u1EB6', u'\u0103\u0323':u'\u1EB7',
u'\u01A0\u0323':u'\u1EE2', u'\u01A1\u0323':u'\u1EE3', u'\u01AF\u0323':u'\u1EF0', u'\u01B0\u0323':u'\u1EF1',
}
re_vietnamese = re.compile(u'[AEIOUYaeiouy\u00C2\u00CA\u00D4\u00E2\u00EA\u00F4\u0102\u0103\u01A0\u01A1\u01AF\u01B0][\u0300\u0301\u0303\u0309\u0323]')
re_latin_cont = re.compile(u'([a-z\u00e0-\u024f])\\1{2,}')
Expand Down Expand Up @@ -390,54 +467,6 @@ def inference(param, labels, corpus, idlist, trie, options):
list = (numpy.abs(param).sum(1) > 0.0000001)
print "> # of relevant features = %d / %d" % (list.sum(), M)


def likelihood(param, labels, trie, filelist, options):
K = len(labels)
corrects = numpy.zeros(K, dtype=int)
counts = numpy.zeros(K, dtype=int)

label_map = dict((x, i) for i, x in enumerate(labels))

n_available_data = 0
log_likely = 0.0
for filename in filelist:
f = codecs.open(filename, 'rb', 'utf-8')
for i, s in enumerate(f):
label, text, org_text = normalize_text(s)

if label not in label_map:
sys.stderr.write("WARNING : unknown label '%s' at %d in %s (ignore the later same labels)\n" % (label, i+1, filename))
label_map[label] = -1
label_k = label_map[label]

events = trie.extract_features(u"\u0001" + text + u"\u0001")
y = predict(param, events)
predict_k = y.argmax()

if label_k >= 0:
log_likely -= numpy.log(y[label_k])
n_available_data += 1
counts[label_k] += 1
if label_k == predict_k and y[predict_k] >= 0.6:
corrects[predict_k] += 1

predict_lang = labels[predict_k]
if y[predict_k] < 0.6: predict_lang = ""
print "%s\t%s\t%s" % (label, predict_lang, org_text)
f.close()

if n_available_data > 0:
log_likely /= n_available_data

for lbl, crct, cnt in zip(labels, corrects, counts):
if cnt > 0:
print "> %s = %d / %d = %.2f" % (lbl, crct, cnt, 100.0 * crct / cnt)
print "> total = %d / %d = %.2f" % (corrects.sum(), n_available_data, 100.0 * corrects.sum() / n_available_data)
print "> average negative log likelihood = %.3f" % log_likely

return log_likely


def generate_doublearray(file, features):
trie = da.DoubleArray()
trie.initialize(features)
Expand Down Expand Up @@ -476,13 +505,7 @@ def generate_doublearray(file, features):
os.mkdir(options.model)
if len(args) == 0:
parser.error("need corpus")
else:
if not os.path.exists(detector.features):
parser.error("features file doesn't exist")
if not os.path.exists(detector.labels):
parser.error("labels file doesn't exist")
if not os.path.exists(detector.param):
parser.error("parameters file doesn't exist")



if options.init:
Expand All @@ -499,8 +522,6 @@ def generate_doublearray(file, features):
detector.learn(options, args)

else:
detector.detect(options, args)
detector.treatFile(args,options)
#import cProfile
#cProfile.runctx('detector.detect(options, args)', globals(), locals(), 'ldig.profile')


11 changes: 9 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,16 @@ Usage
tar xf models/[select model archive]

2. Detect
1. As a script
```
ldig.py -m [model directory] [text data file]

```
2. As a library
```
import ldig
detector = ldig.ldig("path/to/model/directory")
detector.detect("This is a tweet")
```

Data format
------
Expand Down Expand Up @@ -82,4 +90,3 @@ Copyright & License
-----
- (c)2011-2012 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
- All codes and resources are available under the MIT License.