-
-
Notifications
You must be signed in to change notification settings - Fork 351
重现日本人提到的模型训练(迭代 15 万次收敛)
使用的库:clstm。标记数据:日本人所使用的 2700 个字符,存储路径为 /home/wanghs/ocr/ocr-training-material/trainning_data/clstm-Japanese-model/testdata3877/mincho/
每次迭代约 3 秒,15 万次迭代约需 125 小时(150000 × 3 秒 = 450000 秒 = 125 小时)。
docker run -it -v /home/wanghs/ocr/ocr-training-material/trainning_data/:/work datacenter/clstm-ocr /bin/bash
root@7fa3ae0f99e3:/work/clstm# cat ./run-xps-tests-third-japanese-input #!/bin/bash ##2016-06-03 chongxian riben ren tidao de diedai 15 wan ci shoulian set -e debug=${debug:-0} options=${options:-} export PS4='
' trap "echo TEST FAILED" EXIT set -x export seed=0.222 scons -s -c; rm -f *.o *.a scons -j 4 gpu=0 debug=$debug options="$options" clstmocrtrain clstmfiltertrain clstmfilter clstmocr test-lstm time ./test-lstm time ./test-japanese.sh #time ./test-filter.sh #time ./test-ocr.sh scons -s -c; rm -f *.o *.a scons -j 4 gpu=0 double=1 debug=$debug options="$options" test-cderiv test-deriv test-ctc #./test-cderiv #./test-deriv ./test-japanese.sh rm -f *.pb.h *.pb.cc scons -c all #scons -s -c pyswig #scons pyswig #python test-lstm.py #set +x #scons -s -c all pyswig trap "echo ALL TESTS PASS" EXIT
root@7fa3ae0f99e3:/work/clstm# cat ./test-japanese.sh #!/bin/bash set -ea find ../clstm-Japanese-model/testdata3877/mincho/ -name '*.bin.png' | sort -r > japanese-3877-char sed 1,2000d japanese-3877-char > japanese-train-3877 sed 1,1950d japanese-3877-char > japanese-test-3877 report_every=1 save_every=1000 ntrain=400000 dewarp=center display_every=100 test_every=1000 display_every=1000 testset=japanese-test.h5 hidden=800 lrate=1e-4 save_name=japanese-3877 report_time=1 : #load=xps-352000.clstm
./clstmocrtrain japanese-train-3877 japanese-test-3877
在原来单字单行训练所得模型的基础上,使用每行 20 字的单行标记数据继续训练。已训练模型:xps-391100.clstm;标记数据路径:/ocr-training-material/
root@7fa3ae0f99e3:/work/clstm# cp run-xps-tests run-xps-tests-third root@7fa3ae0f99e3:/work/clstm# vim run-xps-tests-third root@7fa3ae0f99e3:/work/clstm# cat run-xps-tests-third #!/bin/bash set -e debug=${debug:-0} options=${options:-} export PS4='
' trap "echo TEST FAILED" EXIT set -x export seed=0.222 scons -s -c; rm -f *.o *.a scons -j 4 gpu=0 debug=$debug options="$options" clstmocrtrain clstmfiltertrain clstmfilter clstmocr test-lstm time ./test-lstm time ./test-xps-third.sh #time ./test-filter.sh #time ./test-ocr.sh scons -s -c; rm -f *.o *.a scons -j 4 gpu=0 double=1 debug=$debug options="$options" test-cderiv test-deriv test-ctc #./test-cderiv #./test-deriv ./test-xps-third.sh rm -f *.pb.h *.pb.cc scons -c all #scons -s -c pyswig #scons pyswig #python test-lstm.py #set +x #scons -s -c all pyswig trap "echo ALL TESTS PASS" EXIT
root@7fa3ae0f99e3:/work/clstm# cp ./test-xps.sh ./test-xps-third.sh
root@7fa3ae0f99e3:/work/clstm# vim ./test-xps-third.sh root@7fa3ae0f99e3:/work/clstm# cat ./test-xps-third.sh #!/bin/bash set -ea find ../pic_simsun_eachline_20_with_special_char -name '*.bin.png' | sort -r > xps-total-zh-char sed 1,900d xps-total-zh-char > xps-train-total sed 1,850d xps-total-zh-char > xps-test-total report_every=1 save_every=1000 ntrain=400000 dewarp=center display_every=1000 test_every=1000 display_every=1000 testset=xps-total-test.h5 hidden=800 lrate=1e-4 save_name=xps-total report_time=1 : load=xps-391100.clstm
./clstmocrtrain xps-train-total xps-test-total
./run-xps-tests-third
在 umaru 库上使用日本人的标记数据进行训练
[wanghs@db2 umaru]$ cat 3877_japan_setting.json { "project_name": "japan_3877_words", "raw_input": false, "hidden_size": 800, "nthread": 280, "ctc_lua": false, "test_every": 1000, "stride": 5, "save_every": 10000, "gpu": false, "show_every": 1, "input_size": 48, "clamp_size": 1, "max_param_norm": false, "testing_ratio": 1, "momentum": 0.9, "dropout_rate": 0, "max_iter": 10000000000, "feature_size": 240, "learning_rate": 0.0001, "training_list_file": "japan-trainning.txt", "testing_list_file":"japan-test.txt", "omp_threads": 28, "recurrent_unit": "gru", "windows_size": 10 }
th main.lua -setting 3877_japan_setting.json
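从训练中断前保存的 japanese-3877-144000.clstm 继续训练(注意:下面 test-japanese-from-144000.sh 中的 load= 仍为 xps-352000.clstm,需确认是否应改为 japanese-3877-144000.clstm)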
docker run -it --name japan-from-3877-144000 -v /home/wanghs/ocr/ocr-training-material/trainning_data/:/work datacenter/clstm-ocr /bin/bash
root@7fa3ae0f99e3:/work/clstm# cat ./run-xps-tests-third-japanese-input-from-144000 #!/bin/bash ##2016-06-12 chongxian riben ren tidao de diedai 15 wan ci shoulian set -e debug=${debug:-0} options=${options:-} export PS4='
' trap "echo TEST FAILED" EXIT set -x export seed=0.222 scons -s -c; rm -f *.o *.a scons -j 4 gpu=0 debug=$debug options="$options" clstmocrtrain clstmfiltertrain clstmfilter clstmocr test-lstm time ./test-lstm time ./test-japanese.sh #time ./test-filter.sh #time ./test-ocr.sh scons -s -c; rm -f *.o *.a scons -j 4 gpu=0 double=1 debug=$debug options="$options" test-cderiv test-deriv test-ctc #./test-cderiv #./test-deriv ./test-japanese-from-144000.sh rm -f *.pb.h *.pb.cc scons -c all #scons -s -c pyswig #scons pyswig #python test-lstm.py #set +x #scons -s -c all pyswig trap "echo ALL TESTS PASS" EXIT
root@7fa3ae0f99e3:/work/clstm# cat ./test-japanese-from-144000.sh #!/bin/bash set -ea find ../clstm-Japanese-model/testdata3877/mincho/ -name '*.bin.png' | sort -r > japanese-3877-char sed 1,2000d japanese-3877-char > japanese-train-3877 sed 1,1950d japanese-3877-char > japanese-test-3877 report_every=1 save_every=1000 ntrain=400000 dewarp=center display_every=100 test_every=1000 display_every=1000 testset=japanese-test.h5 hidden=800 lrate=1e-4 save_name=japanese-3877 report_time=1 load=xps-352000.clstm
./clstmocrtrain japanese-train-3877 japanese-test-3877