-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmulti_process_translate.py
124 lines (114 loc) · 5.1 KB
/
multi_process_translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from translate import Translate
import utils.file_util as futil
import time
import multiprocessing
import argparse
import os
parser = argparse.ArgumentParser(description="Multi-process translation using Google",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--srclg', type=str, default='en',
help='src language')
parser.add_argument('--trglg', type=str, default='zh-cn',
help='target language')
parser.add_argument('--input', type=str, required=True,
help='input file, each sentence a line')
parser.add_argument('--output', type=str, default='./output',
help='output file, each sentence a line')
parser.add_argument('--pbatch', type=int, default=100,
help='batch size for translation loop')
parser.add_argument('--process_num', type=int, default=5,
help='num of process')
parser.add_argument('--tbatch', type=int, default=5,
help='batch for one google translate request')
parser.add_argument('--max_try', type=int, default=3,
help='maximum try for google translate')
class MultiProcessTranslation:
def __init__(self, client_url, src='en', target='zh-cn'):
self.translator = Translate(client_url, src, target)
def _translate(self, sentences, index, tbatch, max_try):
output_list = []
all_len = len(sentences)
for i in range(0, all_len, tbatch):
output_list.extend(self.translator.translate_batch(sentences[i: i + tbatch], max_try))
return output_list, index
def trans_all_processes(self, sentences, process_num, tbatch, max_try) -> list:
"""
:param sentences:
:param process_num:
:param tbatch: google 翻译,一次提交的数量
:param max_try: 最大尝试次数
:return:
"""
processes = []
pool = multiprocessing.Pool(process_num)
step = int(len(sentences) / process_num)
if step == 0: # 总的翻译句子数小于进程数,则一个进程处理全部翻译
step = len(sentences)
for i in range(0, len(sentences), step):
processes.append(pool.apply_async(func=self._translate, args=(sentences[i:i + step], i, tbatch, max_try)))
pool.close()
pool.join()
result = []
for p in processes:
output_list, index = p.get()
result.append([output_list.copy(), index])
result.sort(key=lambda x: x[1])
output_list = []
for r in result:
output_list.extend(r[0])
return output_list
def trans_all(self, sentences,
output_file,
index_file,
begin=0,
pbatch=100,
process_num=5,
tbatch=5,
max_try=3):
"""
:param sentences:
:param output_file: 如果不为空,则将翻译的内容写入该文件,以append方式
:param index_file: 用来存储翻译的位置,只有output_file不为空才有意义
:param begin: 待翻译文件中需要翻译的起始序列
:param pbatch: 提交给所有进程总的数据量, 如果pbatch不为空,则每隔pbatch写入一次文件
:param process_num: 进程数
:param tbatch: google 翻译,一次提交的数量
:param max_try: 最大尝试次数
:return:
"""
translations = []
all_len = len(sentences)
count = begin
for i in range(0, all_len, pbatch):
translate_start = time.time()
batch_translations = self.trans_all_processes(sentences[i: i + pbatch], process_num=process_num,
tbatch=tbatch, max_try=max_try)
translate_end = time.time()
try:
print('翻译用时 %d, %d -> %s' % ((translate_end-translate_start), begin + i, batch_translations[:1]))
except Exception as e:
print(e)
if output_file:
futil.add_to_txt(output_file, batch_translations)
count += len(batch_translations)
futil.save_to_txt(index_file, [str(count)])
else:
translations.extend(batch_translations)
return translations
if __name__ == '__main__':
args = parser.parse_args()
url_client_file = 'client.config'
client_url = []
if os.path.exists(url_client_file):
client_url = futil.read_txt(url_client_file)
if len(client_url) == 0:
client_url = ['translate.google.cn']
mpt = MultiProcessTranslation(client_url, src=args.srclg, target=args.trglg)
index_file = args.input+'.index'
if os.path.exists(index_file):
index = int(futil.read_txt(index_file)[0])
else:
index = 0
sentences = futil.read_txt(args.input, begin=index)
mpt.trans_all(sentences, output_file=args.output, index_file=index_file, begin=index,
pbatch=args.pbatch, process_num=args.process_num, tbatch=args.tbatch, max_try=args.max_try)