-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathproject-ace.sh
85 lines (75 loc) · 2.42 KB
/
project-ace.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
# Label-projection pipeline for ACE. Runs, in order: text extraction,
# machine translation, bitext concatenation, word alignment and label
# projection. Intermediate files are cached in $DIR and reused on re-runs.
#
# Usage: project-ace.sh [split] [tgt] [encoder] [align_layer] [align_system]
#   split         data split to process                    (default: devtest)
#   tgt           target language code                     (default: ar)
#   encoder       value passed as --model_name_or_path to the aligner
#                                                          (default: bert-base-multilingual-cased)
#   align_layer   value passed as --align_layer to the aligner (default: 8)
#   align_system  tag embedded in alignment/projection file names
#                                                          (default: mbert_l8)

# Stop at the first failing command, unset variable or failed pipeline stage,
# so that later steps never run on missing or half-written inputs.
set -euo pipefail

# Source language code; also used to build the translation model name.
src="en"
split=${1:-"devtest"}
tgt=${2:-"ar"}
encoder=${3:-"bert-base-multilingual-cased"}
align_layer=${4:-8}
align_system=${5:-"mbert_l8"}
# Tag embedded in the names of files produced by the translation step.
mt_system="helsinki_opus"
# Passed to the aligner as --max_len.
max_len=500

# ACE data directory (passed to scripts/process_ace.py as --input).
# Empty placeholder: fill in before running.
DATA_DIR=""
# Temporary directory where the output of each step is saved.
DIR="intermediary/ace"
# Directory where the final projection file is saved.
FINAL_DIR="projection/ace"
mkdir -p -- "$DIR" "$FINAL_DIR"
# Directory containing the splits info (passed to scripts/process_ace.py as
# --split). Empty placeholder: fill in before running.
SPLITS_DIR=""
# Step 1: run scripts/process_ace.py on $DATA_DIR (output to $DIR), then
# write the output of scripts/extract-text.py for the requested split to
# $DIR/$src.$split.text. Skipped when that file already exists.
if [ -f "$DIR/$src.$split.text" ]; then
  echo "$DIR/$src.$split.text exists."
else
  # The redirect creates its target before python even starts, so writing in
  # place would leave an empty/partial file behind on failure and the check
  # above would accept it on the next run. Write to a scratch file and rename
  # it only once both commands have succeeded.
  if python scripts/process_ace.py \
    --input "$DATA_DIR" \
    --output "$DIR" \
    --lang "english" &&
    python scripts/extract-text.py \
      --task ace \
      --path "$DIR" \
      --lang "$src" \
      --split "$split" \
      >"$DIR/$src.$split.text.tmp"; then
    mv -- "$DIR/$src.$split.text.tmp" "$DIR/$src.$split.text" || exit 1
  else
    rm -f -- "$DIR/$src.$split.text.tmp"
    echo "error: failed to create $DIR/$src.$split.text" >&2
    exit 1
  fi
fi
# Step 2: translate the extracted source text with scripts/translate.py using
# the Helsinki-NLP/opus-mt-$src-$tgt model, writing the result to
# $DIR/$src.to_$tgt.$mt_system.$split.text. Skipped when that file exists.
if [ -f "$DIR/$src.to_$tgt.$mt_system.$split.text" ]; then
  echo "$DIR/$src.to_$tgt.$mt_system.$split.text exists."
else
  # Redirect into a scratch file and rename on success: writing in place
  # would leave an empty/partial translation behind when translate.py fails,
  # and the existence check above would then skip this step forever.
  if python scripts/translate.py \
    --infile "$DIR/$src.$split.text" \
    --model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
    --src "$src" \
    --tgt "$tgt" \
    >"$DIR/$src.to_$tgt.$mt_system.$split.text.tmp"; then
    mv -- "$DIR/$src.to_$tgt.$mt_system.$split.text.tmp" \
      "$DIR/$src.to_$tgt.$mt_system.$split.text" || exit 1
  else
    rm -f -- "$DIR/$src.to_$tgt.$mt_system.$split.text.tmp"
    echo "error: failed to create $DIR/$src.to_$tgt.$mt_system.$split.text" >&2
    exit 1
  fi
fi
# Step 3: combine the source text and its translation into one bitext file
# with scripts/bitext-concat.py, written to
# $DIR/$src.and_$tgt.$mt_system.$split.text. Skipped when that file exists.
if [ -f "$DIR/$src.and_$tgt.$mt_system.$split.text" ]; then
  echo "$DIR/$src.and_$tgt.$mt_system.$split.text exists."
else
  # Redirect into a scratch file and rename on success so that a failed run
  # cannot leave an empty/partial bitext that later runs treat as complete.
  if python scripts/bitext-concat.py \
    --src_fp "$DIR/$src.$split.text" \
    --tgt_fp "$DIR/$src.to_$tgt.$mt_system.$split.text" \
    >"$DIR/$src.and_$tgt.$mt_system.$split.text.tmp"; then
    mv -- "$DIR/$src.and_$tgt.$mt_system.$split.text.tmp" \
      "$DIR/$src.and_$tgt.$mt_system.$split.text" || exit 1
  else
    rm -f -- "$DIR/$src.and_$tgt.$mt_system.$split.text.tmp"
    echo "error: failed to create $DIR/$src.and_$tgt.$mt_system.$split.text" >&2
    exit 1
  fi
fi
# Step 4: word-align the bitext with scripts/awesome-align.py, which is told
# to write $DIR/$src.and_$tgt.$mt_system.$align_system.$split.align.
# Skipped when that file already exists.
if [ -f "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" ]; then
  echo "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align exists."
else
  if ! python scripts/awesome-align.py \
    --data_file "$DIR/$src.and_$tgt.$mt_system.$split.text" \
    --align_layer "$align_layer" \
    --model_name_or_path "$encoder" \
    --max_len "$max_len" \
    --output_file "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"; then
    # The file did not exist before this run, so anything present now is a
    # partial result of the failed aligner; remove it so the existence check
    # above does not accept it on the next run, and stop before projection.
    rm -f -- "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"
    echo "error: alignment failed for $DIR/$src.and_$tgt.$mt_system.$split.text" >&2
    exit 1
  fi
fi
# Step 5: project the source labels onto the translation using the bitext and
# its alignment; results go to $FINAL_DIR. Abort on failure so the steps
# below never operate on a missing projection.
python scripts/project-label.py \
  --task ace \
  --path "$DIR" \
  --lang "$src" \
  --split "$split" \
  --bitext "$DIR/$src.and_$tgt.$mt_system.$split.text" \
  --alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align" \
  --output_path "$FINAL_DIR" \
  --name "$tgt.from_$src.$mt_system.$align_system" || exit 1

# Presumably project-label.py writes $FINAL_DIR/out.json -- confirm.
# NOTE(review): the file name and --lang below are hard-coded to "arabic"
# even though the target language ($tgt) is configurable; verify this is
# intended when running with a target other than "ar".
mv -- "$FINAL_DIR/out.json" "$FINAL_DIR/arabic.json" || exit 1

# Step 6: run scripts/process_ace.py over the projected data in $FINAL_DIR,
# passing the splits directory.
python scripts/process_ace.py \
  --input "$DATA_DIR" \
  --output "$FINAL_DIR" \
  --lang "arabic" \
  --split "$SPLITS_DIR"