forked from argonne-lcf/Megatron-DeepSpeed
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_data_batch.py
More file actions
26 lines (17 loc) · 1.25 KB
/
preprocess_data_batch.py
File metadata and controls
26 lines (17 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Preprocess a batch of JSON files into bin and idx format.
def main(
    input_dir='./protein_gym/indels/DMS_ProteinGym_indels_multi_prop_fit_meg-ds',
    output_dir='./protein_gym/indels/DMS_ProteinGym_indels_multi_prop_fit_meg-ds_bin-idx',
    script='preprocess_data.py',
    tokenizer_type='Llama2Tokenizer',
    tokenizer_model='/lus/eagle/projects/datasets/dolma/utils/tokenizer.model',
    workers=16,
):
    """Run the preprocessing script once for every ``*.json`` file in a directory.

    For each JSON file found directly inside ``input_dir`` the script is
    invoked as::

        <python> <script> --input <file> --output-prefix <output_dir>/<stem>
            --tokenizer-type <tokenizer_type> --tokenizer-model <tokenizer_model>
            --workers <workers>

    where ``<stem>`` is the file name with only its final extension removed.
    A failing file does not stop the batch; failures are reported at the end.

    Args:
        input_dir: Directory that is searched (non-recursively) for
            ``*.json`` files.
        output_dir: Directory used to build each ``--output-prefix``. It is
            created if it does not exist and there is at least one input.
        script: Path of the preprocessing script to execute.
        tokenizer_type: Value forwarded as ``--tokenizer-type``.
        tokenizer_model: Value forwarded as ``--tokenizer-model``.
        workers: Value forwarded as ``--workers``.

    Returns:
        The number of input files for which the script exited with a
        non-zero status (``0`` when everything succeeded or nothing matched).
    """
    import glob
    import os
    import subprocess
    import sys

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; keep working when tqdm is missing.
        def tqdm(iterable, **_kwargs):
            return iterable

    # Sorted so the processing order is reproducible between runs.
    json_paths = sorted(glob.glob(os.path.join(input_dir, '*.json')))
    print(f'num files: {len(json_paths)}')
    if not json_paths:
        return 0

    # Harmless when the directory already exists.
    os.makedirs(output_dir, exist_ok=True)

    # Reuse the interpreter running this script so the child process sees the
    # same environment; fall back to PATH lookup if it is unknown.
    interpreter = sys.executable or 'python'

    failures = []
    for json_path in tqdm(json_paths):
        # splitext strips only the final ".json", so dotted names such as
        # "a.b.json" keep their full stem ("a.b").
        sname = os.path.splitext(os.path.basename(json_path))[0]
        print(f'Input json filename: {sname}')
        # An argument list (no shell) keeps spaces and shell metacharacters
        # in paths from being re-interpreted.
        cmd = [
            interpreter, script,
            '--input', json_path,
            '--output-prefix', os.path.join(output_dir, sname),
            '--tokenizer-type', str(tokenizer_type),
            '--tokenizer-model', str(tokenizer_model),
            '--workers', str(workers),
        ]
        returned_value = subprocess.run(cmd).returncode
        if returned_value != 0:
            failures.append((json_path, returned_value))
            print(f'Preprocessing failed for {json_path} (exit code {returned_value})')

    if failures:
        print(f'{len(failures)} of {len(json_paths)} files failed to preprocess')
    return len(failures)
# Script entry point: run the batch conversion only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
# Example of the equivalent single-file invocation:
# python preprocess_data.py --input ./protein_gym/indels/DMS_ProteinGym_indels_multi_prop_fit_meg-ds/HIS7_YEAST_Pokusaeva_2019_indels_multi_prop_fit_pref.json --output-prefix ./protein_gym/indels/DMS_ProteinGym_indels_multi_prop_fit_meg-ds_bin-idx/HIS7_YEAST_Pokusaeva_2019_indels_multi_prop_fit_pref --tokenizer-type Llama2Tokenizer --tokenizer-model /lus/eagle/projects/datasets/dolma/utils/tokenizer.model --workers 16
''