From 895f6e0f320e7f4219d44c8b4b0e22258852e2e7 Mon Sep 17 00:00:00 2001 From: Erfan Al-Hossami Date: Fri, 24 Dec 2021 02:12:12 -0500 Subject: [PATCH 1/8] Added loading from bucket --- generate_indexes.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/generate_indexes.py b/generate_indexes.py index ef3ced65..7938362d 100644 --- a/generate_indexes.py +++ b/generate_indexes.py @@ -1,7 +1,7 @@ import argparse from pathlib import Path - +import os parser = argparse.ArgumentParser() parser.add_argument( "--gs_project_id", @@ -15,11 +15,13 @@ "--output_dir", type=str, help="Where the indexes will be stored locally." ) +parser.add_argument ("--load_bucket", type=bool, help="Whether to load the folder structure from the Bucket directly", default = False, nargs = '?') + args = parser.parse_args() input_dir = Path(args.input_dir) root_dir = input_dir.name output_dir = Path(args.output_dir) - +load_bucket = args.load_bucket # get the list of tfrecords # train_tfrecords = [ # str(f) @@ -32,18 +34,23 @@ # if "valid" in str(f) # ] # construct index file paths in the format of a gs bucket -train_indexes = [ - f"gs://{args.gs_project_id}/{root_dir}/{str(f).split(f'/{root_dir}/')[-1]}" - for f in input_dir.glob("**/*.tfrecords") - if "train" in str(f) -] -print(train_indexes[:5]) -val_indexes = [ - f"gs://{args.gs_project_id}/{root_dir}/{str(f).split(f'/{root_dir}/')[-1]}" - for f in input_dir.glob("**/*.tfrecords") - if "valid" in str(f) -] +if load_bucket == False: + train_indexes = [ + f"gs://{args.gs_project_id}/{root_dir}/{str(f).split(f'/{root_dir}/')[-1]}" + for f in input_dir.glob("**/*.tfrecords") + if "train" in str(f) + ] + print(train_indexes[:5]) + val_indexes = [ + f"gs://{args.gs_project_id}/{root_dir}/{str(f).split(f'/{root_dir}/')[-1]}" + for f in input_dir.glob("**/*.tfrecords") + if "valid" in str(f) + ] +else: + list_files = os.popen(f'gsutil ls -r gs://{args.gs_project_id}/{root_dir}').read().split('\n') + train_indexes = [f for f in list_files if 'train' in str(f) and '.tfrecords' in str(f)] + val_indexes = [f for f in list_files if 'valid' in str(f) and '.tfrecords' in str(f)] with open(output_dir / "code_clippy.train.index", "w") as f: f.write("\n".join(train_indexes)) From 4b30c3ee88bd1dd0bbcf5fb876f035e5d1339752 Mon Sep 17 00:00:00 2001 From: Erfan Al-Hossami Date: Fri, 24 Dec 2021 02:12:57 -0500 Subject: [PATCH 2/8] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 29951424..9e747c0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy~=1.19.5 -tqdm~=4.45.0 +tqdm wandb>=0.11.2 einops~=0.3.0 requests~=2.25.1 @@ -19,4 +19,4 @@ func_timeout ftfy fastapi uvicorn -pathy \ No newline at end of file +pathy From 42aeeb3122d2601d4c519c36ca42330bf9eb73cd Mon Sep 17 00:00:00 2001 From: Erfan Al-Hossami Date: Fri, 31 Dec 2021 04:38:33 -0500 Subject: [PATCH 3/8] Update code_clippy_6B.json --- configs/code_clippy_6B.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/configs/code_clippy_6B.json b/configs/code_clippy_6B.json index f62e5987..bd27daa1 100644 --- a/configs/code_clippy_6B.json +++ b/configs/code_clippy_6B.json @@ -10,19 +10,19 @@ "seq": 2048, "cores_per_replica": 8, "per_replica_batch": 1, - "gradient_accumulation_steps": 16, + "gradient_accumulation_steps": 32, - "warmup_steps": 3000, - "anneal_steps": 300000, - "lr": 1.2e-4, - "end_lr": 1.2e-5, + "warmup_steps": 6200, + "anneal_steps": 613800, + "lr": 1e-5, + "end_lr": 1e-6, "weight_decay": 0.1, - "total_steps": 350000, + "total_steps": 620000, "tpu_size": 8, "bucket": "code-clippy-bucket", - "model_dir": "code_clippy_6B", + "model_dir": "code_clippy_6B_v2", "train_set": "code_clippy.train.index", "val_set": { @@ -36,7 +36,7 @@ "ckpt_every": 500, "keep_every": 10000, - "name": "code_clippy_6B", + "name": "code_clippy_6B_v2", "wandb_project": "mesh-transformer-jax", - "comment": "" + "comment": "Decreased learning rate and increased gradient steps" } From b2c0b8e01e996b37bc5e9b72102401b20e6e6af6 Mon Sep 17 00:00:00 2001 From: Erfan Al-Hossami Date: Fri, 31 Dec 2021 04:39:12 -0500 Subject: [PATCH 4/8] Update device_train.py added an exception for faulty data samples --- device_train.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/device_train.py b/device_train.py index e1131742..765b24bf 100644 --- a/device_train.py +++ b/device_train.py @@ -106,6 +106,7 @@ def save(network, step, bucket, path, mp, aux=None, keep_n=3, delete_old=True): def train_step(network, data): + inputs = { "obs": data[:, :, :-1], "target": data[:, :, 1:], @@ -324,9 +325,14 @@ def eval_step(network, data): exit() start = time.time() - loss, last_loss, grad_norm, grad_norm_micro = train_step( - network, train_dataset.get_samples() - ) + try: + loss, last_loss, grad_norm, grad_norm_micro = train_step( + network, train_dataset.get_samples() + ) + except: + print(f'Skipped this batch bc of faulty sample.\n File name:{train_dataset.get_state()}') + wandb.log(train_dataset.get_state(),step) + continue step += 1 steps_per_sec = 1 / (time.time() - start) @@ -393,6 +399,7 @@ def eval_step(network, data): "train/learning_rate": float(scheduler(network.state["opt_state"][-1].count[0].item())), "sequences_processed": sequences_processed, "tokens_processed": tokens_processed, + # "clip_global_gradient_norm": clip_by_global_norm(1)[1] } wandb_stats.update(noise_scale_stats) From acda66bf4981576bc4551eddf83ba05691c638a3 Mon Sep 17 00:00:00 2001 From: Erfan Al-Hossami Date: Fri, 31 Dec 2021 05:00:20 -0500 Subject: [PATCH 5/8] Update device_train.py --- device_train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/device_train.py b/device_train.py index 765b24bf..cfd50f15 100644 --- a/device_train.py +++ b/device_train.py @@ -399,6 +399,7 @@ def eval_step(network, data): "train/learning_rate": float(scheduler(network.state["opt_state"][-1].count[0].item())), "sequences_processed": sequences_processed, "tokens_processed": tokens_processed, + #visualize clipped gradients # "clip_global_gradient_norm": clip_by_global_norm(1)[1] } wandb_stats.update(noise_scale_stats) From 62c50ebc9982af9c890ec7d9b3e2349dd09411fc Mon Sep 17 00:00:00 2001 From: Erfan Al-Hossami Date: Sun, 27 Mar 2022 22:46:16 -0400 Subject: [PATCH 6/8] Update generate_indexes.py --- generate_indexes.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/generate_indexes.py b/generate_indexes.py index 7938362d..db22db26 100644 --- a/generate_indexes.py +++ b/generate_indexes.py @@ -51,8 +51,13 @@ list_files = os.popen(f'gsutil ls -r gs://{args.gs_project_id}/{root_dir}').read().split('\n') train_indexes = [f for f in list_files if 'train' in str(f) and '.tfrecords' in str(f)] val_indexes = [f for f in list_files if 'valid' in str(f) and '.tfrecords' in str(f)] + with open(output_dir / "code_clippy.train.index", "w") as f: f.write("\n".join(train_indexes)) with open(output_dir / "code_clippy.val.index", "w") as f: f.write("\n".join(val_indexes)) + + + + From 9626f0d4363a65b7fbd03d6f559ef2be636c5450 Mon Sep 17 00:00:00 2001 From: Erfan Al-Hossami Date: Sun, 27 Mar 2022 22:46:39 -0400 Subject: [PATCH 7/8] Update requirements.txt --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9e747c0d..aa96d0d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy~=1.19.5 -tqdm +tqdm~=4.62.3 wandb>=0.11.2 einops~=0.3.0 requests~=2.25.1 @@ -17,6 +17,6 @@ transformers smart_open[gcs] func_timeout ftfy -fastapi -uvicorn -pathy +fastapi~=0.74.1 +uvicorn~=0.2.2 +pathy~=0.6.1 From e5504d15ca99fdcf547aebf3806fddf4d2728e61 Mon Sep 17 00:00:00 2001 From: Erfan Al-Hossami Date: Sun, 27 Mar 2022 22:48:03 -0400 Subject: [PATCH 8/8] Create load_data_to_hf.py --- load_data_to_hf.py | 78 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 load_data_to_hf.py diff --git a/load_data_to_hf.py b/load_data_to_hf.py new file mode 100644 index 00000000..8b271462 --- /dev/null +++ b/load_data_to_hf.py @@ -0,0 +1,78 @@ +""" +This script loads the data from Google Cloud Storage to a Huggingface dataset repository. +""" + +import argparse + +from pathlib import Path +import os +import time +import shutil +# from smart_open import open +# from google.cloud import storage +# from google.cloud.exceptions import NotFound + +parser = argparse.ArgumentParser() +parser.add_argument( + "--gs_project_id", + type=str, + help="Google Cloud Storage project ID", + default = 'code-clippy-bucket' +) + +parser.add_argument( + "--output_dir", type=str, help="Where the files will be temprorarily stored locally.", default = "../code_clippy_github/" +) + + +parser.add_argument( + "--input_dir", type=str, help="Where the dataset is stored on the GCS.", default = "code-clippy-dataset" +) + + +args = parser.parse_args() +root_dir = args.input_dir +output_dir = Path(args.output_dir) +# load_bucket = args.load_bucket + +os.chdir(output_dir) + +list_files = os.popen(f'gsutil ls -r gs://{args.gs_project_id}/{root_dir}').read().split('\n') + +uploaded_files = os.popen(f'ls -r {output_dir}').read().split('\n') + +#print(uploaded_files) +json_files_list = [f for f in list_files if '.json.gz' in str(f) and str(f.split('/')[-1]) not in uploaded_files] + +print(json_files_list) + +commited_files = [] + +for commit_num , file_path in enumerate(json_files_list,1): + + os.system( f'gsutil cp {file_path} {file_path.split("/")[-1]}') + time.sleep(0.5) + os.system( f'git add {file_path.split("/")[-1]}') + + commited_files.append(file_path.split("/")[-1]) + + if commit_num % 20 == 0: + time.sleep(1) + os.system(f'git commit -m \" adding dataset from GCS {commit_num}\"') + time.sleep(1) + os.system(f'git push https://USERNAME:PASSWORD@huggingface.co/datasets/repo.git') + + time.sleep(1) + + while len(commited_files) > 0: + os.remove(f'{commited_files.pop(0)}') + + print('Done Deleting') + + if commit_num % 200 == 0: + os.chdir('..') + shutil.rmtree(f'{str(output_dir).split("/")[-1]}') + # os.remove(f'{str(output_dir).split("/")[-1]}') + os.system(f'GIT_LFS_SKIP_SMUDGE=1 git clone https://USERNAME:PASSWORD@huggingface.co/datasets/CodedotAI/code_clippy_github.git') + os.chdir(f'{str(output_dir).split("/")[-1]}') + print(f'Completion: {commit_num/len(json_files_list) * 100} %')