From 895f6e0f320e7f4219d44c8b4b0e22258852e2e7 Mon Sep 17 00:00:00 2001
From: Erfan Al-Hossami <ealhossa@uncc.edu>
Date: Fri, 24 Dec 2021 02:12:12 -0500
Subject: [PATCH 1/8] Added loading from bucket

---
 generate_indexes.py | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/generate_indexes.py b/generate_indexes.py
index ef3ced65..7938362d 100644
--- a/generate_indexes.py
+++ b/generate_indexes.py
@@ -1,7 +1,7 @@
 import argparse
 
 from pathlib import Path
-
+import os
 parser = argparse.ArgumentParser()
 parser.add_argument(
     "--gs_project_id",
@@ -15,11 +15,13 @@
     "--output_dir", type=str, help="Where the indexes will be stored locally."
 )
 
+parser.add_argument("--load_bucket", type=lambda v: v.strip().lower() in ("1", "true", "t", "yes", "y"), nargs="?", const=True, default=False, help="Whether to load the folder structure from the Bucket directly")  # type=bool would turn any non-empty value (even "False") into True; a bare --load_bucket means True
+
 args = parser.parse_args()
 input_dir = Path(args.input_dir)
 root_dir = input_dir.name
 output_dir = Path(args.output_dir)
-
+load_bucket = args.load_bucket
 # get the list of tfrecords
 # train_tfrecords = [
 #     str(f)
@@ -32,18 +34,23 @@
 #     if "valid" in str(f)
 # ]
 # construct index file paths in the format of a gs bucket
-train_indexes = [
-    f"gs://{args.gs_project_id}/{root_dir}/{str(f).split(f'/{root_dir}/')[-1]}"
-    for f in input_dir.glob("**/*.tfrecords")
-    if "train" in str(f)
-]
-print(train_indexes[:5])
-val_indexes = [
-    f"gs://{args.gs_project_id}/{root_dir}/{str(f).split(f'/{root_dir}/')[-1]}"
-    for f in input_dir.glob("**/*.tfrecords")
-    if "valid" in str(f)
-]
+if load_bucket  == False:
+    train_indexes = [
+                        f"gs://{args.gs_project_id}/{root_dir}/{str(f).split(f'/{root_dir}/')[-1]}"
+                        for f in input_dir.glob("**/*.tfrecords")
+                        if "train" in str(f)
+                    ]
+    print(train_indexes[:5])
+    val_indexes = [
+                        f"gs://{args.gs_project_id}/{root_dir}/{str(f).split(f'/{root_dir}/')[-1]}"
+                        for f in input_dir.glob("**/*.tfrecords")
+                        if "valid" in str(f)
+                        ]
 
+else:
+    list_files = os.popen(f'gsutil ls -r gs://{args.gs_project_id}/{root_dir}').read().split('\n')
+    train_indexes = [f for f in list_files if 'train' in str(f) and '.tfrecords' in str(f)]
+    val_indexes = [f for f in list_files if 'valid' in str(f) and '.tfrecords' in str(f)]
 with open(output_dir / "code_clippy.train.index", "w") as f:
     f.write("\n".join(train_indexes))
 

From 4b30c3ee88bd1dd0bbcf5fb876f035e5d1339752 Mon Sep 17 00:00:00 2001
From: Erfan Al-Hossami <ealhossa@uncc.edu>
Date: Fri, 24 Dec 2021 02:12:57 -0500
Subject: [PATCH 2/8] Update requirements.txt

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 29951424..9e747c0d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 numpy~=1.19.5
-tqdm~=4.45.0
+tqdm
 wandb>=0.11.2
 einops~=0.3.0
 requests~=2.25.1
@@ -19,4 +19,4 @@ func_timeout
 ftfy
 fastapi
 uvicorn
-pathy
\ No newline at end of file
+pathy

From 42aeeb3122d2601d4c519c36ca42330bf9eb73cd Mon Sep 17 00:00:00 2001
From: Erfan Al-Hossami <ealhossa@uncc.edu>
Date: Fri, 31 Dec 2021 04:38:33 -0500
Subject: [PATCH 3/8] Update code_clippy_6B.json

---
 configs/code_clippy_6B.json | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/configs/code_clippy_6B.json b/configs/code_clippy_6B.json
index f62e5987..bd27daa1 100644
--- a/configs/code_clippy_6B.json
+++ b/configs/code_clippy_6B.json
@@ -10,19 +10,19 @@
     "seq": 2048,
     "cores_per_replica": 8,
     "per_replica_batch": 1,
-    "gradient_accumulation_steps": 16,
+    "gradient_accumulation_steps": 32, 
   
-    "warmup_steps": 3000,
-    "anneal_steps": 300000,
-    "lr": 1.2e-4,
-    "end_lr": 1.2e-5,
+    "warmup_steps": 6200,
+    "anneal_steps": 613800,
+    "lr": 1e-5,
+    "end_lr": 1e-6,
     "weight_decay": 0.1,
-    "total_steps": 350000,
+    "total_steps": 620000,
   
     "tpu_size": 8,
   
     "bucket": "code-clippy-bucket",
-    "model_dir": "code_clippy_6B",
+    "model_dir": "code_clippy_6B_v2",
   
     "train_set": "code_clippy.train.index",
     "val_set": {
@@ -36,7 +36,7 @@
     "ckpt_every": 500,
     "keep_every": 10000,
   
-    "name": "code_clippy_6B",
+    "name": "code_clippy_6B_v2",
     "wandb_project": "mesh-transformer-jax",
-    "comment": ""
+    "comment": "Decreased learning rate and increased gradient steps"
   }

From b2c0b8e01e996b37bc5e9b72102401b20e6e6af6 Mon Sep 17 00:00:00 2001
From: Erfan Al-Hossami <ealhossa@uncc.edu>
Date: Fri, 31 Dec 2021 04:39:12 -0500
Subject: [PATCH 4/8] Update device_train.py

added an exception for faulty data samples
---
 device_train.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/device_train.py b/device_train.py
index e1131742..765b24bf 100644
--- a/device_train.py
+++ b/device_train.py
@@ -106,6 +106,7 @@ def save(network, step, bucket, path, mp, aux=None, keep_n=3, delete_old=True):
 
 
 def train_step(network, data):
+
     inputs = {
         "obs": data[:, :, :-1],
         "target": data[:, :, 1:],
@@ -324,9 +325,14 @@ def eval_step(network, data):
                 exit()
 
             start = time.time()
-            loss, last_loss, grad_norm, grad_norm_micro = train_step(
-                network, train_dataset.get_samples()
-            )
+            try:
+                loss, last_loss, grad_norm, grad_norm_micro = train_step(
+                    network, train_dataset.get_samples()
+                )
+            except Exception as err:  # not a bare except: Ctrl-C / SystemExit must still be able to stop training
+                print(f'Skipped this batch bc of faulty sample ({err!r}).\n File name:{train_dataset.get_state()}')
+                wandb.log(train_dataset.get_state(),step)
+                continue
             step += 1
 
             steps_per_sec = 1 / (time.time() - start)
@@ -393,6 +399,7 @@ def eval_step(network, data):
                 "train/learning_rate": float(scheduler(network.state["opt_state"][-1].count[0].item())),
                 "sequences_processed": sequences_processed,
                 "tokens_processed": tokens_processed,
+                # "clip_global_gradient_norm": clip_by_global_norm(1)[1]
             }
             wandb_stats.update(noise_scale_stats)
 

From acda66bf4981576bc4551eddf83ba05691c638a3 Mon Sep 17 00:00:00 2001
From: Erfan Al-Hossami <ealhossa@uncc.edu>
Date: Fri, 31 Dec 2021 05:00:20 -0500
Subject: [PATCH 5/8] Update device_train.py

---
 device_train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/device_train.py b/device_train.py
index 765b24bf..cfd50f15 100644
--- a/device_train.py
+++ b/device_train.py
@@ -399,6 +399,7 @@ def eval_step(network, data):
                 "train/learning_rate": float(scheduler(network.state["opt_state"][-1].count[0].item())),
                 "sequences_processed": sequences_processed,
                 "tokens_processed": tokens_processed,
+                #visualize clipped gradients
                 # "clip_global_gradient_norm": clip_by_global_norm(1)[1]
             }
             wandb_stats.update(noise_scale_stats)

From 62c50ebc9982af9c890ec7d9b3e2349dd09411fc Mon Sep 17 00:00:00 2001
From: Erfan Al-Hossami <ealhossa@uncc.edu>
Date: Sun, 27 Mar 2022 22:46:16 -0400
Subject: [PATCH 6/8] Update generate_indexes.py

---
 generate_indexes.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/generate_indexes.py b/generate_indexes.py
index 7938362d..db22db26 100644
--- a/generate_indexes.py
+++ b/generate_indexes.py
@@ -51,8 +51,13 @@
     list_files = os.popen(f'gsutil ls -r gs://{args.gs_project_id}/{root_dir}').read().split('\n')
     train_indexes = [f for f in list_files if 'train' in str(f) and '.tfrecords' in str(f)]
     val_indexes = [f for f in list_files if 'valid' in str(f) and '.tfrecords' in str(f)]
+
 with open(output_dir / "code_clippy.train.index", "w") as f:
     f.write("\n".join(train_indexes))
 
 with open(output_dir / "code_clippy.val.index", "w") as f:
     f.write("\n".join(val_indexes))
+
+
+
+    

From 9626f0d4363a65b7fbd03d6f559ef2be636c5450 Mon Sep 17 00:00:00 2001
From: Erfan Al-Hossami <ealhossa@uncc.edu>
Date: Sun, 27 Mar 2022 22:46:39 -0400
Subject: [PATCH 7/8] Update requirements.txt

---
 requirements.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9e747c0d..aa96d0d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 numpy~=1.19.5
-tqdm
+tqdm~=4.62.3
 wandb>=0.11.2
 einops~=0.3.0
 requests~=2.25.1
@@ -17,6 +17,6 @@ transformers
 smart_open[gcs]
 func_timeout
 ftfy
-fastapi
-uvicorn
-pathy
+fastapi~=0.74.1
+uvicorn~=0.2.2  # NOTE(review): this pins the 0.2.x series, which looks far older than the fastapi~=0.74.1 pin above - confirm the intended version
+pathy~=0.6.1

From e5504d15ca99fdcf547aebf3806fddf4d2728e61 Mon Sep 17 00:00:00 2001
From: Erfan Al-Hossami <ealhossa@uncc.edu>
Date: Sun, 27 Mar 2022 22:48:03 -0400
Subject: [PATCH 8/8] Create load_data_to_hf.py

---
 load_data_to_hf.py | 78 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 load_data_to_hf.py

diff --git a/load_data_to_hf.py b/load_data_to_hf.py
new file mode 100644
index 00000000..8b271462
--- /dev/null
+++ b/load_data_to_hf.py
@@ -0,0 +1,130 @@
+"""
+Upload the ``.json.gz`` shards of a dataset from Google Cloud Storage to a
+Huggingface dataset repository.
+
+Run it against an existing local git clone of the dataset repository
+(``--output_dir``). Shards are copied with ``gsutil``, staged with ``git add``
+and committed/pushed in small batches; local copies are deleted after each
+push. ``gsutil`` and ``git`` must be on the PATH.
+"""
+
+import argparse
+import os
+import shutil
+import subprocess
+import time
+from pathlib import Path
+
+# NOTE(review): USERNAME:PASSWORD are placeholders. Do not commit real
+# credentials; prefer a git credential helper or an access token.
+# NOTE(review): the push target ("repo.git") differs from the clone source;
+# both URLs are kept exactly as written -- confirm the intended repository.
+PUSH_URL = "https://USERNAME:PASSWORD@huggingface.co/datasets/repo.git"
+CLONE_URL = "https://USERNAME:PASSWORD@huggingface.co/datasets/CodedotAI/code_clippy_github.git"
+
+COMMIT_EVERY = 20  # files per commit/push
+# Files between fresh re-clones. Must stay a multiple of COMMIT_EVERY so that
+# nothing is staged-but-unpushed when the working tree is deleted.
+RECLONE_EVERY = 200
+
+
+def run(*cmd, **kwargs):
+    """Run *cmd* without a shell; raise CalledProcessError on a non-zero exit."""
+    subprocess.run(cmd, check=True, **kwargs)
+
+
+def commit_and_push(batch, commit_num):
+    """Commit the staged files in *batch*, push, then delete the local copies.
+
+    *batch* is emptied in place. Files are only removed after a successful
+    push, so a failed push never discards data.
+    """
+    time.sleep(1)
+    run("git", "commit", "-m", f" adding dataset from GCS {commit_num}")
+    time.sleep(1)
+    run("git", "push", PUSH_URL)
+    time.sleep(1)
+    for name in batch:
+        os.remove(name)
+    batch.clear()
+    print('Done Deleting')
+
+
+def reclone(repo_dir):
+    """Replace *repo_dir* with a fresh clone and make it the working directory.
+
+    GIT_LFS_SKIP_SMUDGE=1 makes git-lfs check out pointer files instead of
+    downloading the objects (presumably the point is to reclaim disk space).
+    """
+    os.chdir(repo_dir.parent)
+    shutil.rmtree(repo_dir)
+    # Clone into repo_dir explicitly instead of relying on the URL's basename
+    # matching the directory name.
+    run(
+        "git", "clone", CLONE_URL, str(repo_dir),
+        env={**os.environ, "GIT_LFS_SKIP_SMUDGE": "1"},
+    )
+    os.chdir(repo_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--gs_project_id",
+        type=str,
+        help="Google Cloud Storage project ID",
+        default='code-clippy-bucket',
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        help="Where the files will be temporarily stored locally.",
+        default="../code_clippy_github/",
+    )
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        help="Where the dataset is stored on the GCS.",
+        default="code-clippy-dataset",
+    )
+    args = parser.parse_args()
+
+    # Resolve before chdir so a relative --output_dir keeps naming the same place.
+    output_dir = Path(args.output_dir).resolve()
+    os.chdir(output_dir)
+
+    remote_listing = subprocess.check_output(
+        ["gsutil", "ls", "-r", f"gs://{args.gs_project_id}/{args.input_dir}"],
+        universal_newlines=True,
+    ).splitlines()
+
+    # Skip shards whose file name already exists in the local clone.
+    present = set(os.listdir("."))
+    json_files_list = [
+        path for path in remote_listing
+        if '.json.gz' in path and path.rsplit("/", 1)[-1] not in present
+    ]
+    print(json_files_list)
+
+    pending = []  # staged with `git add` but not yet committed
+    for commit_num, file_path in enumerate(json_files_list, 1):
+        file_name = file_path.rsplit("/", 1)[-1]
+        run("gsutil", "cp", file_path, file_name)
+        time.sleep(0.5)
+        run("git", "add", file_name)
+        pending.append(file_name)
+
+        if commit_num % COMMIT_EVERY == 0:
+            commit_and_push(pending, commit_num)
+
+        if commit_num % RECLONE_EVERY == 0:
+            reclone(output_dir)
+            print(f'Completion: {commit_num/len(json_files_list) * 100} %')
+
+    # Flush the last partial batch; the periodic check above never reaches it.
+    if pending:
+        commit_and_push(pending, len(json_files_list))
+
+
+if __name__ == "__main__":
+    main()