From 54810cedee7a809fe8393ad5dfba2fe3525cf5a5 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 5 Nov 2021 17:12:21 +0100
Subject: [PATCH 01/37] First draft of how I think seqio works

---
 t5x/configs/dataset/pile/task.py  | 57 +++++++++++++++++++++
 t5x/configs/dataset/pile/utils.py | 82 +++++++++++++++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 t5x/configs/dataset/pile/task.py
 create mode 100644 t5x/configs/dataset/pile/utils.py

diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py
new file mode 100644
index 000000000..cbf48d534
--- /dev/null
+++ b/t5x/configs/dataset/pile/task.py
@@ -0,0 +1,57 @@
+import functools
+import seqio
+from t5.data import preprocessors
+
+from t5x.configs.dataset.pile.utils import PileDatasetFnCallable
+
+vocabulary = seqio.SentencePieceVocabulary(
+    'gs://t5-data/vocabs/cc_all.32000/sentencepiece.model', extra_ids=100)
+output_features = {
+    'inputs': seqio.Feature(vocabulary=vocabulary),
+    'targets': seqio.Feature(vocabulary=vocabulary)
+}
+
+DEFAULT_OUTPUT_FEATURES = {
+    "inputs": seqio.Feature(
+        vocabulary=vocabulary, add_eos=True,
+        required=False),
+    "targets": seqio.Feature(
+        vocabulary=vocabulary, add_eos=True)
+}
+
+seqio.TaskRegistry.add(
+    'pile_t2t_span_corruption',
+    source=seqio.FunctionDataSource(dataset_fn=PileDatasetFnCallable(), splits=["train", "val"]),
+    preprocessors=[
+        functools.partial(
+            preprocessors.rekey, key_map={
+                "inputs": None,
+                "targets": "text"
+            }),
+        seqio.preprocessors.tokenize,
+        seqio.CacheDatasetPlaceholder(required=True),
+        preprocessors.span_corruption,
+        seqio.preprocessors.append_eos_after_trim,
+    ],
+    output_features=DEFAULT_OUTPUT_FEATURES,
+    metric_fns=[]
+)
+
+# Prefix language modeling pretraining task used in Raffel et al., 2019.
+seqio.TaskRegistry.add(
+    "pile_t2t_prefix_lm",
+    source=seqio.FunctionDataSource(dataset_fn=PileDatasetFnCallable(), splits=["train", "val"]),
+    preprocessors=[
+        functools.partial(
+            preprocessors.rekey, key_map={
+                "inputs": None,
+                "targets": "text"
+            }),
+        seqio.preprocessors.tokenize,
+        seqio.CacheDatasetPlaceholder(required=True),
+        preprocessors.prefix_lm,
+        seqio.preprocessors.append_eos_after_trim,
+    ],
+    output_features=DEFAULT_OUTPUT_FEATURES,
+    metric_fns=[]
+)
\ No newline at end of file
diff --git a/t5x/configs/dataset/pile/utils.py b/t5x/configs/dataset/pile/utils.py
new file mode 100644
index 000000000..1e85af57c
--- /dev/null
+++ b/t5x/configs/dataset/pile/utils.py
@@ -0,0 +1,82 @@
+from pathlib import Path
+from typing import Optional
+
+import seqio
+from datasets import load_dataset
+import tensorflow as tf
+
+def load_from_local(dataset_dir: Path):
+    dataset_list = {
+        "train": [
+            "train/00.jsonl",
+            "train/01.jsonl",
+            "train/02.jsonl",
+            "train/03.jsonl",
+            "train/04.jsonl",
+            "train/05.jsonl",
+            "train/06.jsonl",
+            "train/07.jsonl",
+            "train/08.jsonl",
+            "train/09.jsonl",
+            "train/10.jsonl",
+            "train/11.jsonl",
+            "train/12.jsonl",
+            "train/13.jsonl",
+            "train/14.jsonl",
+            "train/15.jsonl",
+            "train/16.jsonl",
+            "train/17.jsonl",
+            "train/18.jsonl",
+            "train/19.jsonl",
+            "train/20.jsonl",
+            "train/21.jsonl",
+            "train/22.jsonl",
+            "train/23.jsonl",
+            "train/24.jsonl",
+            "train/25.jsonl",
+            "train/26.jsonl",
+            "train/27.jsonl",
+            "train/28.jsonl",
+            "train/29.jsonl"
+        ],
+        "test": [
+            "test.jsonl"
+        ],
+        "val": [
+            "val.jsonl"
+        ],
+    }
+
+    for split_name, filepaths in dataset_list:
+        load_dataset("json", data_files=[f"{dataset_dir}/{filepath}" for filepath in filepaths], data="text")
+
+def load_from_urls():
+    remote_urls = {
+        "test": [
+            "https://the-eye.eu/public/AI/pile/test.jsonl.zst",
+        ]
+    }
+
+    return {split_name: load_dataset("json", data_files=urls, field="text") for split_name, urls in remote_urls}
+
+class PileDatasetFnCallable(seqio.DatasetFnCallable):
+    def __init__(self):
+        self.datasets = load_from_urls()
+
+    def __call__(
+        self,
+        split: str,
+        shuffle_files: bool,
+        seed: Optional[int] = None
+    ) -> tf.data.Dataset:
+        datasets = load_from_urls()
+        if split not in datasets:
+            raise ValueError(f"Unrecognized split value, got {split} expected {datasets.keys()}")
+
+        dataset = datasets[split]
+        return dataset.to_tf_dataset(
+            columns="text",
+            batch_size=1000,
+            shuffle=shuffle_files
+        )
+

From 9a3146cfbcf3a1ffd77df11d7eff7798ce0af21f Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 5 Nov 2021 17:44:13 +0100
Subject: [PATCH 02/37] Improve pipeline

---
 t5x/configs/dataset/pile/task.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py
index cbf48d534..89261ded1 100644
--- a/t5x/configs/dataset/pile/task.py
+++ b/t5x/configs/dataset/pile/task.py
@@ -1,8 +1,7 @@
 import functools
 import seqio
-from t5.data import preprocessors
-
-from t5x.configs.dataset.pile.utils import PileDatasetFnCallable
+from t5.data import preprocessors, utils
+import json as js
 
 vocabulary = seqio.SentencePieceVocabulary(
     'gs://t5-data/vocabs/cc_all.32000/sentencepiece.model', extra_ids=100)
@@ -19,10 +18,24 @@
         vocabulary=vocabulary, add_eos=True)
 }
 
+DATASET_FOLDER=""
+DATASET_SPLITS_TO_FILEPATTERN={
+    "train": f"{DATASET_FOLDER}/train/*.jsonl",
+    "val": f"{DATASET_FOLDER}/val.jsonl",
+    "test": f"{DATASET_FOLDER}/test.jsonl"
+}
+
+@utils.map_over_dataset
+def extract_text_from_json(json: str):
+    return js.loads(json)["text"]
+
 seqio.TaskRegistry.add(
     'pile_t2t_span_corruption',
-    source=seqio.FunctionDataSource(dataset_fn=PileDatasetFnCallable(), splits=["train", "val"]),
+    source=seqio.TextLineDataSource(
+        split_to_filepattern=DATASET_SPLITS_TO_FILEPATTERN,
+    ),
     preprocessors=[
+        extract_text_from_json,
         functools.partial(
             preprocessors.rekey, key_map={
                 "inputs": None,
@@ -40,8 +53,11 @@
 # Prefix language modeling pretraining task used in Raffel et al., 2019.
 seqio.TaskRegistry.add(
     "pile_t2t_prefix_lm",
-    source=seqio.FunctionDataSource(dataset_fn=PileDatasetFnCallable(), splits=["train", "val"]),
+    source=seqio.TextLineDataSource(
+        split_to_filepattern=DATASET_SPLITS_TO_FILEPATTERN,
+    ),
     preprocessors=[
+        extract_text_from_json,
         functools.partial(
             preprocessors.rekey, key_map={
                 "inputs": None,

From bccaa5faebcb94a5f20bc21ce09c2d45b1590191 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:08:09 +0100
Subject: [PATCH 03/37] Added script to download pile to gcp bucket

---
 t5x/configs/dataset/pile/download_all_pile.py | 69 ++++++++++++++++
 t5x/configs/dataset/pile/task.py              | 19 ++++-
 t5x/configs/dataset/pile/utils.py             | 82 -------------------
 3 files changed, 84 insertions(+), 86 deletions(-)
 create mode 100644 t5x/configs/dataset/pile/download_all_pile.py
 delete mode 100644 t5x/configs/dataset/pile/utils.py

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
new file mode 100644
index 000000000..c40669afc
--- /dev/null
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -0,0 +1,69 @@
+import argparse
+import functools
+import subprocess
+from multiprocessing import Pool
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--procs", type=int, required=True, help="Number of processes."
+    )
+    parser.add_argument(
+        "--base-dir", type=str, required=True, help="Folder to download the document to"
+    )
+    return parser.parse_args()
+
+
+def download_and_unztd(relative_path, base_dir):
+    BASE_PILE_URL = "https://the-eye.eu/public/AI/pile"
+    local_path = f"{base_dir}/{relative_path}"
+
+    # Create folder
+    process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)])
+    process.wait()
+
+    # download files
+    process = subprocess.Popen(['wget', "-O", local_path , f"{BASE_PILE_URL}/{relative_path}"],
+                               stdout=subprocess.PIPE,
+                               stderr=subprocess.PIPE)
+    process.wait()
+
+    # decompress files
+    process = subprocess.Popen(['zstd', '-d', local_path],
+                               stdout=subprocess.PIPE,
+                               stderr=subprocess.PIPE)
+    process.wait()
+
+def main():
+    args = get_args()
+
+    pile_urls = {
+        "train": [
+            f"train/{i:02d}.jsonl.zst" for i in range(30)
+        ],
+        "test": [
+            f"test.jsonl.zst"
+        ],
+        "val": [
+            f"val.jsonl.zst"
+        ]
+    }
+    base_dir = args.base_dir
+    gcp_base = "gs://bigscience/pile/raw"
+
+    process = subprocess.Popen(["mkdir", "-p", base_dir])
+    process.wait()
+
+    pool = Pool(args.procs)
+
+    pool.imap(
+        functools.partial(download_and_unztd, base_dir=base_dir),
+        [local_path for _, local_paths in pile_urls for local_path in local_paths]
+    )
+
+    process = subprocess.Popen(["gsutil", "cp", "-r", base_dir, gcp_base])
+    process.wait()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py
index 89261ded1..ec3f909c7 100644
--- a/t5x/configs/dataset/pile/task.py
+++ b/t5x/configs/dataset/pile/task.py
@@ -1,5 +1,7 @@
 import functools
+
 import seqio
+from seqio import feature_converters
 from t5.data import preprocessors, utils
 import json as js
 
@@ -18,7 +20,7 @@
         vocabulary=vocabulary, add_eos=True)
 }
 
-DATASET_FOLDER=""
+DATASET_FOLDER="gs://bigscience/pile/raw"
 DATASET_SPLITS_TO_FILEPATTERN={
     "train": f"{DATASET_FOLDER}/train/*.jsonl",
     "val": f"{DATASET_FOLDER}/val.jsonl",
@@ -42,7 +44,7 @@ def extract_text_from_json(json: str):
                 "targets": "text"
             }),
         seqio.preprocessors.tokenize,
-        seqio.CacheDatasetPlaceholder(required=True),
+        seqio.CacheDatasetPlaceholder(),
         preprocessors.span_corruption,
         seqio.preprocessors.append_eos_after_trim,
     ],
@@ -64,10 +66,19 @@ def extract_text_from_json(json: str):
                 "targets": "text"
             }),
         seqio.preprocessors.tokenize,
-        seqio.CacheDatasetPlaceholder(required=True),
+        seqio.CacheDatasetPlaceholder(),
         preprocessors.prefix_lm,
         seqio.preprocessors.append_eos_after_trim,
     ],
     output_features=DEFAULT_OUTPUT_FEATURES,
     metric_fns=[]
-)
\ No newline at end of file
+)
+
+if __name__ == "__main__":
+    task_feature_lengths = {"inputs": 7, "targets": 5}
+    converter = feature_converters.EncDecFeatureConverter(pack=True)
+    seqio.get_dataset(
+        "pile_t2t_span_corruption",
+        task_feature_lengths=task_feature_lengths,
+        feature_converter=converter,
+    )
\ No newline at end of file
diff --git a/t5x/configs/dataset/pile/utils.py b/t5x/configs/dataset/pile/utils.py
deleted file mode 100644
index 1e85af57c..000000000
--- a/t5x/configs/dataset/pile/utils.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from pathlib import Path
-from typing import Optional
-
-import seqio
-from datasets import load_dataset
-import tensorflow as tf
-
-def load_from_local(dataset_dir: Path):
-    dataset_list = {
-        "train": [
-            "train/00.jsonl",
-            "train/01.jsonl",
-            "train/02.jsonl",
-            "train/03.jsonl",
-            "train/04.jsonl",
-            "train/05.jsonl",
-            "train/06.jsonl",
-            "train/07.jsonl",
-            "train/08.jsonl",
-            "train/09.jsonl",
-            "train/10.jsonl",
-            "train/11.jsonl",
-            "train/12.jsonl",
-            "train/13.jsonl",
-            "train/14.jsonl",
-            "train/15.jsonl",
-            "train/16.jsonl",
-            "train/17.jsonl",
-            "train/18.jsonl",
-            "train/19.jsonl",
-            "train/20.jsonl",
-            "train/21.jsonl",
-            "train/22.jsonl",
-            "train/23.jsonl",
-            "train/24.jsonl",
-            "train/25.jsonl",
-            "train/26.jsonl",
-            "train/27.jsonl",
-            "train/28.jsonl",
-            "train/29.jsonl"
-        ],
-        "test": [
-            "test.jsonl"
-        ],
-        "val": [
-            "val.jsonl"
-        ],
-    }
-
-    for split_name, filepaths in dataset_list:
-        load_dataset("json", data_files=[f"{dataset_dir}/{filepath}" for filepath in filepaths], data="text")
-
-def load_from_urls():
-    remote_urls = {
-        "test": [
-            "https://the-eye.eu/public/AI/pile/test.jsonl.zst",
-        ]
-    }
-
-    return {split_name: load_dataset("json", data_files=urls, field="text") for split_name, urls in remote_urls}
-
-class PileDatasetFnCallable(seqio.DatasetFnCallable):
-    def __init__(self):
-        self.datasets = load_from_urls()
-
-    def __call__(
-        self,
-        split: str,
-        shuffle_files: bool,
-        seed: Optional[int] = None
-    ) -> tf.data.Dataset:
-        datasets = load_from_urls()
-        if split not in datasets:
-            raise ValueError(f"Unrecognized split value, got {split} expected {datasets.keys()}")
-
-        dataset = datasets[split]
-        return dataset.to_tf_dataset(
-            columns="text",
-            batch_size=1000,
-            shuffle=shuffle_files
-        )
-

From ff7a387025caee7f267aea54e79808ed331b9e8a Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:17:31 +0100
Subject: [PATCH 04/37] Upload and remove files progressively as I don't have
 enough disk space

---
 t5x/configs/dataset/pile/download_all_pile.py | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index c40669afc..b7af54be0 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -10,14 +10,14 @@ def get_args():
         "--procs", type=int, required=True, help="Number of processes."
     )
     parser.add_argument(
-        "--base-dir", type=str, required=True, help="Folder to download the document to"
+        "--local-base-dir", type=str, required=True, help="Folder to download the document to"
     )
     return parser.parse_args()
 
 
-def download_and_unztd(relative_path, base_dir):
+def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base):
     BASE_PILE_URL = "https://the-eye.eu/public/AI/pile"
-    local_path = f"{base_dir}/{relative_path}"
+    local_path = f"{local_base_dir}/{relative_path}"
 
     # Create folder
     process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)])
@@ -35,6 +35,14 @@ def download_and_unztd(relative_path, base_dir):
                                stderr=subprocess.PIPE)
     process.wait()
 
+    # upload to gcp
+    process = subprocess.Popen(["gsutil", "cp", "-r", local_path, f"{gcp_base}/{relative_path}"])
+    process.wait()
+
+    # delete file locally
+    process = subprocess.Popen(['rm', local_path])
+    process.wait()
+
 def main():
     args = get_args()
 
@@ -49,7 +57,7 @@ def main():
             f"val.jsonl.zst"
         ]
     }
-    base_dir = args.base_dir
+    local_base_dir = args.base_dir
     gcp_base = "gs://bigscience/pile/raw"
 
     process = subprocess.Popen(["mkdir", "-p", base_dir])
@@ -58,12 +66,9 @@ def main():
     pool = Pool(args.procs)
 
     pool.imap(
-        functools.partial(download_and_unztd, base_dir=base_dir),
+        functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base),
         [local_path for _, local_paths in pile_urls for local_path in local_paths]
     )
 
-    process = subprocess.Popen(["gsutil", "cp", "-r", base_dir, gcp_base])
-    process.wait()
-
 if __name__ == "__main__":
     main()
\ No newline at end of file

From adbc5a9288023e315da2b5521882d4f3f50ec0c5 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:19:36 +0100
Subject: [PATCH 05/37] Woops

---
 t5x/configs/dataset/pile/download_all_pile.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index b7af54be0..47b90bd4e 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -57,10 +57,10 @@ def main():
             f"val.jsonl.zst"
         ]
     }
-    local_base_dir = args.base_dir
+    local_base_dir = args.local_base_dir
     gcp_base = "gs://bigscience/pile/raw"
 
-    process = subprocess.Popen(["mkdir", "-p", base_dir])
+    process = subprocess.Popen(["mkdir", "-p", local_base_dir])
     process.wait()
 
     pool = Pool(args.procs)

From 4cb7ae8a85321d8846a45007ec2b694cbe6f1a6e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:20:02 +0100
Subject: [PATCH 06/37] Woops2

---
 t5x/configs/dataset/pile/download_all_pile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index 47b90bd4e..256782818 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -67,7 +67,7 @@ def main():
 
     pool.imap(
         functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base),
-        [local_path for _, local_paths in pile_urls for local_path in local_paths]
+        [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]
     )
 
 if __name__ == "__main__":

From 1096de7549704fd4dde4b1a067fa16cf680973ac Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:20:42 +0100
Subject: [PATCH 07/37] Convert to map instead

---
 t5x/configs/dataset/pile/download_all_pile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index 256782818..647112258 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -65,7 +65,7 @@ def main():
 
     pool = Pool(args.procs)
 
-    pool.imap(
+    pool.map(
         functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base),
         [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]
     )

From f53f21b0c81064612b96d26ec5552e9237e12b98 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:22:49 +0100
Subject: [PATCH 08/37] Woops 3

---
 t5x/configs/dataset/pile/download_all_pile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index 647112258..56f052b97 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -20,7 +20,7 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base):
     local_path = f"{local_base_dir}/{relative_path}"
 
     # Create folder
-    process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)])
+    process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)[0]])
     process.wait()
 
     # download files

From a1f6e0af43129901844bdaef0c52863e717a6cc7 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:39:25 +0100
Subject: [PATCH 09/37] Make it sequential

---
 t5x/configs/dataset/pile/download_all_pile.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index 56f052b97..7ec8474f7 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -2,7 +2,7 @@
 import functools
 import subprocess
 from multiprocessing import Pool
-
+import wget
 
 def get_args():
     parser = argparse.ArgumentParser()
@@ -24,9 +24,7 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base):
     process.wait()
 
     # download files
-    process = subprocess.Popen(['wget', "-O", local_path , f"{BASE_PILE_URL}/{relative_path}"],
-                               stdout=subprocess.PIPE,
-                               stderr=subprocess.PIPE)
+    wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path)
     process.wait()
 
     # decompress files
@@ -65,10 +63,12 @@ def main():
 
     pool = Pool(args.procs)
 
-    pool.map(
-        functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base),
-        [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]
-    )
+    # pool.map(
+    #     functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base),
+    #     [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]
+    # )
+    for local_path in [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]:
+        download_unztd_and_send_to_gcloud(local_path, local_base_dir=local_base_dir, gcp_base=gcp_base)
 
 if __name__ == "__main__":
     main()
\ No newline at end of file

From a9d69f7331790d874091c1e0dbce293b03727fdc Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:56:53 +0100
Subject: [PATCH 10/37] Remove recursive option

---
 t5x/configs/dataset/pile/download_all_pile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index 7ec8474f7..afbd6000b 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -34,7 +34,7 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base):
     process.wait()
 
     # upload to gcp
-    process = subprocess.Popen(["gsutil", "cp", "-r", local_path, f"{gcp_base}/{relative_path}"])
+    process = subprocess.Popen(["gsutil", "cp", local_path, f"{gcp_base}/{relative_path}"])
     process.wait()
 
     # delete file locally

From 68255a1d09cece029618757349dfa1e82cb8a396 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 17:59:52 +0100
Subject: [PATCH 11/37] Update script to remove uncompressed file as well

---
 t5x/configs/dataset/pile/download_all_pile.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index afbd6000b..ae054a1f8 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -33,13 +33,18 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base):
                                stderr=subprocess.PIPE)
     process.wait()
 
+    assert local_path.endswith(".zst")
+    local_uncompressed_path = local_path.removesuffix(".zst")
+    gcp_uncompressed_path = f"{gcp_base}/{relative_path.removesuffix('.zst')}"
+
     # upload to gcp
-    process = subprocess.Popen(["gsutil", "cp", local_path, f"{gcp_base}/{relative_path}"])
+    process = subprocess.Popen(["gsutil", "cp", local_uncompressed_path, gcp_uncompressed_path])
     process.wait()
 
     # delete file locally
     process = subprocess.Popen(['rm', local_path])
     process.wait()
+    process = subprocess.Popen(['rm', local_uncompressed_path])
 
 def main():
     args = get_args()

From 744cc3585d3f0687cf7fb3f3c3208a184fe971c1 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 18:25:18 +0100
Subject: [PATCH 12/37] Test out the rest of the script

---
 t5x/configs/dataset/pile/download_all_pile.py | 11 ++++++-----
 t5x/configs/dataset/pile/task.py              |  3 ++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index ae054a1f8..a6d5292ea 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -23,9 +23,9 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base):
     process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)[0]])
     process.wait()
 
-    # download files
-    wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path)
-    process.wait()
+    # # download files
+    # wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path)
+    # process.wait()
 
     # decompress files
     process = subprocess.Popen(['zstd', '-d', local_path],
@@ -34,8 +34,9 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base):
     process.wait()
 
     assert local_path.endswith(".zst")
-    local_uncompressed_path = local_path.removesuffix(".zst")
-    gcp_uncompressed_path = f"{gcp_base}/{relative_path.removesuffix('.zst')}"
+    local_uncompressed_path = local_path[:-4]
+    assert relative_path.endswith(".zst")
+    gcp_uncompressed_path = f"{gcp_base}/{relative_path[:-4]}"
 
     # upload to gcp
     process = subprocess.Popen(["gsutil", "cp", local_uncompressed_path, gcp_uncompressed_path])
diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py
index ec3f909c7..c61b18267 100644
--- a/t5x/configs/dataset/pile/task.py
+++ b/t5x/configs/dataset/pile/task.py
@@ -4,6 +4,7 @@
 from seqio import feature_converters
 from t5.data import preprocessors, utils
 import json as js
+import tensorflow as tf
 
 vocabulary = seqio.SentencePieceVocabulary(
     'gs://t5-data/vocabs/cc_all.32000/sentencepiece.model', extra_ids=100)
@@ -29,7 +30,7 @@
 
 @utils.map_over_dataset
 def extract_text_from_json(json: str):
-    return js.loads(json)["text"]
+    return tf.py_function(js.loads(json)["text"])
 
 seqio.TaskRegistry.add(
     'pile_t2t_span_corruption',

From e4502a66df27ae609fc8506f4774a0d9205fa18a Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 18:26:14 +0100
Subject: [PATCH 13/37] Woops

---
 t5x/configs/dataset/pile/download_all_pile.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index a6d5292ea..787e018e1 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -46,6 +46,7 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base):
     process = subprocess.Popen(['rm', local_path])
     process.wait()
     process = subprocess.Popen(['rm', local_uncompressed_path])
+    process.wait()
 
 def main():
     args = get_args()

From 340ed01357b8a9ade2f5e357bd0d68232b18141e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 6 Nov 2021 18:47:54 +0100
Subject: [PATCH 14/37] Add back download step

---
 t5x/configs/dataset/pile/download_all_pile.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index 787e018e1..c5523d958 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -23,9 +23,9 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base):
     process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)[0]])
     process.wait()
 
-    # # download files
-    # wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path)
-    # process.wait()
+    # download files
+    wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path)
+    process.wait()
 
     # decompress files
     process = subprocess.Popen(['zstd', '-d', local_path],
@@ -68,12 +68,12 @@ def main():
     process = subprocess.Popen(["mkdir", "-p", local_base_dir])
     process.wait()
 
-    pool = Pool(args.procs)
-
+    # pool = Pool(args.procs)
     # pool.map(
     #     functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base),
     #     [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]
     # )
+
     for local_path in [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]:
         download_unztd_and_send_to_gcloud(local_path, local_base_dir=local_base_dir, gcp_base=gcp_base)
 

From 4a58865df9b58e4d9e779b855935235384379b4e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 11:45:32 +0100
Subject: [PATCH 15/37] Cache seqio task

---
 t5x/configs/dataset/pile/download_all_pile.py |  1 -
 t5x/configs/dataset/pile/task.py              | 25 +++++++------------
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py
index c5523d958..2851d7e50 100644
--- a/t5x/configs/dataset/pile/download_all_pile.py
+++ b/t5x/configs/dataset/pile/download_all_pile.py
@@ -73,7 +73,6 @@ def main():
     #     functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base),
     #     [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]
     # )
-
     for local_path in [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]:
         download_unztd_and_send_to_gcloud(local_path, local_base_dir=local_base_dir, gcp_base=gcp_base)
 
diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py
index c61b18267..64cb9f353 100644
--- a/t5x/configs/dataset/pile/task.py
+++ b/t5x/configs/dataset/pile/task.py
@@ -1,9 +1,8 @@
 import functools
+import sys
 
 import seqio
-from seqio import feature_converters
 from t5.data import preprocessors, utils
-import json as js
 import tensorflow as tf
 
 vocabulary = seqio.SentencePieceVocabulary(
@@ -29,8 +28,12 @@
 }
 
 @utils.map_over_dataset
-def extract_text_from_json(json: str):
-    return tf.py_function(js.loads(json)["text"])
+def extract_text_from_json_tf(json: str):
+    tf.print(json,output_stream=sys.stdout)
+    output = tf.strings.split(json, '{"text": "', maxsplit=1)[1]
+    output = tf.strings.split(output, '", "meta": {', maxsplit=1)[0]
+    tf.print(output,output_stream=sys.stdout)
+    return output
 
 seqio.TaskRegistry.add(
     'pile_t2t_span_corruption',
@@ -38,7 +41,7 @@ def extract_text_from_json(json: str):
         split_to_filepattern=DATASET_SPLITS_TO_FILEPATTERN,
     ),
     preprocessors=[
-        extract_text_from_json,
+        extract_text_from_json_tf,
         functools.partial(
             preprocessors.rekey, key_map={
                 "inputs": None,
@@ -53,14 +56,13 @@ def extract_text_from_json(json: str):
     metric_fns=[]
 )
 
-# Prefix language modeling pretraining task used in Raffel et al., 2019.
 seqio.TaskRegistry.add(
     "pile_t2t_prefix_lm",
     source=seqio.TextLineDataSource(
         split_to_filepattern=DATASET_SPLITS_TO_FILEPATTERN,
     ),
     preprocessors=[
-        extract_text_from_json,
+        extract_text_from_json_tf,
         functools.partial(
             preprocessors.rekey, key_map={
                 "inputs": None,
@@ -74,12 +76,3 @@ def extract_text_from_json(json: str):
     output_features=DEFAULT_OUTPUT_FEATURES,
     metric_fns=[]
 )
-
-if __name__ == "__main__":
-    task_feature_lengths = {"inputs": 7, "targets": 5}
-    converter = feature_converters.EncDecFeatureConverter(pack=True)
-    seqio.get_dataset(
-        "pile_t2t_span_corruption",
-        task_feature_lengths=task_feature_lengths,
-        feature_converter=converter,
-    )
\ No newline at end of file

From 8995fbfa4404d3325ac6fc1dc584de42e908966a Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 14:50:25 +0100
Subject: [PATCH 16/37] Add script in order to run caching

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 t5x/configs/dataset/pile/run_cache_tasks_main.sh

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
new file mode 100644
index 000000000..a09ed2a08
--- /dev/null
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -0,0 +1,16 @@
+# Need to install seqio
+# gcloud auth application-default login
+
+
+MODULE_IMPORT=t5x.configs.dataset.pile.task
+TASK_NAME=pile_t2t_span_corruption
+JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number
+BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ...
+PROJECT=bigscience
+REGION=europe-west1
+
+seqio_cache_tasks \
+ --module_import=$MODULE_IMPORT \
+ --tasks=${TASK_NAME} \
+ --output_cache_dir=${BUCKET}/cache \
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp"
\ No newline at end of file

From 844c4268e12d2c12c551b09d85c1f96752673eb3 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 15:03:39 +0100
Subject: [PATCH 17/37] Test something out

---
 t5x/configs/dataset/pile/dataflow_setup.py    | 124 ++++++++++++++++++
 .../dataset/pile/run_cache_tasks_main.sh      |   1 +
 2 files changed, 125 insertions(+)
 create mode 100644 t5x/configs/dataset/pile/dataflow_setup.py

diff --git a/t5x/configs/dataset/pile/dataflow_setup.py b/t5x/configs/dataset/pile/dataflow_setup.py
new file mode 100644
index 000000000..4050aca9a
--- /dev/null
+++ b/t5x/configs/dataset/pile/dataflow_setup.py
@@ -0,0 +1,124 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Setup.py module for the workflow's worker utilities.
+All the workflow related code is gathered in a package that will be built as a
+source distribution, staged in the staging area for the workflow being run and
+then installed in the workers when they start running.
+This behavior is triggered by specifying the --setup_file command line option
+when running the workflow for remote execution.
+"""
+
+# pytype: skip-file
+
+import subprocess
+from distutils.command.build import build as _build  # type: ignore
+
+import setuptools
+
+
+# This class handles the pip install mechanism.
+class build(_build):  # pylint: disable=invalid-name
+  """A build command class that will be invoked during package install.
+  The package built using the current setup.py will be staged and later
+  installed in the worker using `pip install package'. This class will be
+  instantiated during install for this specific scenario and will trigger
+  running the custom commands specified.
+  """
+  sub_commands = _build.sub_commands + [('CustomCommands', None)]
+
+
+# Some custom command to run during setup. The command is not essential for this
+# workflow. It is used here as an example. Each command will spawn a child
+# process. Typically, these commands will include steps to install non-Python
+# packages. For instance, to install a C++-based library libjpeg62 the following
+# two commands will have to be added:
+#
+#     ['apt-get', 'update'],
+#     ['apt-get', '--assume-yes', 'install', 'libjpeg62'],
+#
+# First, note that there is no need to use the sudo command because the setup
+# script runs with appropriate access.
+# Second, if apt-get tool is used then the first command needs to be 'apt-get
+# update' so the tool refreshes itself and initializes links to download
+# repositories.  Without this initial step the other apt-get install commands
+# will fail with package not found errors. Note also --assume-yes option which
+# shortcuts the interactive confirmation.
+#
+# Note that in this example custom commands will run after installing required
+# packages. If you have a PyPI package that depends on one of the custom
+# commands, move installation of the dependent package to the list of custom
+# commands, e.g.:
+#
+#     ['pip', 'install', 'my_package'],
+#
+# TODO(BEAM-3237): Output from the custom commands are missing from the logs.
+# The output of custom commands (including failures) will be logged in the
+# worker-startup log.
+CUSTOM_COMMANDS = [
+    ['echo', 'Custom command worked!'],
+    ['pip', 'install', 'seqio']
+]
+
+
+class CustomCommands(setuptools.Command):
+  """A setuptools Command class able to run arbitrary commands."""
+  def initialize_options(self):
+    pass
+
+  def finalize_options(self):
+    pass
+
+  def RunCustomCommand(self, command_list):
+    print('Running command: %s' % command_list)
+    p = subprocess.Popen(
+        command_list,
+        stdin=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT)
+    # Can use communicate(input='y\n'.encode()) if the command run requires
+    # some confirmation.
+    stdout_data, _ = p.communicate()
+    print('Command output: %s' % stdout_data)
+    if p.returncode != 0:
+      raise RuntimeError(
+          'Command %s failed: exit code: %s' % (command_list, p.returncode))
+
+  def run(self):
+    for command in CUSTOM_COMMANDS:
+      self.RunCustomCommand(command)
+
+
+# Configure the required packages and scripts to install.
+# Note that the Python Dataflow containers come with numpy already installed
+# so this dependency will not trigger anything to be installed unless a version
+# restriction is specified.
+REQUIRED_PACKAGES = [
+    'numpy',
+]
+
+setuptools.setup(
+    name='cache_pile',
+    version='0.0.1',
+    description='Cache pile set workflow package.',
+    install_requires=REQUIRED_PACKAGES,
+    packages=setuptools.find_packages(),
+    cmdclass={
+        # Command class instantiated and run during pip install scenarios.
+        'build': build,
+        'CustomCommands': CustomCommands,
+    })
\ No newline at end of file
diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index a09ed2a08..cb9dd9971 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -13,4 +13,5 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
+ --setup_file t5x/configs/dataset/pile/dataflow_setup.py \
  --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp"
\ No newline at end of file

From 3e294d94105f5bc881133290c8b045100113799d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 15:05:00 +0100
Subject: [PATCH 18/37] Test something else

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index cb9dd9971..bff179c35 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -13,5 +13,4 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
- --setup_file t5x/configs/dataset/pile/dataflow_setup.py \
- --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp"
\ No newline at end of file
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file t5x/configs/dataset/pile/dataflow_setup.py"
\ No newline at end of file

From 774e3e786c6df9a8d7293323ab9f453216e815a9 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 15:17:05 +0100
Subject: [PATCH 19/37] Woops

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index bff179c35..d310b3d89 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -13,4 +13,4 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
- --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file t5x/configs/dataset/pile/dataflow_setup.py"
\ No newline at end of file
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/dataflow_setup.py"
\ No newline at end of file

From 964340d7c5379efb69f0c53b3e80d33856babe29 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 15:21:26 +0100
Subject: [PATCH 20/37] Setup file needs to be named setup.py

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh         | 2 +-
 t5x/configs/dataset/pile/{dataflow_setup.py => setup.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename t5x/configs/dataset/pile/{dataflow_setup.py => setup.py} (100%)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index d310b3d89..32e89d806 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -13,4 +13,4 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
- --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/dataflow_setup.py"
\ No newline at end of file
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py"
\ No newline at end of file
diff --git a/t5x/configs/dataset/pile/dataflow_setup.py b/t5x/configs/dataset/pile/setup.py
similarity index 100%
rename from t5x/configs/dataset/pile/dataflow_setup.py
rename to t5x/configs/dataset/pile/setup.py

From e17d454f587f1e0551aedd2edd88bb3d5ff4ef2a Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 15:53:14 +0100
Subject: [PATCH 21/37] Make another package

---
 t5x/configs/dataset/pile/{ => pile}/task.py      | 0
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 5 ++---
 t5x/configs/dataset/pile/setup.py                | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)
 rename t5x/configs/dataset/pile/{ => pile}/task.py (100%)

diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/pile/task.py
similarity index 100%
rename from t5x/configs/dataset/pile/task.py
rename to t5x/configs/dataset/pile/pile/task.py
diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index 32e89d806..eb93f6ab7 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -1,8 +1,7 @@
 # Need to install seqio
 # gcloud auth application-default login
 
-
-MODULE_IMPORT=t5x.configs.dataset.pile.task
+MODULE_IMPORT=pile.task
 TASK_NAME=pile_t2t_span_corruption
 JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number
 BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ...
@@ -13,4 +12,4 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
- --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py"
\ No newline at end of file
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py"
\ No newline at end of file
diff --git a/t5x/configs/dataset/pile/setup.py b/t5x/configs/dataset/pile/setup.py
index 4050aca9a..d6a1f2ec4 100644
--- a/t5x/configs/dataset/pile/setup.py
+++ b/t5x/configs/dataset/pile/setup.py
@@ -112,7 +112,7 @@ def run(self):
 ]
 
 setuptools.setup(
-    name='cache_pile',
+    name='pile',
     version='0.0.1',
     description='Cache pile set workflow package.',
     install_requires=REQUIRED_PACKAGES,

From cc74fb4dbc7ad30b5145ce4843c6a6d7e4676b87 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 15:55:19 +0100
Subject: [PATCH 22/37] Somehow task is not part of pile package

---
 t5x/configs/dataset/pile/pile/__init__.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 t5x/configs/dataset/pile/pile/__init__.py

diff --git a/t5x/configs/dataset/pile/pile/__init__.py b/t5x/configs/dataset/pile/pile/__init__.py
new file mode 100644
index 000000000..7c5666472
--- /dev/null
+++ b/t5x/configs/dataset/pile/pile/__init__.py
@@ -0,0 +1 @@
+import pile.task
\ No newline at end of file

From 7da3b2a9a48843954884439a170a398e17833c6e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 15:56:33 +0100
Subject: [PATCH 23/37] Revert "Make another package"

This reverts commit e17d454f587f1e0551aedd2edd88bb3d5ff4ef2a.
---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 5 +++--
 t5x/configs/dataset/pile/setup.py                | 2 +-
 t5x/configs/dataset/pile/{pile => }/task.py      | 0
 3 files changed, 4 insertions(+), 3 deletions(-)
 rename t5x/configs/dataset/pile/{pile => }/task.py (100%)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index eb93f6ab7..32e89d806 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -1,7 +1,8 @@
 # Need to install seqio
 # gcloud auth application-default login
 
-MODULE_IMPORT=pile.task
+
+MODULE_IMPORT=t5x.configs.dataset.pile.task
 TASK_NAME=pile_t2t_span_corruption
 JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number
 BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ...
@@ -12,4 +13,4 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
- --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py"
\ No newline at end of file
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py"
\ No newline at end of file
diff --git a/t5x/configs/dataset/pile/setup.py b/t5x/configs/dataset/pile/setup.py
index d6a1f2ec4..4050aca9a 100644
--- a/t5x/configs/dataset/pile/setup.py
+++ b/t5x/configs/dataset/pile/setup.py
@@ -112,7 +112,7 @@ def run(self):
 ]
 
 setuptools.setup(
-    name='pile',
+    name='cache_pile',
     version='0.0.1',
     description='Cache pile set workflow package.',
     install_requires=REQUIRED_PACKAGES,
diff --git a/t5x/configs/dataset/pile/pile/task.py b/t5x/configs/dataset/pile/task.py
similarity index 100%
rename from t5x/configs/dataset/pile/pile/task.py
rename to t5x/configs/dataset/pile/task.py

From a9edd8a03b612654595e0c3140251413c2f7c28c Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 16:00:00 +0100
Subject: [PATCH 24/37] Remove __init__

---
 t5x/configs/dataset/pile/pile/__init__.py | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 t5x/configs/dataset/pile/pile/__init__.py

diff --git a/t5x/configs/dataset/pile/pile/__init__.py b/t5x/configs/dataset/pile/pile/__init__.py
deleted file mode 100644
index 7c5666472..000000000
--- a/t5x/configs/dataset/pile/pile/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-import pile.task
\ No newline at end of file

From 385f262d657103159a04c2ac7332dda6ac211e50 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 16:06:57 +0100
Subject: [PATCH 25/37] Rename to pile

---
 t5x/configs/dataset/pile/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/setup.py b/t5x/configs/dataset/pile/setup.py
index 4050aca9a..d6a1f2ec4 100644
--- a/t5x/configs/dataset/pile/setup.py
+++ b/t5x/configs/dataset/pile/setup.py
@@ -112,7 +112,7 @@ def run(self):
 ]
 
 setuptools.setup(
-    name='cache_pile',
+    name='pile',
     version='0.0.1',
     description='Cache pile set workflow package.',
     install_requires=REQUIRED_PACKAGES,

From 0282f7be57208e97583d4233767a3c13592ff83d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 16:10:16 +0100
Subject: [PATCH 26/37] Fix

---
 t5x/configs/dataset/pile/pile/__init__.py   | 0
 t5x/configs/dataset/pile/{ => pile}/task.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 t5x/configs/dataset/pile/pile/__init__.py
 rename t5x/configs/dataset/pile/{ => pile}/task.py (100%)

diff --git a/t5x/configs/dataset/pile/pile/__init__.py b/t5x/configs/dataset/pile/pile/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/pile/task.py
similarity index 100%
rename from t5x/configs/dataset/pile/task.py
rename to t5x/configs/dataset/pile/pile/task.py

From c4cf1466ed3e6341abb5111a322479f8ebc182d4 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 16:34:23 +0100
Subject: [PATCH 27/37] We need to install t5

---
 t5x/configs/dataset/pile/pile.egg-info/PKG-INFO        | 10 ++++++++++
 t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt     |  7 +++++++
 .../dataset/pile/pile.egg-info/dependency_links.txt    |  1 +
 t5x/configs/dataset/pile/pile.egg-info/requires.txt    |  1 +
 t5x/configs/dataset/pile/pile.egg-info/top_level.txt   |  1 +
 t5x/configs/dataset/pile/setup.py                      |  3 ++-
 6 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 t5x/configs/dataset/pile/pile.egg-info/PKG-INFO
 create mode 100644 t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt
 create mode 100644 t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt
 create mode 100644 t5x/configs/dataset/pile/pile.egg-info/requires.txt
 create mode 100644 t5x/configs/dataset/pile/pile.egg-info/top_level.txt

diff --git a/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO b/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO
new file mode 100644
index 000000000..26e6f367a
--- /dev/null
+++ b/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO
@@ -0,0 +1,10 @@
+Metadata-Version: 2.1
+Name: pile
+Version: 0.0.1
+Summary: Cache pile set workflow package.
+Home-page: UNKNOWN
+License: UNKNOWN
+Platform: UNKNOWN
+
+UNKNOWN
+
diff --git a/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt b/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt
new file mode 100644
index 000000000..8dd9e9e70
--- /dev/null
+++ b/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt
@@ -0,0 +1,7 @@
+setup.py
+pile/__init__.py
+pile.egg-info/PKG-INFO
+pile.egg-info/SOURCES.txt
+pile.egg-info/dependency_links.txt
+pile.egg-info/requires.txt
+pile.egg-info/top_level.txt
\ No newline at end of file
diff --git a/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt b/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/t5x/configs/dataset/pile/pile.egg-info/requires.txt b/t5x/configs/dataset/pile/pile.egg-info/requires.txt
new file mode 100644
index 000000000..24ce15ab7
--- /dev/null
+++ b/t5x/configs/dataset/pile/pile.egg-info/requires.txt
@@ -0,0 +1 @@
+numpy
diff --git a/t5x/configs/dataset/pile/pile.egg-info/top_level.txt b/t5x/configs/dataset/pile/pile.egg-info/top_level.txt
new file mode 100644
index 000000000..db718410b
--- /dev/null
+++ b/t5x/configs/dataset/pile/pile.egg-info/top_level.txt
@@ -0,0 +1 @@
+pile
diff --git a/t5x/configs/dataset/pile/setup.py b/t5x/configs/dataset/pile/setup.py
index d6a1f2ec4..e314f7e61 100644
--- a/t5x/configs/dataset/pile/setup.py
+++ b/t5x/configs/dataset/pile/setup.py
@@ -71,7 +71,8 @@ class build(_build):  # pylint: disable=invalid-name
 # worker-startup log.
 CUSTOM_COMMANDS = [
     ['echo', 'Custom command worked!'],
-    ['pip', 'install', 'seqio']
+    ['pip', 'install', 'seqio'],
+    ['pip', 'install', 't5[cache-tasks]']
 ]
 
 

From f3ea974577fce3b422bd595852702d2e4e7bf225 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 16:49:00 +0100
Subject: [PATCH 28/37] No need to commit egg-info

---
 t5x/configs/dataset/pile/pile.egg-info/PKG-INFO        | 10 ----------
 t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt     |  7 -------
 .../dataset/pile/pile.egg-info/dependency_links.txt    |  1 -
 t5x/configs/dataset/pile/pile.egg-info/requires.txt    |  1 -
 t5x/configs/dataset/pile/pile.egg-info/top_level.txt   |  1 -
 5 files changed, 20 deletions(-)
 delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/PKG-INFO
 delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt
 delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt
 delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/requires.txt
 delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/top_level.txt

diff --git a/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO b/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO
deleted file mode 100644
index 26e6f367a..000000000
--- a/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO
+++ /dev/null
@@ -1,10 +0,0 @@
-Metadata-Version: 2.1
-Name: pile
-Version: 0.0.1
-Summary: Cache pile set workflow package.
-Home-page: UNKNOWN
-License: UNKNOWN
-Platform: UNKNOWN
-
-UNKNOWN
-
diff --git a/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt b/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt
deleted file mode 100644
index 8dd9e9e70..000000000
--- a/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-setup.py
-pile/__init__.py
-pile.egg-info/PKG-INFO
-pile.egg-info/SOURCES.txt
-pile.egg-info/dependency_links.txt
-pile.egg-info/requires.txt
-pile.egg-info/top_level.txt
\ No newline at end of file
diff --git a/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt b/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt
deleted file mode 100644
index 8b1378917..000000000
--- a/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/t5x/configs/dataset/pile/pile.egg-info/requires.txt b/t5x/configs/dataset/pile/pile.egg-info/requires.txt
deleted file mode 100644
index 24ce15ab7..000000000
--- a/t5x/configs/dataset/pile/pile.egg-info/requires.txt
+++ /dev/null
@@ -1 +0,0 @@
-numpy
diff --git a/t5x/configs/dataset/pile/pile.egg-info/top_level.txt b/t5x/configs/dataset/pile/pile.egg-info/top_level.txt
deleted file mode 100644
index db718410b..000000000
--- a/t5x/configs/dataset/pile/pile.egg-info/top_level.txt
+++ /dev/null
@@ -1 +0,0 @@
-pile

From a91207a3cb532ea11abbc6dee84a8ebcfecbada8 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 17:03:04 +0100
Subject: [PATCH 29/37] Some more fixing

---
 t5x/configs/dataset/pile/pile/task.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/t5x/configs/dataset/pile/pile/task.py b/t5x/configs/dataset/pile/pile/task.py
index 64cb9f353..60b150a08 100644
--- a/t5x/configs/dataset/pile/pile/task.py
+++ b/t5x/configs/dataset/pile/pile/task.py
@@ -1,6 +1,4 @@
 import functools
-import sys
-
 import seqio
 from t5.data import preprocessors, utils
 import tensorflow as tf
@@ -29,11 +27,9 @@
 
 @utils.map_over_dataset
 def extract_text_from_json_tf(json: str):
-    tf.print(json,output_stream=sys.stdout)
     output = tf.strings.split(json, '{"text": "', maxsplit=1)[1]
     output = tf.strings.split(output, '", "meta": {', maxsplit=1)[0]
-    tf.print(output,output_stream=sys.stdout)
-    return output
+    return {"text": output}
 
 seqio.TaskRegistry.add(
     'pile_t2t_span_corruption',

From 3ced63b516870d0abdaea70985ff8934e7b5bcef Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 17:57:18 +0100
Subject: [PATCH 30/37] Use 32 workers, i.e. the number of files

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index 32e89d806..7e3cec8fb 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -13,4 +13,4 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
- --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py"
\ No newline at end of file
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py,--numWorkers=32,--autoscalingAlgorithm=NONE"
\ No newline at end of file

From 4cd94eaf30d1365906c82cd33ec654fd1004ee2a Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 18:11:35 +0100
Subject: [PATCH 31/37] update script

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index 7e3cec8fb..f119dac13 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -2,7 +2,7 @@
 # gcloud auth application-default login
 
 
-MODULE_IMPORT=t5x.configs.dataset.pile.task
+MODULE_IMPORT=pile.task
 TASK_NAME=pile_t2t_span_corruption
 JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number
 BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ...

From 630143eadaf7caedee296827e73f5428c4ff94df Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 18:18:01 +0100
Subject: [PATCH 32/37] woops

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index f119dac13..116ebae51 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -13,4 +13,4 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
- --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py,--numWorkers=32,--autoscalingAlgorithm=NONE"
\ No newline at end of file
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--numWorkers=32,--autoscalingAlgorithm=NONE"

From 7591d12f3087beeee2f9ac54013aaaaf4efae3e2 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 18:32:51 +0100
Subject: [PATCH 33/37] Woops read wrong doc

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index 116ebae51..48569bd60 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -13,4 +13,4 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
- --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--numWorkers=32,--autoscalingAlgorithm=NONE"
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--num_workers=32,--autoscaling_algorithm=NONE"

From 81316477f21cddf21f8bb3993b8345e6b7687c23 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 19:37:55 +0100
Subject: [PATCH 34/37] Choose machine type

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index 48569bd60..c99cf5724 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -13,4 +13,4 @@ seqio_cache_tasks \
  --module_import=$MODULE_IMPORT \
  --tasks=${TASK_NAME} \
  --output_cache_dir=${BUCKET}/cache \
- --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--num_workers=32,--autoscaling_algorithm=NONE"
+ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--num_workers=32,--autoscaling_algorithm=NONE,--machine_type=n1-highmem-2"

From c85fbb3141d5abdb3a027c1f9d94e9010df908ff Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 8 Nov 2021 19:59:01 +0100
Subject: [PATCH 35/37] Preprocess only the two first files

---
 t5x/configs/dataset/pile/pile/task.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/pile/task.py b/t5x/configs/dataset/pile/pile/task.py
index 60b150a08..b2163f91b 100644
--- a/t5x/configs/dataset/pile/pile/task.py
+++ b/t5x/configs/dataset/pile/pile/task.py
@@ -20,7 +20,9 @@
 
 DATASET_FOLDER="gs://bigscience/pile/raw"
 DATASET_SPLITS_TO_FILEPATTERN={
-    "train": f"{DATASET_FOLDER}/train/*.jsonl",
+    "train": [
+        f"{DATASET_FOLDER}/train/{i:02d}.jsonl" for i in range(2)
+    ],
     "val": f"{DATASET_FOLDER}/val.jsonl",
     "test": f"{DATASET_FOLDER}/test.jsonl"
 }

From 01f9c6d5d8cc9b3a1b219330f5fb43f64ae5d40f Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Tue, 9 Nov 2021 02:10:05 +0100
Subject: [PATCH 36/37] Revert "Preprocess only the two first files"

This reverts commit c85fbb3141d5abdb3a027c1f9d94e9010df908ff.
---
 t5x/configs/dataset/pile/pile/task.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/t5x/configs/dataset/pile/pile/task.py b/t5x/configs/dataset/pile/pile/task.py
index b2163f91b..60b150a08 100644
--- a/t5x/configs/dataset/pile/pile/task.py
+++ b/t5x/configs/dataset/pile/pile/task.py
@@ -20,9 +20,7 @@
 
 DATASET_FOLDER="gs://bigscience/pile/raw"
 DATASET_SPLITS_TO_FILEPATTERN={
-    "train": [
-        f"{DATASET_FOLDER}/train/{i:02d}.jsonl" for i in range(2)
-    ],
+    "train": f"{DATASET_FOLDER}/train/*.jsonl",
     "val": f"{DATASET_FOLDER}/val.jsonl",
     "test": f"{DATASET_FOLDER}/test.jsonl"
 }

From ec3989be6922e443138c8d5beba60166f88049c7 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Tue, 9 Nov 2021 02:15:59 +0100
Subject: [PATCH 37/37] Update path of cache

---
 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
index c99cf5724..e479da942 100644
--- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh
+++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh
@@ -5,7 +5,7 @@
 MODULE_IMPORT=pile.task
 TASK_NAME=pile_t2t_span_corruption
 JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number
-BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ...
+BUCKET=gs://bigscience/seqio_cached_tasks/$TASK_NAME # Don't know if cache needs to be task specific or not ...
 PROJECT=bigscience
 REGION=europe-west1