From 54810cedee7a809fe8393ad5dfba2fe3525cf5a5 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 5 Nov 2021 17:12:21 +0100 Subject: [PATCH 01/37] First draft of how I think seqio works --- t5x/configs/dataset/pile/task.py | 57 +++++++++++++++++++++ t5x/configs/dataset/pile/utils.py | 82 +++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 t5x/configs/dataset/pile/task.py create mode 100644 t5x/configs/dataset/pile/utils.py diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py new file mode 100644 index 000000000..cbf48d534 --- /dev/null +++ b/t5x/configs/dataset/pile/task.py @@ -0,0 +1,57 @@ +import functools +import seqio +from t5.data import preprocessors + +from t5x.configs.dataset.pile.utils import PileDatasetFnCallable + +vocabulary = seqio.SentencePieceVocabulary( + 'gs://t5-data/vocabs/cc_all.32000/sentencepiece.model', extra_ids=100) +output_features = { + 'inputs': seqio.Feature(vocabulary=vocabulary), + 'targets': seqio.Feature(vocabulary=vocabulary) +} + +DEFAULT_OUTPUT_FEATURES = { + "inputs": seqio.Feature( + vocabulary=vocabulary, add_eos=True, + required=False), + "targets": seqio.Feature( + vocabulary=vocabulary, add_eos=True) +} + +seqio.TaskRegistry.add( + 'pile_t2t_span_corruption', + source=seqio.FunctionDataSource(dataset_fn=PileDatasetFnCallable(), splits=["train", "val"]), + preprocessors=[ + functools.partial( + preprocessors.rekey, key_map={ + "inputs": None, + "targets": "text" + }), + seqio.preprocessors.tokenize, + seqio.CacheDatasetPlaceholder(required=True), + preprocessors.span_corruption, + seqio.preprocessors.append_eos_after_trim, + ], + output_features=DEFAULT_OUTPUT_FEATURES, + metric_fns=[] +) + +# Prefix language modeling pretraining task used in Raffel et al., 2019. +seqio.TaskRegistry.add( + "pile_t2t_prefix_lm", + source=seqio.FunctionDataSource(dataset_fn=PileDatasetFnCallable(), splits=["train", "val"]), + preprocessors=[ + functools.partial( + preprocessors.rekey, key_map={ + "inputs": None, + "targets": "text" + }), + seqio.preprocessors.tokenize, + seqio.CacheDatasetPlaceholder(required=True), + preprocessors.prefix_lm, + seqio.preprocessors.append_eos_after_trim, + ], + output_features=DEFAULT_OUTPUT_FEATURES, + metric_fns=[] +) \ No newline at end of file diff --git a/t5x/configs/dataset/pile/utils.py b/t5x/configs/dataset/pile/utils.py new file mode 100644 index 000000000..1e85af57c --- /dev/null +++ b/t5x/configs/dataset/pile/utils.py @@ -0,0 +1,82 @@ +from pathlib import Path +from typing import Optional + +import seqio +from datasets import load_dataset +import tensorflow as tf + +def load_from_local(dataset_dir: Path): + dataset_list = { + "train": [ + "train/00.jsonl", + "train/01.jsonl", + "train/02.jsonl", + "train/03.jsonl", + "train/04.jsonl", + "train/05.jsonl", + "train/06.jsonl", + "train/07.jsonl", + "train/08.jsonl", + "train/09.jsonl", + "train/10.jsonl", + "train/11.jsonl", + "train/12.jsonl", + "train/13.jsonl", + "train/14.jsonl", + "train/15.jsonl", + "train/16.jsonl", + "train/17.jsonl", + "train/18.jsonl", + "train/19.jsonl", + "train/20.jsonl", + "train/21.jsonl", + "train/22.jsonl", + "train/23.jsonl", + "train/24.jsonl", + "train/25.jsonl", + "train/26.jsonl", + "train/27.jsonl", + "train/28.jsonl", + "train/29.jsonl" + ], + "test": [ + "test.jsonl" + ], + "val": [ + "val.jsonl" + ], + } + + for split_name, filepaths in dataset_list: + load_dataset("json", data_files=[f"{dataset_dir}/{filepath}" for filepath in filepaths], data="text") + +def load_from_urls(): + remote_urls = { + "test": [ + "https://the-eye.eu/public/AI/pile/test.jsonl.zst", + ] + } + + return {split_name: load_dataset("json", data_files=urls, field="text") for split_name, urls in remote_urls} + +class PileDatasetFnCallable(seqio.DatasetFnCallable): + def __init__(self): + self.datasets = load_from_urls() + + def __call__( + self, + split: str, + shuffle_files: bool, + seed: Optional[int] = None + ) -> tf.data.Dataset: + datasets = load_from_urls() + if split not in datasets: + raise ValueError(f"Unrecognized split value, got {split} expected {datasets.keys()}") + + dataset = datasets[split] + return dataset.to_tf_dataset( + columns="text", + batch_size=1000, + shuffle=shuffle_files + ) + From 9a3146cfbcf3a1ffd77df11d7eff7798ce0af21f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 5 Nov 2021 17:44:13 +0100 Subject: [PATCH 02/37] Improve pipeline --- t5x/configs/dataset/pile/task.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py index cbf48d534..89261ded1 100644 --- a/t5x/configs/dataset/pile/task.py +++ b/t5x/configs/dataset/pile/task.py @@ -1,8 +1,7 @@ import functools import seqio -from t5.data import preprocessors - -from t5x.configs.dataset.pile.utils import PileDatasetFnCallable +from t5.data import preprocessors, utils +import json as js vocabulary = seqio.SentencePieceVocabulary( 'gs://t5-data/vocabs/cc_all.32000/sentencepiece.model', extra_ids=100) @@ -19,10 +18,24 @@ vocabulary=vocabulary, add_eos=True) } +DATASET_FOLDER="" +DATASET_SPLITS_TO_FILEPATTERN={ + "train": f"{DATASET_FOLDER}/train/*.jsonl", + "val": f"{DATASET_FOLDER}/val.jsonl", + "test": f"{DATASET_FOLDER}/test.jsonl" +} + +@utils.map_over_dataset +def extract_text_from_json(json: str): + return js.loads(json)["text"] + seqio.TaskRegistry.add( 'pile_t2t_span_corruption', - source=seqio.FunctionDataSource(dataset_fn=PileDatasetFnCallable(), splits=["train", "val"]), + source=seqio.TextLineDataSource( + split_to_filepattern=DATASET_SPLITS_TO_FILEPATTERN, + ), preprocessors=[ + extract_text_from_json, functools.partial( preprocessors.rekey, key_map={ "inputs": None, @@ -40,8 +53,11 @@ # Prefix language modeling pretraining task used in Raffel et al., 2019. seqio.TaskRegistry.add( "pile_t2t_prefix_lm", - source=seqio.FunctionDataSource(dataset_fn=PileDatasetFnCallable(), splits=["train", "val"]), + source=seqio.TextLineDataSource( + split_to_filepattern=DATASET_SPLITS_TO_FILEPATTERN, + ), preprocessors=[ + extract_text_from_json, functools.partial( preprocessors.rekey, key_map={ "inputs": None, From bccaa5faebcb94a5f20bc21ce09c2d45b1590191 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:08:09 +0100 Subject: [PATCH 03/37] Added script to download pile to gcp bucket --- t5x/configs/dataset/pile/download_all_pile.py | 69 ++++++++++++++++ t5x/configs/dataset/pile/task.py | 19 ++++- t5x/configs/dataset/pile/utils.py | 82 ------------------- 3 files changed, 84 insertions(+), 86 deletions(-) create mode 100644 t5x/configs/dataset/pile/download_all_pile.py delete mode 100644 t5x/configs/dataset/pile/utils.py diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py new file mode 100644 index 000000000..c40669afc --- /dev/null +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -0,0 +1,69 @@ +import argparse +import functools +import subprocess +from multiprocessing import Pool + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--procs", type=int, required=True, help="Number of processes." + ) + parser.add_argument( + "--base-dir", type=str, required=True, help="Folder to download the document to" + ) + return parser.parse_args() + + +def download_and_unztd(relative_path, base_dir): + BASE_PILE_URL = "https://the-eye.eu/public/AI/pile" + local_path = f"{base_dir}/{relative_path}" + + # Create folder + process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)]) + process.wait() + + # download files + process = subprocess.Popen(['wget', "-O", local_path , f"{BASE_PILE_URL}/{relative_path}"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + process.wait() + + # decompress files + process = subprocess.Popen(['zstd', '-d', local_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + process.wait() + +def main(): + args = get_args() + + pile_urls = { + "train": [ + f"train/{i:02d}.jsonl.zst" for i in range(30) + ], + "test": [ + f"test.jsonl.zst" + ], + "val": [ + f"val.jsonl.zst" + ] + } + base_dir = args.base_dir + gcp_base = "gs://bigscience/pile/raw" + + process = subprocess.Popen(["mkdir", "-p", base_dir]) + process.wait() + + pool = Pool(args.procs) + + pool.imap( + functools.partial(download_and_unztd, base_dir=base_dir), + [local_path for _, local_paths in pile_urls for local_path in local_paths] + ) + + process = subprocess.Popen(["gsutil", "cp", "-r", base_dir, gcp_base]) + process.wait() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py index 89261ded1..ec3f909c7 100644 --- a/t5x/configs/dataset/pile/task.py +++ b/t5x/configs/dataset/pile/task.py @@ -1,5 +1,7 @@ import functools + import seqio +from seqio import feature_converters from t5.data import preprocessors, utils import json as js @@ -18,7 +20,7 @@ vocabulary=vocabulary, add_eos=True) } -DATASET_FOLDER="" +DATASET_FOLDER="gs://bigscience/pile/raw" DATASET_SPLITS_TO_FILEPATTERN={ "train": f"{DATASET_FOLDER}/train/*.jsonl", "val": f"{DATASET_FOLDER}/val.jsonl", @@ -42,7 +44,7 @@ def extract_text_from_json(json: str): "targets": "text" }), seqio.preprocessors.tokenize, - seqio.CacheDatasetPlaceholder(required=True), + seqio.CacheDatasetPlaceholder(), preprocessors.span_corruption, seqio.preprocessors.append_eos_after_trim, ], @@ -64,10 +66,19 @@ def extract_text_from_json(json: str): "targets": "text" }), seqio.preprocessors.tokenize, - seqio.CacheDatasetPlaceholder(required=True), + seqio.CacheDatasetPlaceholder(), preprocessors.prefix_lm, seqio.preprocessors.append_eos_after_trim, ], output_features=DEFAULT_OUTPUT_FEATURES, metric_fns=[] -) \ No newline at end of file +) + +if __name__ == "__main__": + task_feature_lengths = {"inputs": 7, "targets": 5} + converter = feature_converters.EncDecFeatureConverter(pack=True) + seqio.get_dataset( + "pile_t2t_span_corruption", + task_feature_lengths=task_feature_lengths, + feature_converter=converter, + ) \ No newline at end of file diff --git a/t5x/configs/dataset/pile/utils.py b/t5x/configs/dataset/pile/utils.py deleted file mode 100644 index 1e85af57c..000000000 --- a/t5x/configs/dataset/pile/utils.py +++ /dev/null @@ -1,82 +0,0 @@ -from pathlib import Path -from typing import Optional - -import seqio -from datasets import load_dataset -import tensorflow as tf - -def load_from_local(dataset_dir: Path): - dataset_list = { - "train": [ - "train/00.jsonl", - "train/01.jsonl", - "train/02.jsonl", - "train/03.jsonl", - "train/04.jsonl", - "train/05.jsonl", - "train/06.jsonl", - "train/07.jsonl", - "train/08.jsonl", - "train/09.jsonl", - "train/10.jsonl", - "train/11.jsonl", - "train/12.jsonl", - "train/13.jsonl", - "train/14.jsonl", - "train/15.jsonl", - "train/16.jsonl", - "train/17.jsonl", - "train/18.jsonl", - "train/19.jsonl", - "train/20.jsonl", - "train/21.jsonl", - "train/22.jsonl", - "train/23.jsonl", - "train/24.jsonl", - "train/25.jsonl", - "train/26.jsonl", - "train/27.jsonl", - "train/28.jsonl", - "train/29.jsonl" - ], - "test": [ - "test.jsonl" - ], - "val": [ - "val.jsonl" - ], - } - - for split_name, filepaths in dataset_list: - load_dataset("json", data_files=[f"{dataset_dir}/{filepath}" for filepath in filepaths], data="text") - -def load_from_urls(): - remote_urls = { - "test": [ - "https://the-eye.eu/public/AI/pile/test.jsonl.zst", - ] - } - - return {split_name: load_dataset("json", data_files=urls, field="text") for split_name, urls in remote_urls} - -class PileDatasetFnCallable(seqio.DatasetFnCallable): - def __init__(self): - self.datasets = load_from_urls() - - def __call__( - self, - split: str, - shuffle_files: bool, - seed: Optional[int] = None - ) -> tf.data.Dataset: - datasets = load_from_urls() - if split not in datasets: - raise ValueError(f"Unrecognized split value, got {split} expected {datasets.keys()}") - - dataset = datasets[split] - return dataset.to_tf_dataset( - columns="text", - batch_size=1000, - shuffle=shuffle_files - ) - From ff7a387025caee7f267aea54e79808ed331b9e8a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:17:31 +0100 Subject: [PATCH 04/37] I need to send to remove file progressively as I don't have access to enough disk memory --- t5x/configs/dataset/pile/download_all_pile.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index c40669afc..b7af54be0 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -10,14 +10,14 @@ def get_args(): "--procs", type=int, required=True, help="Number of processes." ) parser.add_argument( - "--base-dir", type=str, required=True, help="Folder to download the document to" + "--local-base-dir", type=str, required=True, help="Folder to download the document to" ) return parser.parse_args() -def download_and_unztd(relative_path, base_dir): +def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base): BASE_PILE_URL = "https://the-eye.eu/public/AI/pile" - local_path = f"{base_dir}/{relative_path}" + local_path = f"{local_base_dir}/{relative_path}" # Create folder process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)]) @@ -35,6 +35,14 @@ def download_and_unztd(relative_path, base_dir): stderr=subprocess.PIPE) process.wait() + # upload to gcp + process = subprocess.Popen(["gsutil", "cp", "-r", local_path, f"{gcp_base}/{relative_path}"]) + process.wait() + + # delete file locally + process = subprocess.Popen(['rm', local_path]) + process.wait() + def main(): args = get_args() @@ -49,7 +57,7 @@ def main(): f"val.jsonl.zst" ] } - base_dir = args.base_dir + local_base_dir = args.base_dir gcp_base = "gs://bigscience/pile/raw" process = subprocess.Popen(["mkdir", "-p", base_dir]) @@ -58,12 +66,9 @@ def main(): pool = Pool(args.procs) pool.imap( - functools.partial(download_and_unztd, base_dir=base_dir), + functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base), [local_path for _, local_paths in pile_urls for local_path in local_paths] ) - process = subprocess.Popen(["gsutil", "cp", "-r", base_dir, gcp_base]) - process.wait() - if __name__ == "__main__": main() \ No newline at end of file From adbc5a9288023e315da2b5521882d4f3f50ec0c5 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:19:36 +0100 Subject: [PATCH 05/37] Woops --- t5x/configs/dataset/pile/download_all_pile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index b7af54be0..47b90bd4e 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -57,10 +57,10 @@ def main(): f"val.jsonl.zst" ] } - local_base_dir = args.base_dir + local_base_dir = args.local_base_dir gcp_base = "gs://bigscience/pile/raw" - process = subprocess.Popen(["mkdir", "-p", base_dir]) + process = subprocess.Popen(["mkdir", "-p", local_base_dir]) process.wait() pool = Pool(args.procs) From 4cb7ae8a85321d8846a45007ec2b694cbe6f1a6e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:20:02 +0100 Subject: [PATCH 06/37] Woops2 --- t5x/configs/dataset/pile/download_all_pile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index 47b90bd4e..256782818 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -67,7 +67,7 @@ def main(): pool.imap( functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base), - [local_path for _, local_paths in pile_urls for local_path in local_paths] + [local_path for _, local_paths in pile_urls.items() for local_path in local_paths] ) if __name__ == "__main__": From 1096de7549704fd4dde4b1a067fa16cf680973ac Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:20:42 +0100 Subject: [PATCH 07/37] convert o map instead --- t5x/configs/dataset/pile/download_all_pile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index 256782818..647112258 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -65,7 +65,7 @@ def main(): pool = Pool(args.procs) - pool.imap( + pool.map( functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base), [local_path for _, local_paths in pile_urls.items() for local_path in local_paths] ) From f53f21b0c81064612b96d26ec5552e9237e12b98 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:22:49 +0100 Subject: [PATCH 08/37] Woops 3 --- t5x/configs/dataset/pile/download_all_pile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index 647112258..56f052b97 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -20,7 +20,7 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base): local_path = f"{local_base_dir}/{relative_path}" # Create folder - process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)]) + process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)[0]]) process.wait() # download files From a1f6e0af43129901844bdaef0c52863e717a6cc7 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:39:25 +0100 Subject: [PATCH 09/37] Make it sequential --- t5x/configs/dataset/pile/download_all_pile.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index 56f052b97..7ec8474f7 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -2,7 +2,7 @@ import functools import subprocess from multiprocessing import Pool - +import wget def get_args(): parser = argparse.ArgumentParser() @@ -24,9 +24,7 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base): process.wait() # download files - process = subprocess.Popen(['wget', "-O", local_path , f"{BASE_PILE_URL}/{relative_path}"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path) process.wait() # decompress files @@ -65,10 +63,12 @@ def main(): pool = Pool(args.procs) - pool.map( - functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base), - [local_path for _, local_paths in pile_urls.items() for local_path in local_paths] - ) + # pool.map( + # functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base), + # [local_path for _, local_paths in pile_urls.items() for local_path in local_paths] + # ) + for local_path in [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]: + download_unztd_and_send_to_gcloud(local_path, local_base_dir=local_base_dir, gcp_base=gcp_base) if __name__ == "__main__": main() \ No newline at end of file From a9d69f7331790d874091c1e0dbce293b03727fdc Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:56:53 +0100 Subject: [PATCH 10/37] Remove recursive option --- t5x/configs/dataset/pile/download_all_pile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index 7ec8474f7..afbd6000b 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -34,7 +34,7 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base): process.wait() # upload to gcp - process = subprocess.Popen(["gsutil", "cp", "-r", local_path, f"{gcp_base}/{relative_path}"]) + process = subprocess.Popen(["gsutil", "cp", local_path, f"{gcp_base}/{relative_path}"]) process.wait() # delete file locally From 68255a1d09cece029618757349dfa1e82cb8a396 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 17:59:52 +0100 Subject: [PATCH 11/37] Update script to remove uncompressed file as well --- t5x/configs/dataset/pile/download_all_pile.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index afbd6000b..ae054a1f8 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -33,13 +33,18 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base): stderr=subprocess.PIPE) process.wait() + assert local_path.endswith(".zst") + local_uncompressed_path = local_path.removesuffix(".zst") + gcp_uncompressed_path = f"{gcp_base}/{relative_path.removesuffix('.zst')}" + # upload to gcp - process = subprocess.Popen(["gsutil", "cp", local_path, f"{gcp_base}/{relative_path}"]) + process = subprocess.Popen(["gsutil", "cp", local_uncompressed_path, gcp_uncompressed_path]) process.wait() # delete file locally process = subprocess.Popen(['rm', local_path]) process.wait() + process = subprocess.Popen(['rm', local_uncompressed_path]) def main(): args = get_args() From 744cc3585d3f0687cf7fb3f3c3208a184fe971c1 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 18:25:18 +0100 Subject: [PATCH 12/37] Test out the rest of the script --- t5x/configs/dataset/pile/download_all_pile.py | 11 ++++++----- t5x/configs/dataset/pile/task.py | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index ae054a1f8..a6d5292ea 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -23,9 +23,9 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base): process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)[0]]) process.wait() - # download files - wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path) - process.wait() + # # download files + # wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path) + # process.wait() # decompress files process = subprocess.Popen(['zstd', '-d', local_path], @@ -34,8 +34,9 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base): process.wait() assert local_path.endswith(".zst") - local_uncompressed_path = local_path.removesuffix(".zst") - gcp_uncompressed_path = f"{gcp_base}/{relative_path.removesuffix('.zst')}" + local_uncompressed_path = local_path[:-4] + assert relative_path.endswith(".zst") + gcp_uncompressed_path = f"{gcp_base}/{relative_path[:-4]}" # upload to gcp process = subprocess.Popen(["gsutil", "cp", local_uncompressed_path, gcp_uncompressed_path]) diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py index ec3f909c7..c61b18267 100644 --- a/t5x/configs/dataset/pile/task.py +++ b/t5x/configs/dataset/pile/task.py @@ -4,6 +4,7 @@ from seqio import feature_converters from t5.data import preprocessors, utils import json as js +import tensorflow as tf vocabulary = seqio.SentencePieceVocabulary( 'gs://t5-data/vocabs/cc_all.32000/sentencepiece.model', extra_ids=100) @@ -29,7 +30,7 @@ @utils.map_over_dataset def extract_text_from_json(json: str): - return js.loads(json)["text"] + return tf.py_function(js.loads(json)["text"]) seqio.TaskRegistry.add( 'pile_t2t_span_corruption', From e4502a66df27ae609fc8506f4774a0d9205fa18a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 18:26:14 +0100 Subject: [PATCH 13/37] Woops --- t5x/configs/dataset/pile/download_all_pile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index a6d5292ea..787e018e1 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -46,6 +46,7 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base): process = subprocess.Popen(['rm', local_path]) process.wait() process = subprocess.Popen(['rm', local_uncompressed_path]) + process.wait() def main(): args = get_args() From 340ed01357b8a9ade2f5e357bd0d68232b18141e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 6 Nov 2021 18:47:54 +0100 Subject: [PATCH 14/37] Add back download step --- t5x/configs/dataset/pile/download_all_pile.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index 787e018e1..c5523d958 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -23,9 +23,9 @@ def download_unztd_and_send_to_gcloud(relative_path, local_base_dir, gcp_base): process = subprocess.Popen(["mkdir", "-p", local_path.rsplit("/", 1)[0]]) process.wait() - # # download files - # wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path) - # process.wait() + # download files + wget.download(f"{BASE_PILE_URL}/{relative_path}", local_path) + process.wait() # decompress files process = subprocess.Popen(['zstd', '-d', local_path], @@ -68,12 +68,12 @@ def main(): process = subprocess.Popen(["mkdir", "-p", local_base_dir]) process.wait() - pool = Pool(args.procs) - + # pool = Pool(args.procs) # pool.map( # functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base), # [local_path for _, local_paths in pile_urls.items() for local_path in local_paths] # ) + for local_path in [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]: download_unztd_and_send_to_gcloud(local_path, local_base_dir=local_base_dir, gcp_base=gcp_base) From 4a58865df9b58e4d9e779b855935235384379b4e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 11:45:32 +0100 Subject: [PATCH 15/37] Cache seqio task --- t5x/configs/dataset/pile/download_all_pile.py | 1 - t5x/configs/dataset/pile/task.py | 25 +++++++------------ 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/t5x/configs/dataset/pile/download_all_pile.py b/t5x/configs/dataset/pile/download_all_pile.py index c5523d958..2851d7e50 100644 --- a/t5x/configs/dataset/pile/download_all_pile.py +++ b/t5x/configs/dataset/pile/download_all_pile.py @@ -73,7 +73,6 @@ def main(): # functools.partial(download_unztd_and_send_to_gcloud, local_base_dir=local_base_dir, gcp_base=gcp_base), # [local_path for _, local_paths in pile_urls.items() for local_path in local_paths] # ) - for local_path in [local_path for _, local_paths in pile_urls.items() for local_path in local_paths]: download_unztd_and_send_to_gcloud(local_path, local_base_dir=local_base_dir, gcp_base=gcp_base) diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/task.py index c61b18267..64cb9f353 100644 --- a/t5x/configs/dataset/pile/task.py +++ b/t5x/configs/dataset/pile/task.py @@ -1,9 +1,8 @@ import functools +import sys import seqio -from seqio import feature_converters from t5.data import preprocessors, utils -import json as js import tensorflow as tf vocabulary = seqio.SentencePieceVocabulary( @@ -29,8 +28,12 @@ } @utils.map_over_dataset -def extract_text_from_json(json: str): - return tf.py_function(js.loads(json)["text"]) +def extract_text_from_json_tf(json: str): + tf.print(json,output_stream=sys.stdout) + output = tf.strings.split(json, '{"text": "', maxsplit=1)[1] + output = tf.strings.split(output, '", "meta": {', maxsplit=1)[0] + tf.print(output,output_stream=sys.stdout) + return output seqio.TaskRegistry.add( 'pile_t2t_span_corruption', @@ -38,7 +41,7 @@ def extract_text_from_json(json: str): split_to_filepattern=DATASET_SPLITS_TO_FILEPATTERN, ), preprocessors=[ - extract_text_from_json, + extract_text_from_json_tf, functools.partial( preprocessors.rekey, key_map={ "inputs": None, @@ -53,14 +56,13 @@ def extract_text_from_json(json: str): metric_fns=[] ) -# Prefix language modeling pretraining task used in Raffel et al., 2019. seqio.TaskRegistry.add( "pile_t2t_prefix_lm", source=seqio.TextLineDataSource( split_to_filepattern=DATASET_SPLITS_TO_FILEPATTERN, ), preprocessors=[ - extract_text_from_json, + extract_text_from_json_tf, functools.partial( preprocessors.rekey, key_map={ "inputs": None, @@ -74,12 +76,3 @@ def extract_text_from_json(json: str): output_features=DEFAULT_OUTPUT_FEATURES, metric_fns=[] ) - -if __name__ == "__main__": - task_feature_lengths = {"inputs": 7, "targets": 5} - converter = feature_converters.EncDecFeatureConverter(pack=True) - seqio.get_dataset( - "pile_t2t_span_corruption", - task_feature_lengths=task_feature_lengths, - feature_converter=converter, - ) \ No newline at end of file From 8995fbfa4404d3325ac6fc1dc584de42e908966a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 14:50:25 +0100 Subject: [PATCH 16/37] Add script in order to run caching --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 t5x/configs/dataset/pile/run_cache_tasks_main.sh diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh new file mode 100644 index 000000000..a09ed2a08 --- /dev/null +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -0,0 +1,16 @@ +# Need to install seqio +# gcloud auth application-default login + + +MODULE_IMPORT=t5x.configs.dataset.pile.task +TASK_NAME=pile_t2t_span_corruption +JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number +BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ... +PROJECT=bigscience +REGION=europe-west1 + +seqio_cache_tasks \ + --module_import=$MODULE_IMPORT \ + --tasks=${TASK_NAME} \ + --output_cache_dir=${BUCKET}/cache \ + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp" \ No newline at end of file From 844c4268e12d2c12c551b09d85c1f96752673eb3 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:03:39 +0100 Subject: [PATCH 17/37] Test something out --- t5x/configs/dataset/pile/dataflow_setup.py | 124 ++++++++++++++++++ .../dataset/pile/run_cache_tasks_main.sh | 1 + 2 files changed, 125 insertions(+) create mode 100644 t5x/configs/dataset/pile/dataflow_setup.py diff --git a/t5x/configs/dataset/pile/dataflow_setup.py b/t5x/configs/dataset/pile/dataflow_setup.py new file mode 100644 index 000000000..4050aca9a --- /dev/null +++ b/t5x/configs/dataset/pile/dataflow_setup.py @@ -0,0 +1,124 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Setup.py module for the workflow's worker utilities. +All the workflow related code is gathered in a package that will be built as a +source distribution, staged in the staging area for the workflow being run and +then installed in the workers when they start running. +This behavior is triggered by specifying the --setup_file command line option +when running the workflow for remote execution. +""" + +# pytype: skip-file + +import subprocess +from distutils.command.build import build as _build # type: ignore + +import setuptools + + +# This class handles the pip install mechanism. +class build(_build): # pylint: disable=invalid-name + """A build command class that will be invoked during package install. + The package built using the current setup.py will be staged and later + installed in the worker using `pip install package'. This class will be + instantiated during install for this specific scenario and will trigger + running the custom commands specified. + """ + sub_commands = _build.sub_commands + [('CustomCommands', None)] + + +# Some custom command to run during setup. The command is not essential for this +# workflow. It is used here as an example. Each command will spawn a child +# process. Typically, these commands will include steps to install non-Python +# packages. For instance, to install a C++-based library libjpeg62 the following +# two commands will have to be added: +# +# ['apt-get', 'update'], +# ['apt-get', '--assume-yes', 'install', 'libjpeg62'], +# +# First, note that there is no need to use the sudo command because the setup +# script runs with appropriate access. +# Second, if apt-get tool is used then the first command needs to be 'apt-get +# update' so the tool refreshes itself and initializes links to download +# repositories. Without this initial step the other apt-get install commands +# will fail with package not found errors. Note also --assume-yes option which +# shortcuts the interactive confirmation. +# +# Note that in this example custom commands will run after installing required +# packages. If you have a PyPI package that depends on one of the custom +# commands, move installation of the dependent package to the list of custom +# commands, e.g.: +# +# ['pip', 'install', 'my_package'], +# +# TODO(BEAM-3237): Output from the custom commands are missing from the logs. +# The output of custom commands (including failures) will be logged in the +# worker-startup log. +CUSTOM_COMMANDS = [ + ['echo', 'Custom command worked!'], + ['pip', 'install', 'seqio'] +] + + +class CustomCommands(setuptools.Command): + """A setuptools Command class able to run arbitrary commands.""" + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def RunCustomCommand(self, command_list): + print('Running command: %s' % command_list) + p = subprocess.Popen( + command_list, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + # Can use communicate(input='y\n'.encode()) if the command run requires + # some confirmation. + stdout_data, _ = p.communicate() + print('Command output: %s' % stdout_data) + if p.returncode != 0: + raise RuntimeError( + 'Command %s failed: exit code: %s' % (command_list, p.returncode)) + + def run(self): + for command in CUSTOM_COMMANDS: + self.RunCustomCommand(command) + + +# Configure the required packages and scripts to install. +# Note that the Python Dataflow containers come with numpy already installed +# so this dependency will not trigger anything to be installed unless a version +# restriction is specified. +REQUIRED_PACKAGES = [ + 'numpy', +] + +setuptools.setup( + name='cache_pile', + version='0.0.1', + description='Cache pile set workflow package.', + install_requires=REQUIRED_PACKAGES, + packages=setuptools.find_packages(), + cmdclass={ + # Command class instantiated and run during pip install scenarios. + 'build': build, + 'CustomCommands': CustomCommands, + }) \ No newline at end of file diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index a09ed2a08..cb9dd9971 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -13,4 +13,5 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ + --setup_file t5x/configs/dataset/pile/dataflow_setup.py \ --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp" \ No newline at end of file From 3e294d94105f5bc881133290c8b045100113799d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:05:00 +0100 Subject: [PATCH 18/37] test something else: --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index cb9dd9971..bff179c35 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -13,5 +13,4 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ - --setup_file t5x/configs/dataset/pile/dataflow_setup.py \ - --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp" \ No newline at end of file + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file t5x/configs/dataset/pile/dataflow_setup.py" \ No newline at end of file From 774e3e786c6df9a8d7293323ab9f453216e815a9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:17:05 +0100 Subject: [PATCH 19/37] Woops --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index bff179c35..d310b3d89 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -13,4 +13,4 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ - --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file t5x/configs/dataset/pile/dataflow_setup.py" \ No newline at end of file + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/dataflow_setup.py" \ No newline at end of file From 964340d7c5379efb69f0c53b3e80d33856babe29 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:21:26 +0100 Subject: [PATCH 20/37] Setup file needs to be names setup.py --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +- t5x/configs/dataset/pile/{dataflow_setup.py => setup.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename t5x/configs/dataset/pile/{dataflow_setup.py => setup.py} (100%) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index d310b3d89..32e89d806 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -13,4 +13,4 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ - --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/dataflow_setup.py" \ No newline at end of file + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py" \ No newline at end of file diff --git a/t5x/configs/dataset/pile/dataflow_setup.py b/t5x/configs/dataset/pile/setup.py similarity index 100% rename from t5x/configs/dataset/pile/dataflow_setup.py rename to t5x/configs/dataset/pile/setup.py From e17d454f587f1e0551aedd2edd88bb3d5ff4ef2a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:53:14 +0100 Subject: [PATCH 21/37] Make a another package --- t5x/configs/dataset/pile/{ => pile}/task.py | 0 t5x/configs/dataset/pile/run_cache_tasks_main.sh | 5 ++--- t5x/configs/dataset/pile/setup.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) rename t5x/configs/dataset/pile/{ => pile}/task.py (100%) diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/pile/task.py similarity index 100% rename from t5x/configs/dataset/pile/task.py rename to t5x/configs/dataset/pile/pile/task.py diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index 32e89d806..eb93f6ab7 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -1,8 +1,7 @@ # Need to install seqio # gcloud auth application-default login - -MODULE_IMPORT=t5x.configs.dataset.pile.task +MODULE_IMPORT=pile.task TASK_NAME=pile_t2t_span_corruption JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ... @@ -13,4 +12,4 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ - --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py" \ No newline at end of file + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py" \ No newline at end of file diff --git a/t5x/configs/dataset/pile/setup.py b/t5x/configs/dataset/pile/setup.py index 4050aca9a..d6a1f2ec4 100644 --- a/t5x/configs/dataset/pile/setup.py +++ b/t5x/configs/dataset/pile/setup.py @@ -112,7 +112,7 @@ def run(self): ] setuptools.setup( - name='cache_pile', + name='pile', version='0.0.1', description='Cache pile set workflow package.', install_requires=REQUIRED_PACKAGES, From cc74fb4dbc7ad30b5145ce4843c6a6d7e4676b87 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:55:19 +0100 Subject: [PATCH 22/37] Somehow task is not part of pile package --- t5x/configs/dataset/pile/pile/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 t5x/configs/dataset/pile/pile/__init__.py diff --git a/t5x/configs/dataset/pile/pile/__init__.py b/t5x/configs/dataset/pile/pile/__init__.py new file mode 100644 index 000000000..7c5666472 --- /dev/null +++ b/t5x/configs/dataset/pile/pile/__init__.py @@ -0,0 +1 @@ +import pile.task \ No newline at end of file From 7da3b2a9a48843954884439a170a398e17833c6e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:56:33 +0100 Subject: [PATCH 23/37] Revert "Make a another package" This reverts commit e17d454f587f1e0551aedd2edd88bb3d5ff4ef2a. --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 5 +++-- t5x/configs/dataset/pile/setup.py | 2 +- t5x/configs/dataset/pile/{pile => }/task.py | 0 3 files changed, 4 insertions(+), 3 deletions(-) rename t5x/configs/dataset/pile/{pile => }/task.py (100%) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index eb93f6ab7..32e89d806 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -1,7 +1,8 @@ # Need to install seqio # gcloud auth application-default login -MODULE_IMPORT=pile.task + +MODULE_IMPORT=t5x.configs.dataset.pile.task TASK_NAME=pile_t2t_span_corruption JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ... @@ -12,4 +13,4 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ - --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py" \ No newline at end of file + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py" \ No newline at end of file diff --git a/t5x/configs/dataset/pile/setup.py b/t5x/configs/dataset/pile/setup.py index d6a1f2ec4..4050aca9a 100644 --- a/t5x/configs/dataset/pile/setup.py +++ b/t5x/configs/dataset/pile/setup.py @@ -112,7 +112,7 @@ def run(self): ] setuptools.setup( - name='pile', + name='cache_pile', version='0.0.1', description='Cache pile set workflow package.', install_requires=REQUIRED_PACKAGES, diff --git a/t5x/configs/dataset/pile/pile/task.py b/t5x/configs/dataset/pile/task.py similarity index 100% rename from t5x/configs/dataset/pile/pile/task.py rename to t5x/configs/dataset/pile/task.py From a9edd8a03b612654595e0c3140251413c2f7c28c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 16:00:00 +0100 Subject: [PATCH 24/37] Remove __init__ --- t5x/configs/dataset/pile/pile/__init__.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 t5x/configs/dataset/pile/pile/__init__.py diff --git a/t5x/configs/dataset/pile/pile/__init__.py b/t5x/configs/dataset/pile/pile/__init__.py deleted file mode 100644 index 7c5666472..000000000 --- a/t5x/configs/dataset/pile/pile/__init__.py +++ /dev/null @@ -1 +0,0 @@ -import pile.task \ No newline at end of file From 385f262d657103159a04c2ac7332dda6ac211e50 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 16:06:57 +0100 Subject: [PATCH 25/37] Rename to pile --- t5x/configs/dataset/pile/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/setup.py b/t5x/configs/dataset/pile/setup.py index 4050aca9a..d6a1f2ec4 100644 --- a/t5x/configs/dataset/pile/setup.py +++ b/t5x/configs/dataset/pile/setup.py @@ -112,7 +112,7 @@ def run(self): ] setuptools.setup( - name='cache_pile', + name='pile', version='0.0.1', description='Cache pile set workflow package.', install_requires=REQUIRED_PACKAGES, From 0282f7be57208e97583d4233767a3c13592ff83d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 16:10:16 +0100 Subject: [PATCH 26/37] Fix --- t5x/configs/dataset/pile/pile/__init__.py | 0 t5x/configs/dataset/pile/{ => pile}/task.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 t5x/configs/dataset/pile/pile/__init__.py rename t5x/configs/dataset/pile/{ => pile}/task.py (100%) diff --git a/t5x/configs/dataset/pile/pile/__init__.py b/t5x/configs/dataset/pile/pile/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/t5x/configs/dataset/pile/task.py b/t5x/configs/dataset/pile/pile/task.py similarity index 100% rename from t5x/configs/dataset/pile/task.py rename to t5x/configs/dataset/pile/pile/task.py From c4cf1466ed3e6341abb5111a322479f8ebc182d4 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 16:34:23 +0100 Subject: [PATCH 27/37] We need to install t5 --- t5x/configs/dataset/pile/pile.egg-info/PKG-INFO | 10 ++++++++++ t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt | 7 +++++++ .../dataset/pile/pile.egg-info/dependency_links.txt | 1 + t5x/configs/dataset/pile/pile.egg-info/requires.txt | 1 + t5x/configs/dataset/pile/pile.egg-info/top_level.txt | 1 + t5x/configs/dataset/pile/setup.py | 3 ++- 6 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 t5x/configs/dataset/pile/pile.egg-info/PKG-INFO create mode 100644 t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt create mode 100644 t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt create mode 100644 t5x/configs/dataset/pile/pile.egg-info/requires.txt create mode 100644 t5x/configs/dataset/pile/pile.egg-info/top_level.txt diff --git a/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO b/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO new file mode 100644 index 000000000..26e6f367a --- /dev/null +++ b/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO @@ -0,0 +1,10 @@ +Metadata-Version: 2.1 +Name: pile +Version: 0.0.1 +Summary: Cache pile set workflow package. +Home-page: UNKNOWN +License: UNKNOWN +Platform: UNKNOWN + +UNKNOWN + diff --git a/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt b/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt new file mode 100644 index 000000000..8dd9e9e70 --- /dev/null +++ b/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt @@ -0,0 +1,7 @@ +setup.py +pile/__init__.py +pile.egg-info/PKG-INFO +pile.egg-info/SOURCES.txt +pile.egg-info/dependency_links.txt +pile.egg-info/requires.txt +pile.egg-info/top_level.txt \ No newline at end of file diff --git a/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt b/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/t5x/configs/dataset/pile/pile.egg-info/requires.txt b/t5x/configs/dataset/pile/pile.egg-info/requires.txt new file mode 100644 index 000000000..24ce15ab7 --- /dev/null +++ b/t5x/configs/dataset/pile/pile.egg-info/requires.txt @@ -0,0 +1 @@ +numpy diff --git a/t5x/configs/dataset/pile/pile.egg-info/top_level.txt b/t5x/configs/dataset/pile/pile.egg-info/top_level.txt new file mode 100644 index 000000000..db718410b --- /dev/null +++ b/t5x/configs/dataset/pile/pile.egg-info/top_level.txt @@ -0,0 +1 @@ +pile diff --git a/t5x/configs/dataset/pile/setup.py b/t5x/configs/dataset/pile/setup.py index d6a1f2ec4..e314f7e61 100644 --- a/t5x/configs/dataset/pile/setup.py +++ b/t5x/configs/dataset/pile/setup.py @@ -71,7 +71,8 @@ class build(_build): # pylint: disable=invalid-name # worker-startup log. CUSTOM_COMMANDS = [ ['echo', 'Custom command worked!'], - ['pip', 'install', 'seqio'] + ['pip', 'install', 'seqio'], + ['pip', 'install', 't5[cache-tasks]'] ] From f3ea974577fce3b422bd595852702d2e4e7bf225 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 16:49:00 +0100 Subject: [PATCH 28/37] No need to commit egg-info --- t5x/configs/dataset/pile/pile.egg-info/PKG-INFO | 10 ---------- t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt | 7 ------- .../dataset/pile/pile.egg-info/dependency_links.txt | 1 - t5x/configs/dataset/pile/pile.egg-info/requires.txt | 1 - t5x/configs/dataset/pile/pile.egg-info/top_level.txt | 1 - 5 files changed, 20 deletions(-) delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/PKG-INFO delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/requires.txt delete mode 100644 t5x/configs/dataset/pile/pile.egg-info/top_level.txt diff --git a/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO b/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO deleted file mode 100644 index 26e6f367a..000000000 --- a/t5x/configs/dataset/pile/pile.egg-info/PKG-INFO +++ /dev/null @@ -1,10 +0,0 @@ -Metadata-Version: 2.1 -Name: pile -Version: 0.0.1 -Summary: Cache pile set workflow package. -Home-page: UNKNOWN -License: UNKNOWN -Platform: UNKNOWN - -UNKNOWN - diff --git a/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt b/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt deleted file mode 100644 index 8dd9e9e70..000000000 --- a/t5x/configs/dataset/pile/pile.egg-info/SOURCES.txt +++ /dev/null @@ -1,7 +0,0 @@ -setup.py -pile/__init__.py -pile.egg-info/PKG-INFO -pile.egg-info/SOURCES.txt -pile.egg-info/dependency_links.txt -pile.egg-info/requires.txt -pile.egg-info/top_level.txt \ No newline at end of file diff --git a/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt b/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt deleted file mode 100644 index 8b1378917..000000000 --- a/t5x/configs/dataset/pile/pile.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/t5x/configs/dataset/pile/pile.egg-info/requires.txt b/t5x/configs/dataset/pile/pile.egg-info/requires.txt deleted file mode 100644 index 24ce15ab7..000000000 --- a/t5x/configs/dataset/pile/pile.egg-info/requires.txt +++ /dev/null @@ -1 +0,0 @@ -numpy diff --git a/t5x/configs/dataset/pile/pile.egg-info/top_level.txt b/t5x/configs/dataset/pile/pile.egg-info/top_level.txt deleted file mode 100644 index db718410b..000000000 --- a/t5x/configs/dataset/pile/pile.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -pile From a91207a3cb532ea11abbc6dee84a8ebcfecbada8 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 17:03:04 +0100 Subject: [PATCH 29/37] Some more fixing --- t5x/configs/dataset/pile/pile/task.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/t5x/configs/dataset/pile/pile/task.py b/t5x/configs/dataset/pile/pile/task.py index 64cb9f353..60b150a08 100644 --- a/t5x/configs/dataset/pile/pile/task.py +++ b/t5x/configs/dataset/pile/pile/task.py @@ -1,6 +1,4 @@ import functools -import sys - import seqio from t5.data import preprocessors, utils import tensorflow as tf @@ -29,11 +27,9 @@ @utils.map_over_dataset def extract_text_from_json_tf(json: str): - tf.print(json,output_stream=sys.stdout) output = tf.strings.split(json, '{"text": "', maxsplit=1)[1] output = tf.strings.split(output, '", "meta": {', maxsplit=1)[0] - tf.print(output,output_stream=sys.stdout) - return output + return {"text": output} seqio.TaskRegistry.add( 'pile_t2t_span_corruption', From 3ced63b516870d0abdaea70985ff8934e7b5bcef Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 17:57:18 +0100 Subject: [PATCH 30/37] Use 32 workers, ie the number of files --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index 32e89d806..7e3cec8fb 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -13,4 +13,4 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ - --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py" \ No newline at end of file + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py,--numWorkers=32,--autoscalingAlgorithm=NONE" \ No newline at end of file From 4cd94eaf30d1365906c82cd33ec654fd1004ee2a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 18:11:35 +0100 Subject: [PATCH 31/37] update script --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index 7e3cec8fb..f119dac13 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -2,7 +2,7 @@ # gcloud auth application-default login -MODULE_IMPORT=t5x.configs.dataset.pile.task +MODULE_IMPORT=pile.task TASK_NAME=pile_t2t_span_corruption JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ... From 630143eadaf7caedee296827e73f5428c4ff94df Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 18:18:01 +0100 Subject: [PATCH 32/37] woops --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index f119dac13..116ebae51 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -13,4 +13,4 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ - --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/t5x/configs/dataset/pile/setup.py,--numWorkers=32,--autoscalingAlgorithm=NONE" \ No newline at end of file + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--numWorkers=32,--autoscalingAlgorithm=NONE" From 7591d12f3087beeee2f9ac54013aaaaf4efae3e2 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 18:32:51 +0100 Subject: [PATCH 33/37] Woops read wrong doc --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index 116ebae51..48569bd60 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -13,4 +13,4 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ - --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--numWorkers=32,--autoscalingAlgorithm=NONE" + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--num_workers=32,--autoscaling_algorithm=NONE" From 81316477f21cddf21f8bb3993b8345e6b7687c23 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 19:37:55 +0100 Subject: [PATCH 34/37] Choose machine type --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index 48569bd60..c99cf5724 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -13,4 +13,4 @@ seqio_cache_tasks \ --module_import=$MODULE_IMPORT \ --tasks=${TASK_NAME} \ --output_cache_dir=${BUCKET}/cache \ - --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--num_workers=32,--autoscaling_algorithm=NONE" + --pipeline_options="--runner=DataflowRunner,--project=$PROJECT,--region=$REGION,--job_name=$JOB_NAME,--staging_location=$BUCKET/binaries,--temp_location=$BUCKET/tmp,--setup_file=$PWD/setup.py,--num_workers=32,--autoscaling_algorithm=NONE,--machine_type=n1-highmem-2" From c85fbb3141d5abdb3a027c1f9d94e9010df908ff Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 8 Nov 2021 19:59:01 +0100 Subject: [PATCH 35/37] Preprocess only the two first files --- t5x/configs/dataset/pile/pile/task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/pile/task.py b/t5x/configs/dataset/pile/pile/task.py index 60b150a08..b2163f91b 100644 --- a/t5x/configs/dataset/pile/pile/task.py +++ b/t5x/configs/dataset/pile/pile/task.py @@ -20,7 +20,9 @@ DATASET_FOLDER="gs://bigscience/pile/raw" DATASET_SPLITS_TO_FILEPATTERN={ - "train": f"{DATASET_FOLDER}/train/*.jsonl", + "train": [ + f"{DATASET_FOLDER}/train/{i:02d}.jsonl" for i in range(2) + ], "val": f"{DATASET_FOLDER}/val.jsonl", "test": f"{DATASET_FOLDER}/test.jsonl" } From 01f9c6d5d8cc9b3a1b219330f5fb43f64ae5d40f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 9 Nov 2021 02:10:05 +0100 Subject: [PATCH 36/37] Revert "Preprocess only the two first files" This reverts commit c85fbb3141d5abdb3a027c1f9d94e9010df908ff. --- t5x/configs/dataset/pile/pile/task.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/t5x/configs/dataset/pile/pile/task.py b/t5x/configs/dataset/pile/pile/task.py index b2163f91b..60b150a08 100644 --- a/t5x/configs/dataset/pile/pile/task.py +++ b/t5x/configs/dataset/pile/pile/task.py @@ -20,9 +20,7 @@ DATASET_FOLDER="gs://bigscience/pile/raw" DATASET_SPLITS_TO_FILEPATTERN={ - "train": [ - f"{DATASET_FOLDER}/train/{i:02d}.jsonl" for i in range(2) - ], + "train": f"{DATASET_FOLDER}/train/*.jsonl", "val": f"{DATASET_FOLDER}/val.jsonl", "test": f"{DATASET_FOLDER}/test.jsonl" } From ec3989be6922e443138c8d5beba60166f88049c7 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 9 Nov 2021 02:15:59 +0100 Subject: [PATCH 37/37] Update path of cache --- t5x/configs/dataset/pile/run_cache_tasks_main.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t5x/configs/dataset/pile/run_cache_tasks_main.sh b/t5x/configs/dataset/pile/run_cache_tasks_main.sh index c99cf5724..e479da942 100644 --- a/t5x/configs/dataset/pile/run_cache_tasks_main.sh +++ b/t5x/configs/dataset/pile/run_cache_tasks_main.sh @@ -5,7 +5,7 @@ MODULE_IMPORT=pile.task TASK_NAME=pile_t2t_span_corruption JOB_NAME=pilet2tspancorruption # the name must consist of only the characters [-a-z0-9], starting with a letter and ending with a letter or number -BUCKET=gs://bigscience/pile/$TASK_NAME # Don't know is cache needs to be task specific or not ... +BUCKET=gs://bigscience/seqio_cached_tasks/$TASK_NAME # Don't know is cache needs to be task specific or not ... PROJECT=bigscience REGION=europe-west1