
Commit 09a3a73

TITC and rasbt authored
remove all non-English texts and notice (rasbt#304)
* remove all non-English texts and notice
  1. Almost 18 GB of text remains after filtering with `is_english`.
  2. Remove the Project Gutenberg notice using the gutenberg repo's `strip_headers`.
  3. After re-running `get_data.py`, all data appears to land under the `gutenberg/data/.mirror` folder.
* some improvements
* update readme

---------

Co-authored-by: rasbt <[email protected]>
1 parent f1c3d45 commit 09a3a73
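The "notice" in the commit message is the Project Gutenberg license boilerplate that wraps every downloaded book. The `strip_headers` helper comes from the cloned `gutenberg` repository's cleanup module; below is a rough illustration of its effect on a made-up miniature input (not a file from the actual dataset):

```python
# Illustration only: strip_headers is provided by the cloned gutenberg repo;
# the input below is a made-up miniature of a Project Gutenberg file.
from gutenberg.src.cleanup import strip_headers

raw = (
    "The Project Gutenberg EBook of Example, by A. Author\n"
    "*** START OF THIS PROJECT GUTENBERG EBOOK EXAMPLE ***\n"
    "Actual book text that we want to keep.\n"
    "*** END OF THIS PROJECT GUTENBERG EBOOK EXAMPLE ***\n"
)

# Everything outside the START/END markers is license boilerplate and is removed.
print(strip_headers(raw))
```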

File tree

2 files changed: +24 -5 lines changed


ch05/03_bonus_pretraining_on_gutenberg/README.md

+8 -1
@@ -82,11 +82,18 @@ Next, run the `prepare_dataset.py` script, which concatenates the (as of this wr
 
 ```bash
 python prepare_dataset.py \
-  --data_dir gutenberg/data \
+  --data_dir gutenberg/data/raw \
   --max_size_mb 500 \
   --output_dir gutenberg_preprocessed
 ```
 
+```
+...
+Skipping gutenberg/data/raw/PG29836_raw.txt as it does not contain primarily English text.
+Skipping gutenberg/data/raw/PG16527_raw.txt as it does not contain primarily English text.
+100%|██████████████████████████████████████████████████████████| 57250/57250 [25:04<00:00, 38.05it/s]
+42 file(s) saved in /Users/sebastian/Developer/LLMs-from-scratch/ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
+```
+
 > [!TIP]
 > Note that the produced files are stored in plaintext format and are not pre-tokenized for simplicity. However, you may want to update the code to store the dataset in a pre-tokenized form to save computation time if you are planning to use the dataset more often or train for multiple epochs. See the *Design Decisions and Improvements* at the bottom of this page for more information.
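Regarding the pre-tokenization tip above, here is a minimal sketch of what storing a pre-tokenized copy could look like, assuming the GPT-2 `tiktoken` tokenizer used in the main chapters and a hypothetical `.bin` filename convention (neither is part of this commit):

```python
# Sketch only: this commit stores plaintext; pre-tokenizing is an optional extension.
import numpy as np
import tiktoken

enc = tiktoken.get_encoding("gpt2")

# "combined_1.txt" is a hypothetical output filename used for illustration.
with open("gutenberg_preprocessed/combined_1.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Allow the <|endoftext|> document separator to map to its special token ID.
ids = enc.encode(text, allowed_special={"<|endoftext|>"})

# GPT-2's 50,257-token vocabulary fits into unsigned 16-bit integers.
np.array(ids, dtype=np.uint16).tofile("gutenberg_preprocessed/combined_1.bin")
```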

ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py

+16 -4
@@ -10,6 +10,13 @@
 import argparse
 import os
 import re
+from tqdm import tqdm
+from gutenberg.src.cleanup import strip_headers
+
+
+def is_english(text, threshold=0.9):
+    ascii_chars = sum(1 for c in text if ord(c) < 128)
+    return ascii_chars / len(text) > threshold
 
 
 def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
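The new `is_english` check is a crude ASCII-ratio heuristic rather than real language detection: a text passes if more than 90% of its characters are ASCII. This also filters out heavily accented Latin-script texts, which is in line with the commit's goal. A quick standalone demonstration:

```python
# Standalone demo of the ASCII-ratio heuristic defined above (not part of the diff).
def is_english(text, threshold=0.9):
    ascii_chars = sum(1 for c in text if ord(c) < 128)
    return ascii_chars / len(text) > threshold

print(is_english("The quick brown fox jumps over the lazy dog."))  # True (all ASCII)
print(is_english("Les œuvres complètes de Molière, précédées d'une notice."))  # False (~0.89 ASCII)
print(is_english("Доброе утро, как дела?"))  # False (mostly Cyrillic)
```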
@@ -20,16 +27,21 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
     current_size = 0
     file_counter = 1
 
-    for file_path in file_paths:
+    for file_path in tqdm(file_paths):
         try:
             with open(file_path, "r", encoding="utf-8") as file:
                 content = file.read()
         except UnicodeDecodeError:
             # Attempt to read the file with a fallback encoding
-            print(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
+            tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
             with open(file_path, "r", encoding=fallback_encoding) as file:
                 content = file.read()
 
+        if not is_english(content):
+            tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")
+            continue
+        content = strip_headers(content)
+
         # Regular expression to replace multiple blank lines with a single blank line
         content = re.sub(r'\n\s*\n', '\n\n', content)
         estimated_size = len(content.encode("utf-8"))
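The switch from `print` to `tqdm.write` pairs with wrapping the loop in `tqdm`: `tqdm.write` emits the message on its own line above the active progress bar instead of colliding with it, which is why the sample output shown in the README stays readable. A minimal sketch of the pattern:

```python
# Minimal sketch of the tqdm/tqdm.write pattern used in the loop above.
import time
from tqdm import tqdm

for i in tqdm(range(5)):
    if i == 2:
        # A plain print() here would break the progress-bar line;
        # tqdm.write() prints the message and then redraws the bar.
        tqdm.write("Skipping item 2 (example message).")
    time.sleep(0.1)
```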
@@ -56,7 +68,7 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
 
     parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")
 
-    parser.add_argument("--data_dir", type=str, default="gutenberg/data",
+    parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw",
                         help="Directory containing the downloaded raw training data")
     parser.add_argument("--max_size_mb", type=int, default=500,
                         help="The maximum file size for each concatenated file in megabytes")
@@ -66,7 +78,7 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
     args = parser.parse_args()
 
     all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(args.data_dir)
-                 for name in files if name.endswith((".txt", ".txt.utf8")) and "raw" not in path]
+                 for name in files if name.endswith((".txt", ".txt.utf8"))]
 
     print(f"{len(all_files)} file(s) to process.")
     file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb)
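The dropped `and "raw" not in path` condition made sense when `--data_dir` pointed at `gutenberg/data` and the raw downloads had to be excluded; now that the default is `gutenberg/data/raw` itself, that condition would reject every file. A small sketch of the resulting collection logic, using a hypothetical directory layout for illustration:

```python
# Sketch of the file-collection logic; the directory layout is hypothetical.
import os

data_dir = "gutenberg/data/raw"
all_files = [
    os.path.join(path, name)
    for path, subdirs, files in os.walk(data_dir)
    for name in files
    if name.endswith((".txt", ".txt.utf8"))
]
# With the old `and "raw" not in path` filter, every path under
# gutenberg/data/raw would have been skipped, yielding zero files.
print(f"{len(all_files)} file(s) to process.")
```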
