
Commit 09a3a73

TITC and rasbt authored
remove all non-English texts and notice (rasbt#304)
* remove all non-English texts and notice
  1. Almost 18 GB of text remains after filtering with `is_english`.
  2. Remove the Project Gutenberg notice using the gutenberg repo's `strip_headers`.
  3. After re-running `get_data.py`, all data appears to land under the `gutenberg/data/.mirror` folder.
* some improvements
* update readme

---------

Co-authored-by: rasbt <[email protected]>
1 parent f1c3d45 commit 09a3a73
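The "notice" in the commit message is the Project Gutenberg license boilerplate that wraps every downloaded book. The `strip_headers` helper comes from the cloned `gutenberg` repository's cleanup module; below is a rough illustration of its effect on a made-up miniature input (not a file from the actual dataset):

```python
# Illustration only: strip_headers is provided by the cloned gutenberg repo;
# the input below is a made-up miniature of a Project Gutenberg file.
from gutenberg.src.cleanup import strip_headers

raw = (
    "The Project Gutenberg EBook of Example, by A. Author\n"
    "*** START OF THIS PROJECT GUTENBERG EBOOK EXAMPLE ***\n"
    "Actual book text that we want to keep.\n"
    "*** END OF THIS PROJECT GUTENBERG EBOOK EXAMPLE ***\n"
)

# Everything outside the START/END markers is license boilerplate and is removed.
print(strip_headers(raw))
```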

File tree

2 files changed: +24 -5 lines changed


ch05/03_bonus_pretraining_on_gutenberg/README.md

+8 -1
@@ -82,11 +82,18 @@ Next, run the `prepare_dataset.py` script, which concatenates the (as of this wr
 
 ```bash
 python prepare_dataset.py \
-  --data_dir gutenberg/data \
+  --data_dir gutenberg/data/raw \
   --max_size_mb 500 \
   --output_dir gutenberg_preprocessed
 ```
 
+```
+...
+Skipping gutenberg/data/raw/PG29836_raw.txt as it does not contain primarily English text.
+Skipping gutenberg/data/raw/PG16527_raw.txt as it does not contain primarily English text.
+100%|██████████████████████████████████████████████████████████| 57250/57250 [25:04<00:00, 38.05it/s]
+42 file(s) saved in /Users/sebastian/Developer/LLMs-from-scratch/ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
+```
+
 > [!TIP]
 > Note that the produced files are stored in plaintext format and are not pre-tokenized for simplicity. However, you may want to update the code to store the dataset in a pre-tokenized form to save computation time if you are planning to use the dataset more often or train for multiple epochs. See the *Design Decisions and Improvements* at the bottom of this page for more information.
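Regarding the pre-tokenization tip above, here is a minimal sketch of what storing a pre-tokenized copy could look like, assuming the GPT-2 `tiktoken` tokenizer used in the main chapters and a hypothetical `.bin` filename convention (neither is part of this commit):

```python
# Sketch only: this commit stores plaintext; pre-tokenizing is an optional extension.
import numpy as np
import tiktoken

enc = tiktoken.get_encoding("gpt2")

# "combined_1.txt" is a hypothetical output filename used for illustration.
with open("gutenberg_preprocessed/combined_1.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Allow the <|endoftext|> document separator to map to its special token ID.
ids = enc.encode(text, allowed_special={"<|endoftext|>"})

# GPT-2's 50,257-token vocabulary fits into unsigned 16-bit integers.
np.array(ids, dtype=np.uint16).tofile("gutenberg_preprocessed/combined_1.bin")
```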

ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py

+16 -4
@@ -10,6 +10,13 @@
 import argparse
 import os
 import re
+from tqdm import tqdm
+from gutenberg.src.cleanup import strip_headers
+
+
+def is_english(text, threshold=0.9):
+    ascii_chars = sum(1 for c in text if ord(c) < 128)
+    return ascii_chars / len(text) > threshold
 
 
 def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
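The new `is_english` check is a crude ASCII-ratio heuristic rather than real language detection: a text passes if more than 90% of its characters are ASCII. This also filters out heavily accented Latin-script texts, which is in line with the commit's goal. A quick standalone demonstration:

```python
# Standalone demo of the ASCII-ratio heuristic defined above (not part of the diff).
def is_english(text, threshold=0.9):
    ascii_chars = sum(1 for c in text if ord(c) < 128)
    return ascii_chars / len(text) > threshold

print(is_english("The quick brown fox jumps over the lazy dog."))  # True (all ASCII)
print(is_english("Les œuvres complètes de Molière, précédées d'une notice."))  # False (~0.89 ASCII)
print(is_english("Доброе утро, как дела?"))  # False (mostly Cyrillic)
```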
@@ -20,16 +27,21 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
     current_size = 0
     file_counter = 1
 
-    for file_path in file_paths:
+    for file_path in tqdm(file_paths):
         try:
             with open(file_path, "r", encoding="utf-8") as file:
                 content = file.read()
         except UnicodeDecodeError:
             # Attempt to read the file with a fallback encoding
-            print(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
+            tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
             with open(file_path, "r", encoding=fallback_encoding) as file:
                 content = file.read()
 
+        if not is_english(content):
+            tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")
+            continue
+        content = strip_headers(content)
+
         # Regular expression to replace multiple blank lines with a single blank line
         content = re.sub(r'\n\s*\n', '\n\n', content)
         estimated_size = len(content.encode("utf-8"))
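The switch from `print` to `tqdm.write` pairs with wrapping the loop in `tqdm`: `tqdm.write` emits the message on its own line above the active progress bar instead of colliding with it, which is why the sample output shown in the README stays readable. A minimal sketch of the pattern:

```python
# Minimal sketch of the tqdm/tqdm.write pattern used in the loop above.
import time
from tqdm import tqdm

for i in tqdm(range(5)):
    if i == 2:
        # A plain print() here would break the progress-bar line;
        # tqdm.write() prints the message and then redraws the bar.
        tqdm.write("Skipping item 2 (example message).")
    time.sleep(0.1)
```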
@@ -56,7 +68,7 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
 
     parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")
 
-    parser.add_argument("--data_dir", type=str, default="gutenberg/data",
+    parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw",
                         help="Directory containing the downloaded raw training data")
     parser.add_argument("--max_size_mb", type=int, default=500,
                         help="The maximum file size for each concatenated file in megabytes")
@@ -66,7 +78,7 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
     args = parser.parse_args()
 
     all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(args.data_dir)
-                 for name in files if name.endswith((".txt", ".txt.utf8")) and "raw" not in path]
+                 for name in files if name.endswith((".txt", ".txt.utf8"))]
 
     print(f"{len(all_files)} file(s) to process.")
     file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb)
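The dropped `and "raw" not in path` condition made sense when `--data_dir` pointed at `gutenberg/data` and the raw downloads had to be excluded; now that the default is `gutenberg/data/raw` itself, that condition would reject every file. A small sketch of the resulting collection logic, using a hypothetical directory layout for illustration:

```python
# Sketch of the file-collection logic; the directory layout is hypothetical.
import os

data_dir = "gutenberg/data/raw"
all_files = [
    os.path.join(path, name)
    for path, subdirs, files in os.walk(data_dir)
    for name in files
    if name.endswith((".txt", ".txt.utf8"))
]
# With the old `and "raw" not in path` filter, every path under
# gutenberg/data/raw would have been skipped, yielding zero files.
print(f"{len(all_files)} file(s) to process.")
```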
