189 changes: 127 additions & 62 deletions notebooks/02_data_text_prep.ipynb
@@ -63,7 +63,10 @@
"\n",
"\n",
"def normalize_unicode(text: str) -> str:\n",
" # NFKC makes quotes, full-width chars, and compatibility forms consistent\n",
" # Unicode NFKC normalization:\n",
" # - Makes quotes, full-width chars, and compatibility forms consistent\n",
" # - Also decomposes/recomposes characters and may change glyphs (e.g. fi -> fi)\n",
" # See Unicode Normalization Forms: https://unicode.org/reports/tr15/\n",
" text = unicodedata.normalize(\"NFKC\", text)\n",
" return text\n",
"\n",
@@ -94,19 +97,24 @@
"\n",
"\n",
"def strip_html(text: str) -> str:\n",
" \"\"\"Strip HTML tags from text. Handles malformed HTML gracefully.\"\"\"\n",
" if \"<\" not in text or \">\" not in text:\n",
" return text\n",
"\n",
" if BeautifulSoup is not None:\n",
" soup = BeautifulSoup(text, \"html.parser\")\n",
" # Remove script/style\n",
" for tag in soup([\"script\", \"style\", \"noscript\"]):\n",
" tag.decompose()\n",
" return soup.get_text(separator=\"\\n\")\n",
"\n",
" stripper = _HTMLStripper()\n",
" stripper.feed(text)\n",
" return stripper.get_text()\n",
" try:\n",
" if BeautifulSoup is not None:\n",
" soup = BeautifulSoup(text, \"html.parser\")\n",
" # Remove script/style\n",
" for tag in soup([\"script\", \"style\", \"noscript\"]):\n",
" tag.decompose()\n",
" return soup.get_text(separator=\"\\n\")\n",
"\n",
" stripper = _HTMLStripper()\n",
" stripper.feed(text)\n",
" return stripper.get_text()\n",
" except Exception:\n",
" # If HTML parsing fails, return text as-is\n",
" return text\n",
"\n",
"\n",
"def dedupe_consecutive_lines(text: str) -> str:\n",
@@ -145,7 +153,7 @@
" return True\n",
" if ratio_punct > max_punct_ratio:\n",
" return True\n",
" if drop_all_caps_short and ln.isupper() and len(ln) <= 25:\n",
" if drop_all_caps_short and ln.isupper() and 8 <= len(ln) <= 25 and \" \" in ln:\n",
" return True\n",
" return False\n",
"\n",
@@ -156,7 +164,14 @@
" return normalize_whitespace(\"\\n\".join(kept))\n",
"\n",
"\n",
"_RE_SENT_SPLIT = re.compile(r\"(?<=[.!?])\\s+(?=[A-Z0-9\"])\")\n",
"# Prefer NLTK sentence tokenizer if available; fall back to regex.\n",
"try:\n",
" from nltk.tokenize import sent_tokenize as _nltk_sent_tokenize # type: ignore\n",
"except Exception: # NLTK not installed or misconfigured\n",
" _nltk_sent_tokenize = None\n",
"\n",
"# Regex fallback is lightweight and has known limitations (abbreviations, ellipses, mid-sentence quotes).\n",
"_RE_SENT_SPLIT = re.compile(r\"(?<=[.!?])\\s+(?=[A-Z0-9\\\"'])\")\n",
"\n",
"\n",
"def split_sentences(text: str) -> List[str]:\n",
@@ -165,7 +180,17 @@
" if not text:\n",
" return []\n",
"\n",
" # Treat newlines as strong separators\n",
" if _nltk_sent_tokenize is not None:\n",
" # Use NLTK's Punkt sentence tokenizer when available (more robust than regex).\n",
" sentences: List[str] = []\n",
" for para in text.split(\"\\n\"):\n",
" para = para.strip()\n",
" if not para:\n",
" continue\n",
" sentences.extend(_nltk_sent_tokenize(para))\n",
" return sentences\n",
"\n",
" # Treat newlines as strong separators (regex-based fallback)\n",
" parts: List[str] = []\n",
" for para in text.split(\"\\n\"):\n",
" para = para.strip()\n",
@@ -195,7 +220,8 @@
" chunks.append(\" \".join(current).strip())\n",
" if overlap > 0:\n",
" current = current[-overlap:]\n",
" current_len = sum(len(s) + 1 for s in current)\n",
" # Recalculate length: sum of sentence lengths + spaces between them\n",
" current_len = sum(len(s) for s in current) + max(0, len(current) - 1)\n",
" else:\n",
" current = []\n",
" current_len = 0\n",
@@ -204,9 +230,11 @@
" s = s.strip()\n",
" if not s:\n",
" continue\n",
" s_len = len(s) + 1\n",
" # Calculate length including space separator (if not first sentence)\n",
" s_len = len(s) + (1 if current else 0)\n",
" if current and (current_len + s_len) > max_chars:\n",
" flush()\n",
" s_len = len(s) # Recalculate for new chunk (no leading space)\n",
" current.append(s)\n",
" current_len += s_len\n",
"\n",
@@ -217,10 +245,10 @@
" return [c for c in chunks if c]\n",
"\n",
"\n",
"_RE_EMAIL = re.compile(r\"\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\b\", re.IGNORECASE)\n",
"_RE_EMAIL = re.compile(r\"\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z0-9-]{2,63}\\b\", re.IGNORECASE)\n",
"_RE_PHONE = re.compile(r\"\\b(?:\\+?\\d{1,3}[-. ]?)?(?:\\(?\\d{2,4}\\)?[-. ]?)?\\d{3,4}[-. ]?\\d{3,4}\\b\")\n",
"_RE_URL = re.compile(r\"\\bhttps?://[^\\s]+\", re.IGNORECASE)\n",
"_RE_CREDIT_CARD = re.compile(r\"\\b(?:\\d[ -]*?){13,19}\\b\")\n",
"_RE_URL = re.compile(r\"\\bhttps?://[^\\s]+?(?=[\\s\\)\\]\\}>\\\"\\'.,!?]|$)\", re.IGNORECASE)\n",
"_RE_CREDIT_CARD = re.compile(r\"\\b(?:\\d{13,19}|\\d{4}(?:[ -]\\d{4}){3})\\b\")\n",
"\n",
"\n",
"def _luhn_ok(number: str) -> bool:\n",
@@ -239,39 +267,48 @@
"\n",
"\n",
"def redact_pii(text: str) -> Tuple[str, Dict[str, List[str]]]:\n",
" \"\"\"Redact PII from text. Handles regex errors gracefully.\"\"\"\n",
" entities: Dict[str, List[str]] = {\"email\": [], \"phone\": [], \"url\": [], \"card\": []}\n",
"\n",
" def repl_factory(kind: str):\n",
" def _repl(m: re.Match) -> str:\n",
" try:\n",
" def repl_factory(kind: str):\n",
" def _repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" entities[kind].append(val)\n",
" return f\"[{kind.upper()}_{len(entities[kind])}]\"\n",
" return _repl\n",
"\n",
" text = _RE_EMAIL.sub(repl_factory(\"email\"), text)\n",
" text = _RE_URL.sub(repl_factory(\"url\"), text)\n",
"\n",
" # Phone regex can overmatch; keep conservative by only replacing matches with enough digits\n",
" def phone_repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" entities[kind].append(val)\n",
" return f\"[{kind.upper()}_{len(entities[kind])}]\"\n",
" return _repl\n",
"\n",
" text = _RE_EMAIL.sub(repl_factory(\"email\"), text)\n",
" text = _RE_URL.sub(repl_factory(\"url\"), text)\n",
" digits = re.sub(r\"\\D\", \"\", val)\n",
" if len(digits) < 9:\n",
" return val\n",
" entities[\"phone\"].append(val)\n",
" return f\"[PHONE_{len(entities['phone'])}]\"\n",
"\n",
" # Phone regex can overmatch; keep conservative by only replacing matches with enough digits\n",
" def phone_repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" digits = re.sub(r\"\\D\", \"\", val)\n",
" if len(digits) < 9:\n",
" return val\n",
" entities[\"phone\"].append(val)\n",
" return f\"[PHONE_{len(entities['phone'])}]\"\n",
" text = _RE_PHONE.sub(phone_repl, text)\n",
"\n",
" text = _RE_PHONE.sub(phone_repl, text)\n",
" # Credit cards: validate by Luhn\n",
" def card_repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" if not _luhn_ok(val):\n",
" return val\n",
" entities[\"card\"].append(val)\n",
" return f\"[CARD_{len(entities['card'])}]\"\n",
"\n",
" # Credit cards: validate by Luhn\n",
" def card_repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" if not _luhn_ok(val):\n",
" return val\n",
" entities[\"card\"].append(val)\n",
" return f\"[CARD_{len(entities['card'])}]\"\n",
" text = _RE_CREDIT_CARD.sub(card_repl, text)\n",
" except Exception:\n",
" # If redaction fails, return text as-is with no entities\n",
" pass\n",
"\n",
" text = _RE_CREDIT_CARD.sub(card_repl, text)\n",
" return text, {k: v for k, v in entities.items() if v}\n",
" # Do not return raw PII values to avoid retaining sensitive data in memory.\n",
" # Instead, return only non-sensitive placeholders for detected entities.\n",
" safe_entities = {k: [\"[REDACTED]\" for _ in v] for k, v in entities.items() if v}\n",
" return text, safe_entities\n",
"\n",
"\n",
"def detect_language(text: str) -> str:\n",
Expand All @@ -288,7 +325,11 @@
"\n",
"\n",
"def _estimate_syllables(word: str) -> int:\n",
" # Heuristic syllable estimator (English-ish). Good enough for gating, not research-grade.\n",
" \"\"\"\n",
" Heuristic syllable estimator (English-ish). Good enough for gating, not research-grade.\n",
" WARNING: This function is designed for English text and may produce unreliable results\n",
" for non-English languages.\n",
" \"\"\"\n",
" w = re.sub(r\"[^a-z]\", \"\", word.lower())\n",
" if not w:\n",
" return 0\n",
@@ -319,22 +360,34 @@
"\n",
"\n",
"def compute_stats(text: str) -> TextStats:\n",
" \"\"\"\n",
" Compute text statistics including Flesch Reading Ease score.\n",
" WARNING: Syllable estimation and sentence splitting are English-focused heuristics.\n",
" Results may be unreliable for non-English text.\n",
" \"\"\"\n",
" lang = detect_language(text)\n",
" words = re.findall(r\"[A-Za-z]+(?:'[A-Za-z]+)?\", text)\n",
" sentences = split_sentences(text)\n",
"\n",
" word_count = len(words)\n",
" sentence_count = max(1, len(sentences))\n",
" avg_sentence_len = (word_count / sentence_count) if sentence_count else 0.0\n",
" sentence_count = max(1, len(sentences)) # Ensure at least 1 to avoid division by zero\n",
" avg_sentence_len = word_count / sentence_count # Safe: sentence_count >= 1\n",
" avg_word_len = (sum(len(w) for w in words) / max(1, word_count))\n",
"\n",
" long_words = [w for w in words if _estimate_syllables(w) >= 3 or len(w) >= 10]\n",
" long_ratio = len(long_words) / max(1, word_count)\n",
"\n",
" # Flesch Reading Ease: 206.835 − 1.015*(words/sentences) − 84.6*(syllables/words)\n",
" # Flesch Reading Ease (FRE) score.\n",
" # Coefficients from the standard FRE formula:\n",
" # FRE = 206.835 − 1.015*(words/sentences) − 84.6*(syllables/words)\n",
" # See e.g. https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests\n",
" # Note: Sentence splitting is heuristic, so this FRE score is an approximation and may mis-estimate readability.\n",
" FRE_BASE = 206.835 # Base score for English texts in the FRE formula.\n",
" FRE_SENTENCE_WEIGHT = 1.015 # Penalty per average sentence length (words per sentence).\n",
" FRE_SYLLABLE_WEIGHT = 84.6 # Penalty per average syllables per word.\n",
" syllables = sum(_estimate_syllables(w) for w in words)\n",
" syllables_per_word = syllables / max(1, word_count)\n",
" fre = 206.835 - 1.015 * avg_sentence_len - 84.6 * syllables_per_word\n",
" fre = FRE_BASE - FRE_SENTENCE_WEIGHT * avg_sentence_len - FRE_SYLLABLE_WEIGHT * syllables_per_word\n",
"\n",
" return TextStats(\n",
" language=lang,\n",
@@ -348,18 +401,22 @@
" )\n",
"\n",
"\n",
"def should_send_to_llm(\n",
"def requires_llm_simplification(\n",
" stats: TextStats,\n",
" *,\n",
" min_words: int = 30,\n",
" max_words: int = 4000,\n",
" fre_threshold: float = 60.0,\n",
") -> bool:\n",
" # Simple gate: avoid LLM when too short/too long or already easy\n",
" \"\"\"\n",
" Determine if text needs LLM simplification based on readability metrics.\n",
" Uses Flesch Reading Ease score and word count to gate LLM processing.\n",
" \"\"\"\n",
" # Simple gate: avoid LLM when too short or already easy\n",
" if stats.word_count < min_words:\n",
" return False\n",
" if stats.word_count > max_words:\n",
" return True # will likely be chunked anyway\n",
" return True # Large documents likely need simplification; will be processed in chunks\n",
" # If FRE is high (easier), skip\n",
" return stats.flesch_reading_ease < fre_threshold\n",
"\n",
@@ -374,6 +431,14 @@
" max_chunk_chars: int = 1800,\n",
" chunk_overlap: int = 1,\n",
") -> Dict[str, object]:\n",
" # Validate chunking parameters\n",
" if max_chunk_chars <= 0:\n",
" raise ValueError(\"max_chunk_chars must be positive\")\n",
" if chunk_overlap < 0:\n",
" raise ValueError(\"chunk_overlap cannot be negative\")\n",
" if chunk_overlap >= max_chunk_chars:\n",
" raise ValueError(\"chunk_overlap must be less than max_chunk_chars\")\n",
"\n",
" original = text or \"\"\n",
"\n",
" cleaned = original\n",
@@ -387,22 +452,22 @@
" if drop_noise:\n",
" cleaned = drop_noise_lines(cleaned)\n",
"\n",
" pii = {}\n",
" redacted_entities = {}\n",
" if redact:\n",
" cleaned, pii = redact_pii(cleaned)\n",
" cleaned, redacted_entities = redact_pii(cleaned)\n",
"\n",
" stats = compute_stats(cleaned)\n",
" sentences = split_sentences(cleaned)\n",
" chunks = chunk_sentences(sentences, max_chars=max_chunk_chars, overlap=chunk_overlap)\n",
"\n",
" return {\n",
" 'text_original': original,\n",
" 'text_clean': cleaned,\n",
" 'pii': pii,\n",
" 'stats': asdict(stats),\n",
" 'sentences': sentences,\n",
" 'chunks': chunks,\n",
" 'send_to_llm': should_send_to_llm(stats),\n",
" \"text_original\": original,\n",
" \"text_clean\": cleaned,\n",
" \"pii\": redacted_entities,\n",
" \"stats\": asdict(stats),\n",
" \"sentences\": sentences,\n",
" \"chunks\": chunks,\n",
" \"send_to_llm\": requires_llm_simplification(stats),\n",
" }\n",
"\n",
"\n",