189 changes: 127 additions & 62 deletions notebooks/02_data_text_prep.ipynb
@@ -63,7 +63,10 @@
"\n",
"\n",
"def normalize_unicode(text: str) -> str:\n",
" # NFKC makes quotes, full-width chars, and compatibility forms consistent\n",
" # Unicode NFKC normalization:\n",
" # - Makes quotes, full-width chars, and compatibility forms consistent\n",
" # - Also decomposes/recomposes characters and may change glyphs (e.g. fi -> fi)\n",
" # See Unicode Normalization Forms: https://unicode.org/reports/tr15/\n",
" text = unicodedata.normalize(\"NFKC\", text)\n",
" return text\n",
"\n",
@@ -94,19 +97,24 @@
"\n",
"\n",
"def strip_html(text: str) -> str:\n",
" \"\"\"Strip HTML tags from text. Handles malformed HTML gracefully.\"\"\"\n",
" if \"<\" not in text or \">\" not in text:\n",
" return text\n",
"\n",
" if BeautifulSoup is not None:\n",
" soup = BeautifulSoup(text, \"html.parser\")\n",
" # Remove script/style\n",
" for tag in soup([\"script\", \"style\", \"noscript\"]):\n",
" tag.decompose()\n",
" return soup.get_text(separator=\"\\n\")\n",
"\n",
" stripper = _HTMLStripper()\n",
" stripper.feed(text)\n",
" return stripper.get_text()\n",
" try:\n",
" if BeautifulSoup is not None:\n",
" soup = BeautifulSoup(text, \"html.parser\")\n",
" # Remove script/style\n",
" for tag in soup([\"script\", \"style\", \"noscript\"]):\n",
" tag.decompose()\n",
" return soup.get_text(separator=\"\\n\")\n",
"\n",
" stripper = _HTMLStripper()\n",
" stripper.feed(text)\n",
" return stripper.get_text()\n",
" except Exception:\n",
" # If HTML parsing fails, return text as-is\n",
" return text\n",
"\n",
"\n",
"def dedupe_consecutive_lines(text: str) -> str:\n",
@@ -145,7 +153,7 @@
" return True\n",
" if ratio_punct > max_punct_ratio:\n",
" return True\n",
" if drop_all_caps_short and ln.isupper() and len(ln) <= 25:\n",
" if drop_all_caps_short and ln.isupper() and 8 <= len(ln) <= 25 and \" \" in ln:\n",
" return True\n",
" return False\n",
"\n",
@@ -156,7 +164,14 @@
" return normalize_whitespace(\"\\n\".join(kept))\n",
"\n",
"\n",
"_RE_SENT_SPLIT = re.compile(r\"(?<=[.!?])\\s+(?=[A-Z0-9\"])\")\n",
"# Prefer NLTK sentence tokenizer if available; fall back to regex.\n",
"try:\n",
" from nltk.tokenize import sent_tokenize as _nltk_sent_tokenize # type: ignore\n",
"except Exception: # NLTK not installed or misconfigured\n",
" _nltk_sent_tokenize = None\n",
"\n",
"# Regex fallback is lightweight and has known limitations (abbreviations, ellipses, mid-sentence quotes).\n",
"_RE_SENT_SPLIT = re.compile(r\"(?<=[.!?])\\s+(?=[A-Z0-9\\\"'])\")\n",
"\n",
"\n",
"def split_sentences(text: str) -> List[str]:\n",
@@ -165,7 +180,17 @@
" if not text:\n",
" return []\n",
"\n",
" # Treat newlines as strong separators\n",
" if _nltk_sent_tokenize is not None:\n",
" # Use NLTK's Punkt sentence tokenizer when available (more robust than regex).\n",
" sentences: List[str] = []\n",
" for para in text.split(\"\\n\"):\n",
" para = para.strip()\n",
" if not para:\n",
" continue\n",
" sentences.extend(_nltk_sent_tokenize(para))\n",
" return sentences\n",
"\n",
" # Treat newlines as strong separators (regex-based fallback)\n",
" parts: List[str] = []\n",
" for para in text.split(\"\\n\"):\n",
" para = para.strip()\n",
@@ -195,7 +220,8 @@
" chunks.append(\" \".join(current).strip())\n",
" if overlap > 0:\n",
" current = current[-overlap:]\n",
" current_len = sum(len(s) + 1 for s in current)\n",
" # Recalculate length: sum of sentence lengths + spaces between them\n",
" current_len = sum(len(s) for s in current) + max(0, len(current) - 1)\n",
" else:\n",
" current = []\n",
" current_len = 0\n",
@@ -204,9 +230,11 @@
" s = s.strip()\n",
" if not s:\n",
" continue\n",
" s_len = len(s) + 1\n",
" # Calculate length including space separator (if not first sentence)\n",
" s_len = len(s) + (1 if current else 0)\n",
" if current and (current_len + s_len) > max_chars:\n",
" flush()\n",
" s_len = len(s) # Recalculate for new chunk (no leading space)\n",
" current.append(s)\n",
" current_len += s_len\n",
"\n",
@@ -217,10 +245,10 @@
" return [c for c in chunks if c]\n",
"\n",
"\n",
"_RE_EMAIL = re.compile(r\"\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\b\", re.IGNORECASE)\n",
"_RE_EMAIL = re.compile(r\"\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z0-9-]{2,63}\\b\", re.IGNORECASE)\n",
"_RE_PHONE = re.compile(r\"\\b(?:\\+?\\d{1,3}[-. ]?)?(?:\\(?\\d{2,4}\\)?[-. ]?)?\\d{3,4}[-. ]?\\d{3,4}\\b\")\n",
"_RE_URL = re.compile(r\"\\bhttps?://[^\\s]+\", re.IGNORECASE)\n",
"_RE_CREDIT_CARD = re.compile(r\"\\b(?:\\d[ -]*?){13,19}\\b\")\n",
"_RE_URL = re.compile(r\"\\bhttps?://[^\\s]+?(?=[\\s\\)\\]\\}>\\\"\\'.,!?]|$)\", re.IGNORECASE)\n",
"_RE_CREDIT_CARD = re.compile(r\"\\b(?:\\d{13,19}|\\d{4}(?:[ -]\\d{4}){3})\\b\")\n",
"\n",
"\n",
"def _luhn_ok(number: str) -> bool:\n",
@@ -239,39 +267,48 @@
"\n",
"\n",
"def redact_pii(text: str) -> Tuple[str, Dict[str, List[str]]]:\n",
" \"\"\"Redact PII from text. Handles regex errors gracefully.\"\"\"\n",
" entities: Dict[str, List[str]] = {\"email\": [], \"phone\": [], \"url\": [], \"card\": []}\n",
"\n",
" def repl_factory(kind: str):\n",
" def _repl(m: re.Match) -> str:\n",
" try:\n",
" def repl_factory(kind: str):\n",
" def _repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" entities[kind].append(val)\n",
" return f\"[{kind.upper()}_{len(entities[kind])}]\"\n",
" return _repl\n",
"\n",
" text = _RE_EMAIL.sub(repl_factory(\"email\"), text)\n",
" text = _RE_URL.sub(repl_factory(\"url\"), text)\n",
"\n",
" # Phone regex can overmatch; keep conservative by only replacing matches with enough digits\n",
" def phone_repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" entities[kind].append(val)\n",
" return f\"[{kind.upper()}_{len(entities[kind])}]\"\n",
" return _repl\n",
"\n",
" text = _RE_EMAIL.sub(repl_factory(\"email\"), text)\n",
" text = _RE_URL.sub(repl_factory(\"url\"), text)\n",
" digits = re.sub(r\"\\D\", \"\", val)\n",
" if len(digits) < 9:\n",
" return val\n",
" entities[\"phone\"].append(val)\n",
" return f\"[PHONE_{len(entities['phone'])}]\"\n",
"\n",
" # Phone regex can overmatch; keep conservative by only replacing matches with enough digits\n",
" def phone_repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" digits = re.sub(r\"\\D\", \"\", val)\n",
" if len(digits) < 9:\n",
" return val\n",
" entities[\"phone\"].append(val)\n",
" return f\"[PHONE_{len(entities['phone'])}]\"\n",
" text = _RE_PHONE.sub(phone_repl, text)\n",
"\n",
" text = _RE_PHONE.sub(phone_repl, text)\n",
" # Credit cards: validate by Luhn\n",
" def card_repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" if not _luhn_ok(val):\n",
" return val\n",
" entities[\"card\"].append(val)\n",
" return f\"[CARD_{len(entities['card'])}]\"\n",
"\n",
" # Credit cards: validate by Luhn\n",
" def card_repl(m: re.Match) -> str:\n",
" val = m.group(0)\n",
" if not _luhn_ok(val):\n",
" return val\n",
" entities[\"card\"].append(val)\n",
" return f\"[CARD_{len(entities['card'])}]\"\n",
" text = _RE_CREDIT_CARD.sub(card_repl, text)\n",
" except Exception:\n",
" # If redaction fails, return text as-is with no entities\n",
" pass\n",
"\n",
" text = _RE_CREDIT_CARD.sub(card_repl, text)\n",
" return text, {k: v for k, v in entities.items() if v}\n",
" # Do not return raw PII values to avoid retaining sensitive data in memory.\n",
" # Instead, return only non-sensitive placeholders for detected entities.\n",
" safe_entities = {k: [\"[REDACTED]\" for _ in v] for k, v in entities.items() if v}\n",
" return text, safe_entities\n",
"\n",
"\n",
"def detect_language(text: str) -> str:\n",
Expand All @@ -288,7 +325,11 @@
"\n",
"\n",
"def _estimate_syllables(word: str) -> int:\n",
" # Heuristic syllable estimator (English-ish). Good enough for gating, not research-grade.\n",
" \"\"\"\n",
" Heuristic syllable estimator (English-ish). Good enough for gating, not research-grade.\n",
" WARNING: This function is designed for English text and may produce unreliable results\n",
" for non-English languages.\n",
" \"\"\"\n",
" w = re.sub(r\"[^a-z]\", \"\", word.lower())\n",
" if not w:\n",
" return 0\n",
@@ -319,22 +360,34 @@
"\n",
"\n",
"def compute_stats(text: str) -> TextStats:\n",
" \"\"\"\n",
" Compute text statistics including Flesch Reading Ease score.\n",
" WARNING: Syllable estimation and sentence splitting are English-focused heuristics.\n",
" Results may be unreliable for non-English text.\n",
" \"\"\"\n",
" lang = detect_language(text)\n",
" words = re.findall(r\"[A-Za-z]+(?:'[A-Za-z]+)?\", text)\n",
" sentences = split_sentences(text)\n",
"\n",
" word_count = len(words)\n",
" sentence_count = max(1, len(sentences))\n",
" avg_sentence_len = (word_count / sentence_count) if sentence_count else 0.0\n",
" sentence_count = max(1, len(sentences)) # Ensure at least 1 to avoid division by zero\n",
" avg_sentence_len = word_count / sentence_count # Safe: sentence_count >= 1\n",
" avg_word_len = (sum(len(w) for w in words) / max(1, word_count))\n",
"\n",
" long_words = [w for w in words if _estimate_syllables(w) >= 3 or len(w) >= 10]\n",
" long_ratio = len(long_words) / max(1, word_count)\n",
"\n",
" # Flesch Reading Ease: 206.835 − 1.015*(words/sentences) − 84.6*(syllables/words)\n",
" # Flesch Reading Ease (FRE) score.\n",
" # Coefficients from the standard FRE formula:\n",
" # FRE = 206.835 − 1.015*(words/sentences) − 84.6*(syllables/words)\n",
" # See e.g. https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests\n",
" # Note: Sentence splitting is heuristic, so this FRE score is an approximation and may mis-estimate readability.\n",
" FRE_BASE = 206.835 # Base score for English texts in the FRE formula.\n",
" FRE_SENTENCE_WEIGHT = 1.015 # Penalty per average sentence length (words per sentence).\n",
" FRE_SYLLABLE_WEIGHT = 84.6 # Penalty per average syllables per word.\n",
" syllables = sum(_estimate_syllables(w) for w in words)\n",
" syllables_per_word = syllables / max(1, word_count)\n",
" fre = 206.835 - 1.015 * avg_sentence_len - 84.6 * syllables_per_word\n",
" fre = FRE_BASE - FRE_SENTENCE_WEIGHT * avg_sentence_len - FRE_SYLLABLE_WEIGHT * syllables_per_word\n",
"\n",
" return TextStats(\n",
" language=lang,\n",
@@ -348,18 +401,22 @@
" )\n",
"\n",
"\n",
"def should_send_to_llm(\n",
"def requires_llm_simplification(\n",
" stats: TextStats,\n",
" *,\n",
" min_words: int = 30,\n",
" max_words: int = 4000,\n",
" fre_threshold: float = 60.0,\n",
") -> bool:\n",
" # Simple gate: avoid LLM when too short/too long or already easy\n",
" \"\"\"\n",
" Determine if text needs LLM simplification based on readability metrics.\n",
" Uses Flesch Reading Ease score and word count to gate LLM processing.\n",
" \"\"\"\n",
" # Simple gate: avoid LLM when too short or already easy\n",
" if stats.word_count < min_words:\n",
" return False\n",
" if stats.word_count > max_words:\n",
" return True # will likely be chunked anyway\n",
" return True # Large documents likely need simplification; will be processed in chunks\n",
" # If FRE is high (easier), skip\n",
" return stats.flesch_reading_ease < fre_threshold\n",
"\n",
@@ -374,6 +431,14 @@
" max_chunk_chars: int = 1800,\n",
" chunk_overlap: int = 1,\n",
") -> Dict[str, object]:\n",
" # Validate chunking parameters\n",
" if max_chunk_chars <= 0:\n",
" raise ValueError(\"max_chunk_chars must be positive\")\n",
" if chunk_overlap < 0:\n",
" raise ValueError(\"chunk_overlap cannot be negative\")\n",
" if chunk_overlap >= max_chunk_chars:\n",
" raise ValueError(\"chunk_overlap must be less than max_chunk_chars\")\n",
"\n",
" original = text or \"\"\n",
"\n",
" cleaned = original\n",
@@ -387,22 +452,22 @@
" if drop_noise:\n",
" cleaned = drop_noise_lines(cleaned)\n",
"\n",
" pii = {}\n",
" redacted_entities = {}\n",
" if redact:\n",
" cleaned, pii = redact_pii(cleaned)\n",
" cleaned, redacted_entities = redact_pii(cleaned)\n",
"\n",
" stats = compute_stats(cleaned)\n",
" sentences = split_sentences(cleaned)\n",
" chunks = chunk_sentences(sentences, max_chars=max_chunk_chars, overlap=chunk_overlap)\n",
"\n",
" return {\n",
" 'text_original': original,\n",
" 'text_clean': cleaned,\n",
" 'pii': pii,\n",
" 'stats': asdict(stats),\n",
" 'sentences': sentences,\n",
" 'chunks': chunks,\n",
" 'send_to_llm': should_send_to_llm(stats),\n",
" \"text_original\": original,\n",
" \"text_clean\": cleaned,\n",
" \"pii\": redacted_entities,\n",
" \"stats\": asdict(stats),\n",
" \"sentences\": sentences,\n",
" \"chunks\": chunks,\n",
" \"send_to_llm\": requires_llm_simplification(stats),\n",
" }\n",
"\n",
"\n",