diff --git a/notebooks/02_data_text_prep.ipynb b/notebooks/02_data_text_prep.ipynb
new file mode 100644
index 0000000..bc66635
--- /dev/null
+++ b/notebooks/02_data_text_prep.ipynb
@@ -0,0 +1,543 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "84c56b8c",
+ "metadata": {},
+ "source": [
+ "## 1) Setup\n",
+ "The pipeline is designed to run with **stdlib only**. Optional packages (BeautifulSoup, langdetect) are used if installed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34dc7bff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from __future__ import annotations\n",
+ "\n",
+ "import re\n",
+ "import json\n",
+ "import unicodedata\n",
+ "from dataclasses import dataclass, asdict\n",
+ "from html.parser import HTMLParser\n",
+ "from pathlib import Path\n",
+ "from typing import Dict, List, Tuple\n",
+ "\n",
+ "# Optional dependencies\n",
+ "try:\n",
+ "    from bs4 import BeautifulSoup  # type: ignore\n",
+ "except Exception:\n",
+ "    BeautifulSoup = None\n",
+ "\n",
+ "try:\n",
+ "    from langdetect import detect as lang_detect  # type: ignore\n",
+ "except Exception:\n",
+ "    lang_detect = None\n",
+ "\n",
+ "print(\"✅ Imports loaded (optional dependencies may be absent).\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "31c96071",
+ "metadata": {},
+ "source": [
+ "## 2) Core utilities\n",
+ "This section defines the preprocessing steps and a single `preprocess_text()` entrypoint."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d99e4f4d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_RE_MULTISPACE = re.compile(r\"[\\t\\r\\f\\v ]+\")\n",
+ "_RE_MULTINEWLINE = re.compile(r\"\\n{3,}\")\n",
+ "_RE_ZERO_WIDTH = re.compile(r\"[\\u200B-\\u200D\\uFEFF]\")\n",
+ "\n",
+ "\n",
+ "def normalize_unicode(text: str) -> str:\n",
+ "    # Unicode NFKC normalization:\n",
+ "    # - Makes quotes, full-width chars, and compatibility forms consistent\n",
+ "    # - Also decomposes/recomposes characters and may change glyphs (e.g. the ligature ﬁ -> fi)\n",
+ "    # See Unicode Normalization Forms: https://unicode.org/reports/tr15/\n",
+ "    text = unicodedata.normalize(\"NFKC\", text)\n",
+ "    return text\n",
+ "\n",
+ "\n",
+ "def normalize_whitespace(text: str) -> str:\n",
+ "    text = text.replace(\"\\r\\n\", \"\\n\").replace(\"\\r\", \"\\n\")\n",
+ "    text = _RE_ZERO_WIDTH.sub(\"\", text)\n",
+ "    # Normalize space runs but preserve newlines\n",
+ "    text = _RE_MULTISPACE.sub(\" \", text)\n",
+ "    # Trim spaces around newlines\n",
+ "    text = \"\\n\".join(line.strip() for line in text.split(\"\\n\"))\n",
+ "    # Collapse excessive blank lines\n",
+ "    text = _RE_MULTINEWLINE.sub(\"\\n\\n\", text)\n",
+ "    return text.strip()\n",
+ "\n",
+ "\n",
+ "class _HTMLStripper(HTMLParser):\n",
+ "    def __init__(self):\n",
+ "        super().__init__()\n",
+ "        self._chunks: List[str] = []\n",
+ "\n",
+ "    def handle_data(self, data: str) -> None:\n",
+ "        if data:\n",
+ "            self._chunks.append(data)\n",
+ "\n",
+ "    def get_text(self) -> str:\n",
+ "        return \"\".join(self._chunks)\n",
+ "\n",
+ "\n",
+ "def strip_html(text: str) -> str:\n",
+ "    \"\"\"Strip HTML tags from text. Handles malformed HTML gracefully.\"\"\"\n",
+ "    if \"<\" not in text or \">\" not in text:\n",
+ "        return text\n",
+ "\n",
+ "    try:\n",
+ "        if BeautifulSoup is not None:\n",
+ "            soup = BeautifulSoup(text, \"html.parser\")\n",
+ "            # Remove script/style\n",
+ "            for tag in soup([\"script\", \"style\", \"noscript\"]):\n",
+ "                tag.decompose()\n",
+ "            return soup.get_text(separator=\"\\n\")\n",
+ "\n",
+ "        stripper = _HTMLStripper()\n",
+ "        stripper.feed(text)\n",
+ "        return stripper.get_text()\n",
+ "    except Exception:\n",
+ "        # If HTML parsing fails, return text as-is\n",
+ "        return text\n",
+ "\n",
+ "\n",
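Quick sanity check for the cleaning helpers above (illustrative, not part of the diff): with BeautifulSoup installed the `<script>` payload is removed, while the stdlib fallback keeps script *text*, because `HTMLParser.handle_data` still sees it.

```python
html_in = "<p>Hello&nbsp;<b>world</b></p><script>alert(1)</script>"
cleaned = normalize_whitespace(normalize_unicode(strip_html(html_in)))
print(cleaned)
# bs4 path: "Hello\nworld" — the "\n" separator splits at tag boundaries,
# and NFKC turns the &nbsp; into a plain space that gets trimmed.
# stdlib fallback: "Hello worldalert(1)" — tags are dropped, but the
# script body leaks through as ordinary text.
```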
+ "def dedupe_consecutive_lines(text: str) -> str:\n",
+ "    lines = [ln.rstrip() for ln in text.split(\"\\n\")]\n",
+ "    out: List[str] = []\n",
+ "    last = None\n",
+ "    for ln in lines:\n",
+ "        if ln and last == ln:\n",
+ "            continue\n",
+ "        out.append(ln)\n",
+ "        last = ln if ln else last\n",
+ "    return \"\\n\".join(out).strip()\n",
+ "\n",
+ "\n",
+ "def drop_noise_lines(\n",
+ "    text: str,\n",
+ "    *,\n",
+ "    min_chars: int = 3,\n",
+ "    min_alpha_ratio: float = 0.25,\n",
+ "    max_punct_ratio: float = 0.35,\n",
+ "    drop_all_caps_short: bool = True,\n",
+ ") -> str:\n",
+ "    def is_noise(line: str) -> bool:\n",
+ "        ln = line.strip()\n",
+ "        if not ln:\n",
+ "            return False\n",
+ "        if len(ln) < min_chars:\n",
+ "            return True\n",
+ "\n",
+ "        alpha = sum(ch.isalpha() for ch in ln)\n",
+ "        punct = sum(unicodedata.category(ch).startswith(\"P\") for ch in ln)\n",
+ "        ratio_alpha = alpha / max(1, len(ln))\n",
+ "        ratio_punct = punct / max(1, len(ln))\n",
+ "\n",
+ "        if ratio_alpha < min_alpha_ratio:\n",
+ "            return True\n",
+ "        if ratio_punct > max_punct_ratio:\n",
+ "            return True\n",
+ "        if drop_all_caps_short and ln.isupper() and 8 <= len(ln) <= 25 and \" \" in ln:\n",
+ "            return True\n",
+ "        return False\n",
+ "\n",
+ "    kept: List[str] = []\n",
+ "    for ln in text.split(\"\\n\"):\n",
+ "        if not is_noise(ln):\n",
+ "            kept.append(ln)\n",
+ "    return normalize_whitespace(\"\\n\".join(kept))\n",
+ "\n",
+ "\n",
+ "# Prefer NLTK sentence tokenizer if available; fall back to regex.\n",
+ "try:\n",
+ "    from nltk.tokenize import sent_tokenize as _nltk_sent_tokenize  # type: ignore\n",
+ "except Exception:  # NLTK not installed or misconfigured\n",
+ "    _nltk_sent_tokenize = None\n",
+ "\n",
+ "# Regex fallback is lightweight and has known limitations (abbreviations, ellipses, mid-sentence quotes).\n",
+ "_RE_SENT_SPLIT = re.compile(r\"(?<=[.!?])\\s+(?=[A-Z0-9\\\"'])\")\n",
+ "\n",
+ "\n",
+ "def split_sentences(text: str) -> List[str]:\n",
+ "    # Lightweight splitter: good enough for chunking (not perfect).\n",
+ "    text = normalize_whitespace(text)\n",
+ "    if not text:\n",
+ "        return []\n",
+ "\n",
+ "    if _nltk_sent_tokenize is not None:\n",
+ "        # Use NLTK's Punkt sentence tokenizer when available (more robust than regex).\n",
+ "        try:\n",
+ "            sentences: List[str] = []\n",
+ "            for para in text.split(\"\\n\"):\n",
+ "                para = para.strip()\n",
+ "                if not para:\n",
+ "                    continue\n",
+ "                sentences.extend(_nltk_sent_tokenize(para))\n",
+ "            return sentences\n",
+ "        except LookupError:\n",
+ "            # Punkt model data is not downloaded; fall back to the regex splitter.\n",
+ "            pass\n",
+ "\n",
+ "    # Treat newlines as strong separators (regex-based fallback)\n",
+ "    parts: List[str] = []\n",
+ "    for para in text.split(\"\\n\"):\n",
+ "        para = para.strip()\n",
+ "        if not para:\n",
+ "            continue\n",
+ "        parts.extend(_RE_SENT_SPLIT.split(para))\n",
+ "\n",
+ "    return [p.strip() for p in parts if p.strip()]\n",
+ "\n",
+ "\n",
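Why the Punkt path is preferred: the regex fallback splits after any terminal punctuation followed by whitespace and an uppercase letter or digit, so abbreviations over-split. Illustrative behavior (Punkt output may differ slightly):

```python
demo = "Dr. Smith arrived at 5 p.m. He was late. The meeting ended."
print(split_sentences(demo))
# Regex fallback over-splits on "Dr.":
# ['Dr.', 'Smith arrived at 5 p.m.', 'He was late.', 'The meeting ended.']
# With NLTK's Punkt available, "Dr. Smith" typically stays together.
```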
+ "def chunk_sentences(\n",
+ "    sentences: List[str],\n",
+ "    *,\n",
+ "    max_chars: int = 1800,\n",
+ "    overlap: int = 1,\n",
+ ") -> List[str]:\n",
+ "    if not sentences:\n",
+ "        return []\n",
+ "\n",
+ "    chunks: List[str] = []\n",
+ "    current: List[str] = []\n",
+ "    current_len = 0\n",
+ "\n",
+ "    def flush() -> None:\n",
+ "        nonlocal current, current_len\n",
+ "        if current:\n",
+ "            chunks.append(\" \".join(current).strip())\n",
+ "        if overlap > 0:\n",
+ "            current = current[-overlap:]\n",
+ "            # Recalculate length: sum of sentence lengths + spaces between them\n",
+ "            current_len = sum(len(s) for s in current) + max(0, len(current) - 1)\n",
+ "        else:\n",
+ "            current = []\n",
+ "            current_len = 0\n",
+ "\n",
+ "    for s in sentences:\n",
+ "        s = s.strip()\n",
+ "        if not s:\n",
+ "            continue\n",
+ "        # Calculate length including space separator (if not first sentence)\n",
+ "        s_len = len(s) + (1 if current else 0)\n",
+ "        if current and (current_len + s_len) > max_chars:\n",
+ "            flush()\n",
+ "            # After flush() the overlap sentences may remain in `current`,\n",
+ "            # so recompute the separator-aware length for this sentence.\n",
+ "            s_len = len(s) + (1 if current else 0)\n",
+ "        current.append(s)\n",
+ "        current_len += s_len\n",
+ "\n",
+ "    if current:\n",
+ "        chunks.append(\" \".join(current).strip())\n",
+ "\n",
+ "    # Safety: remove empty\n",
+ "    return [c for c in chunks if c]\n",
+ "\n",
+ "\n",
+ "_RE_EMAIL = re.compile(r\"\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z0-9-]{2,63}\\b\", re.IGNORECASE)\n",
+ "_RE_PHONE = re.compile(r\"\\b(?:\\+?\\d{1,3}[-. ]?)?(?:\\(?\\d{2,4}\\)?[-. ]?)?\\d{3,4}[-. ]?\\d{3,4}\\b\")\n",
+ "_RE_URL = re.compile(r\"\\bhttps?://[^\\s]+?(?=[\\s\\)\\]\\}>\\\"\\'.,!?]|$)\", re.IGNORECASE)\n",
+ "_RE_CREDIT_CARD = re.compile(r\"\\b(?:\\d{13,19}|\\d{4}(?:[ -]\\d{4}){3})\\b\")\n",
+ "\n",
+ "\n",
+ "def _luhn_ok(number: str) -> bool:\n",
+ "    digits = [int(ch) for ch in re.sub(r\"\\D\", \"\", number)]\n",
+ "    if len(digits) < 13 or len(digits) > 19:\n",
+ "        return False\n",
+ "    checksum = 0\n",
+ "    parity = len(digits) % 2\n",
+ "    for i, d in enumerate(digits):\n",
+ "        if i % 2 == parity:\n",
+ "            d *= 2\n",
+ "            if d > 9:\n",
+ "                d -= 9\n",
+ "        checksum += d\n",
+ "    return checksum % 10 == 0\n",
+ "\n",
+ "\n",
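The Luhn gate is what keeps the card regex from redacting arbitrary digit runs. A worked check with a synthetic, Luhn-valid test number (not a real account):

```python
# For 4539 1488 0343 6467 the Luhn sum works out to 80, and 80 % 10 == 0,
# so it is treated as a plausible card number; changing any single digit
# breaks the checksum.
print(_luhn_ok("4539 1488 0343 6467"))  # True
print(_luhn_ok("4539 1488 0343 6468"))  # False
```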
+ "def redact_pii(text: str) -> Tuple[str, Dict[str, List[str]]]:\n",
+ "    \"\"\"Redact PII from text. Handles regex errors gracefully.\"\"\"\n",
+ "    entities: Dict[str, List[str]] = {\"email\": [], \"phone\": [], \"url\": [], \"card\": []}\n",
+ "\n",
+ "    try:\n",
+ "        def repl_factory(kind: str):\n",
+ "            def _repl(m: re.Match) -> str:\n",
+ "                val = m.group(0)\n",
+ "                entities[kind].append(val)\n",
+ "                return f\"[{kind.upper()}_{len(entities[kind])}]\"\n",
+ "            return _repl\n",
+ "\n",
+ "        text = _RE_EMAIL.sub(repl_factory(\"email\"), text)\n",
+ "        text = _RE_URL.sub(repl_factory(\"url\"), text)\n",
+ "\n",
+ "        # Credit cards: validate by Luhn. Run before the phone pass so the\n",
+ "        # phone regex cannot consume fragments of a spaced card number first.\n",
+ "        def card_repl(m: re.Match) -> str:\n",
+ "            val = m.group(0)\n",
+ "            if not _luhn_ok(val):\n",
+ "                return val\n",
+ "            entities[\"card\"].append(val)\n",
+ "            return f\"[CARD_{len(entities['card'])}]\"\n",
+ "\n",
+ "        text = _RE_CREDIT_CARD.sub(card_repl, text)\n",
+ "\n",
+ "        # Phone regex can overmatch; keep conservative by only replacing matches with enough digits\n",
+ "        def phone_repl(m: re.Match) -> str:\n",
+ "            val = m.group(0)\n",
+ "            digits = re.sub(r\"\\D\", \"\", val)\n",
+ "            if len(digits) < 9:\n",
+ "                return val\n",
+ "            entities[\"phone\"].append(val)\n",
+ "            return f\"[PHONE_{len(entities['phone'])}]\"\n",
+ "\n",
+ "        text = _RE_PHONE.sub(phone_repl, text)\n",
+ "    except Exception:\n",
+ "        # If a substitution fails partway, keep whatever redactions succeeded\n",
+ "        pass\n",
+ "\n",
+ "    # Do not return raw PII values to avoid retaining sensitive data in memory.\n",
+ "    # Instead, return only non-sensitive placeholders for detected entities.\n",
+ "    safe_entities = {k: [\"[REDACTED]\" for _ in v] for k, v in entities.items() if v}\n",
+ "    return text, safe_entities\n",
+ "\n",
+ "\n",
+ "def detect_language(text: str) -> str:\n",
+ "    # Optional: only if langdetect installed\n",
+ "    if lang_detect is None:\n",
+ "        return \"unknown\"\n",
+ "    sample = text\n",
+ "    if len(sample) > 4000:\n",
+ "        sample = sample[:4000]\n",
+ "    try:\n",
+ "        return lang_detect(sample)\n",
+ "    except Exception:\n",
+ "        return \"unknown\"\n",
+ "\n",
+ "\n",
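End-to-end behavior of `redact_pii` on placeholder contact data (illustrative; what gets caught depends on the conservative regexes above):

```python
sample = "Mail jane.doe@example.org, call 555-123-4567, card 4539 1488 0343 6467."
redacted, found = redact_pii(sample)
print(redacted)
# -> "Mail [EMAIL_1], call [PHONE_1], card [CARD_1]."
print(found)
# -> {'email': ['[REDACTED]'], 'phone': ['[REDACTED]'], 'card': ['[REDACTED]']}
# Only placeholders come back; the raw matches are never returned.
```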
+ "def _estimate_syllables(word: str) -> int:\n",
+ "    \"\"\"\n",
+ "    Heuristic syllable estimator (English-ish). Good enough for gating, not research-grade.\n",
+ "    WARNING: This function is designed for English text and may produce unreliable results\n",
+ "    for non-English languages.\n",
+ "    \"\"\"\n",
+ "    w = re.sub(r\"[^a-z]\", \"\", word.lower())\n",
+ "    if not w:\n",
+ "        return 0\n",
+ "    vowels = \"aeiouy\"\n",
+ "    count = 0\n",
+ "    prev_vowel = False\n",
+ "    for ch in w:\n",
+ "        is_v = ch in vowels\n",
+ "        if is_v and not prev_vowel:\n",
+ "            count += 1\n",
+ "        prev_vowel = is_v\n",
+ "    # silent e\n",
+ "    if w.endswith(\"e\") and count > 1:\n",
+ "        count -= 1\n",
+ "    return max(1, count)\n",
+ "\n",
+ "\n",
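What the estimator actually does: count contiguous vowel groups, then subtract one for a trailing silent "e". A few spot checks, including a known misfire:

```python
for w in ["cat", "table", "readability", "queue"]:
    print(w, _estimate_syllables(w))
# cat -> 1, readability -> 5, queue -> 1 (all correct)
# table -> 1 (the silent-e rule misfires; "ta-ble" is really 2)
```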
+ "@dataclass\n",
+ "class TextStats:\n",
+ "    language: str\n",
+ "    char_count: int\n",
+ "    word_count: int\n",
+ "    sentence_count: int\n",
+ "    avg_sentence_len_words: float\n",
+ "    avg_word_len_chars: float\n",
+ "    long_word_ratio: float\n",
+ "    flesch_reading_ease: float\n",
+ "\n",
+ "\n",
+ "def compute_stats(text: str) -> TextStats:\n",
+ "    \"\"\"\n",
+ "    Compute text statistics including Flesch Reading Ease score.\n",
+ "    WARNING: Syllable estimation and sentence splitting are English-focused heuristics.\n",
+ "    Results may be unreliable for non-English text.\n",
+ "    \"\"\"\n",
+ "    lang = detect_language(text)\n",
+ "    words = re.findall(r\"[A-Za-z]+(?:'[A-Za-z]+)?\", text)\n",
+ "    sentences = split_sentences(text)\n",
+ "\n",
+ "    word_count = len(words)\n",
+ "    sentence_count = max(1, len(sentences))  # Ensure at least 1 to avoid division by zero\n",
+ "    avg_sentence_len = word_count / sentence_count  # Safe: sentence_count >= 1\n",
+ "    avg_word_len = (sum(len(w) for w in words) / max(1, word_count))\n",
+ "\n",
+ "    long_words = [w for w in words if _estimate_syllables(w) >= 3 or len(w) >= 10]\n",
+ "    long_ratio = len(long_words) / max(1, word_count)\n",
+ "\n",
+ "    # Flesch Reading Ease (FRE) score.\n",
+ "    # Coefficients from the standard FRE formula:\n",
+ "    #   FRE = 206.835 − 1.015*(words/sentences) − 84.6*(syllables/words)\n",
+ "    # See e.g. https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests\n",
+ "    # Note: Sentence splitting is heuristic, so this FRE score is an approximation and may mis-estimate readability.\n",
+ "    FRE_BASE = 206.835  # Base score for English texts in the FRE formula.\n",
+ "    FRE_SENTENCE_WEIGHT = 1.015  # Weight on average sentence length (words per sentence).\n",
+ "    FRE_SYLLABLE_WEIGHT = 84.6  # Weight on average syllables per word.\n",
+ "    syllables = sum(_estimate_syllables(w) for w in words)\n",
+ "    syllables_per_word = syllables / max(1, word_count)\n",
+ "    fre = FRE_BASE - FRE_SENTENCE_WEIGHT * avg_sentence_len - FRE_SYLLABLE_WEIGHT * syllables_per_word\n",
+ "\n",
+ "    return TextStats(\n",
+ "        language=lang,\n",
+ "        char_count=len(text),\n",
+ "        word_count=word_count,\n",
+ "        sentence_count=len(sentences),\n",
+ "        avg_sentence_len_words=float(avg_sentence_len),\n",
+ "        avg_word_len_chars=float(avg_word_len),\n",
+ "        long_word_ratio=float(long_ratio),\n",
+ "        flesch_reading_ease=float(fre),\n",
+ "    )\n",
+ "\n",
+ "\n",
+ "def requires_llm_simplification(\n",
+ "    stats: TextStats,\n",
+ "    *,\n",
+ "    min_words: int = 30,\n",
+ "    max_words: int = 4000,\n",
+ "    fre_threshold: float = 60.0,\n",
+ ") -> bool:\n",
+ "    \"\"\"\n",
+ "    Determine if text needs LLM simplification based on readability metrics.\n",
+ "    Uses Flesch Reading Ease score and word count to gate LLM processing.\n",
+ "    \"\"\"\n",
+ "    # Simple gate: avoid LLM when too short or already easy\n",
+ "    if stats.word_count < min_words:\n",
+ "        return False\n",
+ "    if stats.word_count > max_words:\n",
+ "        return True  # Large documents likely need simplification; will be processed in chunks\n",
+ "    # If FRE is high (easier), skip\n",
+ "    return stats.flesch_reading_ease < fre_threshold\n",
+ "\n",
+ "\n",
+ "def preprocess_text(\n",
+ "    text: str,\n",
+ "    *,\n",
+ "    strip_html_input: bool = True,\n",
+ "    redact: bool = True,\n",
+ "    drop_noise: bool = True,\n",
+ "    dedupe_lines: bool = True,\n",
+ "    max_chunk_chars: int = 1800,\n",
+ "    chunk_overlap: int = 1,\n",
+ ") -> Dict[str, object]:\n",
+ "    # Validate chunking parameters. Note the units: max_chunk_chars is a\n",
+ "    # character budget, while chunk_overlap counts *sentences* carried over,\n",
+ "    # so the two are not directly comparable.\n",
+ "    if max_chunk_chars <= 0:\n",
+ "        raise ValueError(\"max_chunk_chars must be positive\")\n",
+ "    if chunk_overlap < 0:\n",
+ "        raise ValueError(\"chunk_overlap cannot be negative\")\n",
+ "\n",
+ "    original = text or \"\"\n",
+ "\n",
+ "    cleaned = original\n",
+ "    if strip_html_input:\n",
+ "        cleaned = strip_html(cleaned)\n",
+ "    cleaned = normalize_unicode(cleaned)\n",
+ "    cleaned = normalize_whitespace(cleaned)\n",
+ "\n",
+ "    if dedupe_lines:\n",
+ "        cleaned = dedupe_consecutive_lines(cleaned)\n",
+ "    if drop_noise:\n",
+ "        cleaned = drop_noise_lines(cleaned)\n",
+ "\n",
+ "    redacted_entities = {}\n",
+ "    if redact:\n",
+ "        cleaned, redacted_entities = redact_pii(cleaned)\n",
+ "\n",
+ "    stats = compute_stats(cleaned)\n",
+ "    sentences = split_sentences(cleaned)\n",
+ "    chunks = chunk_sentences(sentences, max_chars=max_chunk_chars, overlap=chunk_overlap)\n",
+ "\n",
+ "    return {\n",
+ "        \"text_original\": original,\n",
+ "        \"text_clean\": cleaned,\n",
+ "        \"pii\": redacted_entities,\n",
+ "        \"stats\": asdict(stats),\n",
+ "        \"sentences\": sentences,\n",
+ "        \"chunks\": chunks,\n",
+ "        \"send_to_llm\": requires_llm_simplification(stats),\n",
+ "    }\n",
+ "\n",
+ "\n",
+ "print('✅ Preprocessing utilities defined.')"
+ ]
+ },
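How the gate plays out numerically (toy text; the FRE arithmetic in the comments is the point, not the exact score):

```python
# At 20 words/sentence and 1.7 syllables/word the formula gives
# FRE = 206.835 - 1.015*20 - 84.6*1.7 ≈ 42.7, which is below the
# 60.0 threshold, so such text would be sent to the LLM.
stats = compute_stats(
    "Institutional accountability necessitates comprehensive documentation. " * 10
)
print(round(stats.flesch_reading_ease, 1), requires_llm_simplification(stats))
# Dense polysyllabic prose scores far below 60.0 -> True.
```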
"85a08773", + "metadata": {}, + "source": [ + "## 3) Demo on repo sample\n", + "This tries to load `data/samples/sample_en.txt` and runs the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1df959a1", + "metadata": {}, + "outputs": [], + "source": [ + "sample_path = Path('data/samples/sample_en.txt')\n", + "if sample_path.exists():\n", + " raw_text = sample_path.read_text(encoding='utf-8')\n", + " print(f'✅ Loaded sample: {sample_path} ({len(raw_text)} chars)')\n", + "else:\n", + " raw_text = (\n", + " 'Make sure the area is safe, especially if you plan on walking home at night. '\n", + " 'It’s a good idea to use the buddy system. '\n", + " 'Contact us at support@example.com or visit https://example.com for details.'\n", + " )\n", + " print('ℹ️ Using inline demo text (sample file not found).')\n", + "\n", + "result = preprocess_text(raw_text, max_chunk_chars=900, chunk_overlap=1)\n", + "\n", + "print('--- STATS ---')\n", + "print(json.dumps(result['stats'], indent=2))\n", + "print('--- PII ---')\n", + "print(json.dumps(result['pii'], indent=2))\n", + "print('--- SEND TO LLM? ---')\n", + "print(result['send_to_llm'])\n", + "\n", + "print('--- CLEAN (preview) ---')\n", + "print(result['text_clean'][:600])\n", + "\n", + "print('--- CHUNKS ---')\n", + "for i, c in enumerate(result['chunks'][:5], 1):\n", + " print(f'[{i}] {len(c)} chars: {c[:160]}...')" + ] + }, + { + "cell_type": "markdown", + "id": "c933d811", + "metadata": {}, + "source": [ + "## 4) How to use with your LLM notebook\n", + "In your model notebook, call `preprocess_text(text)` first. Then send `result['chunks']` to the LLM one chunk at a time (or merge a few) depending on your context window.\n", + "\n", + "Recommended pattern:\n", + "- if `send_to_llm == False`: skip simplification or apply minimal rule-based changes\n", + "- else: send each chunk to the LLM and then join results" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}