|
161 | 161 | " print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")"
|
162 | 162 | ]
|
163 | 163 | },
|
| 164 | + { |
| 165 | + "cell_type": "markdown", |
| 166 | + "id": "40710019-7ec9-414e-ad72-1ba672cf5fc2", |
| 167 | + "metadata": {}, |
| 168 | + "source": [ |
| 169 | + "## Post-Conversion" |
| 170 | + ] |
| 171 | + }, |
| 172 | + { |
| 173 | + "cell_type": "markdown", |
| 174 | + "id": "2572e2d0-94dc-4ca0-b032-3978af26c9c9", |
| 175 | + "metadata": {}, |
| 176 | + "source": [ |
| 177 | + "This step walks you through analyzing Docling-converted documents for problematic table structures, specifically merged table cells, using core functions from the Illuminator codebase." |
| 178 | + ] |
| 179 | + }, |
| 180 | + { |
| 181 | + "cell_type": "code", |
| 182 | + "execution_count": 3, |
| 183 | + "id": "09e07e35-befb-4ed5-9fe4-41544f88d943", |
| 184 | + "metadata": {}, |
| 185 | + "outputs": [ |
| 186 | + { |
| 187 | + "ename": "ModuleNotFoundError", |
| 188 | + "evalue": "No module named 'log_utils'", |
| 189 | + "output_type": "error", |
| 190 | + "traceback": [ |
| 191 | + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", |
| 192 | + "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", |
| 193 | + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mutils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01milluminator\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01manalysis\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m analyze_docling_tables\n\u001b[32m 3\u001b[39m analyze_docling_tables({Path(json_output_path).resolve()})\n", |
| 194 | + "\u001b[36mFile \u001b[39m\u001b[32m~/dev/FNF/examples/notebooks/instructlab-knowledge/utils/illuminator/analysis.py:4\u001b[39m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mdocling\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdatamodel\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdocument\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DoclingDocument\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m List, Tuple, Dict, Any, Union, Set\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mlog_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m logger\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[34;01mos\u001b[39;00m\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[34mcell_is_merged\u001b[39m(cell) -> \u001b[38;5;28mbool\u001b[39m:\n", |
| 195 | + "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'log_utils'" |
| 196 | + ] |
| 197 | + } |
| 198 | + ], |
| 199 | + "source": [ |
| 200 | + "from utils.illuminator.analysis import analyze_docling_tables\n", |
| 201 | + "\n", |
| 202 | + "analyze_docling_tables({Path(json_output_path).resolve()})" |
| 203 | + ] |
| 204 | + }, |
| 205 | + { |
| 206 | + "cell_type": "markdown", |
| 207 | + "id": "eea0876e-ac55-45fc-93e8-3e646a6c3104", |
| 208 | + "metadata": {}, |
| 209 | + "source": [ |
| 210 | + "\n", |
| 211 | + "Use the output of this post-conversion step to decide whether to exclude the content from chunking entirely or to manually edit it before proceeding with chunking.\n" |
| 212 | + ] |
| 213 | + }, |
164 | 214 | {
|
165 | 215 | "cell_type": "markdown",
|
166 | 216 | "id": "cafad55e-a4c0-4d6e-9da0-49519fa9bf74",
|
|
674 | 724 | "name": "python",
|
675 | 725 | "nbconvert_exporter": "python",
|
676 | 726 | "pygments_lexer": "ipython3",
|
677 |
| - "version": "3.11.9" |
| 727 | + "version": "3.13.2" |
678 | 728 | }
|
679 | 729 | },
|
680 | 730 | "nbformat": 4,
|
|
0 commit comments