langchain-ai · eyurtsev · Feb 3, 2025 · Feb 3, 2025 · Feb 3, 2025 · Feb 3, 2025
diff --git a/docs/_scripts/notebook_convert.py b/docs/_scripts/notebook_convert.py
@@ -1,30 +1,54 @@
+import argparse
 import os
 import re
 from pathlib import Path
+from typing import Literal, Optional
 
 import nbformat
 from nbconvert.exporters import MarkdownExporter
 from nbconvert.preprocessors import Preprocessor
 
 
 class EscapePreprocessor(Preprocessor):
+    def __init__(self, rewrite_links: bool = True, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.rewrite_links = rewrite_links
+
     def preprocess_cell(self, cell, resources, cell_index):
         if cell.cell_type == "markdown":
-            # rewrite markdown links to html links (excluding image links)
-            cell.source = re.sub(
-                r"(?<!!)\[([^\]]*)\]\((?![^\)]*//)([^)]*)(?:\.ipynb)?\)",
-                r'<a href="\2">\1</a>',
-                cell.source,
-            )
+            if self.rewrite_links:
+                # We'll need to adjust the logic for this to keep markdown format
+                # but link to markdown files rather than ipynb files.
+                cell.source = re.sub(
+                    r"(?<!!)\[([^\]]*)\]\((?![^\)]*//)([^)]*)(?:\.ipynb)?\)",
+                    r'<a href="\2">\1</a>',
+                    cell.source,
+                )
+            else:
+                # Keep format but replace the .ipynb extension with .md
+                cell.source = re.sub(
+                    r"(?<!!)\[([^\]]*)\]\((?![^\)]*//)([^)]*)(?:\.ipynb)?\)",
+                    r"[\1](\2.md)",
+                    cell.source,
+                )
+
             # Fix image paths in <img> tags
             cell.source = re.sub(
                 r'<img\s+src="\.?/img/([^"]+)"', r'<img src="../img/\1"', cell.source
             )
 
         elif cell.cell_type == "code":
+            # Determine if the cell has bash or cell magic
+            if cell.source.startswith("%") or cell.source.startswith("!"):
+                # update metadata to denote that it's not a python cell
+                cell.metadata["language_info"] = {"name": "unknown"}
+
             # Remove noqa comments
-            cell.source = re.sub(r'#\s*noqa.*$', '', cell.source, flags=re.MULTILINE)
+            cell.source = re.sub(r"#\s*noqa.*$", "", cell.source, flags=re.MULTILINE)
             # escape ``` in code
+            # This is needed because the markdown exporter will wrap code blocks in
+            # triple backticks, which will break the markdown output if the code block
+            # contains triple backticks.
             cell.source = cell.source.replace("```", r"\`\`\`")
             # escape ``` in output
             if "outputs" in cell:
@@ -58,7 +82,7 @@ def preprocess_cell(self, cell, resources, cell_index):
 
 class ExtractAttachmentsPreprocessor(Preprocessor):
     """
-    Extracts all of the outputs from the notebook file.  The extracted
+    Extracts all the outputs from the notebook file.  The extracted
     outputs are returned in the 'resources' dictionary.
     """
 
@@ -82,7 +106,7 @@ def preprocess_cell(self, cell, resources, cell_index):
         if not isinstance(resources["outputs"], dict):
             resources["outputs"] = {}
 
-        # Loop through all of the attachments in the cell
+        # Loop through all the attachments in the cell
         for name, attach in cell.get("attachments", {}).items():
             for mime, data in attach.items():
                 if mime not in {
@@ -114,12 +138,65 @@ def preprocess_cell(self, cell, resources, cell_index):
     ],
 )
 
+md_executable = MarkdownExporter(
+    preprocessors=[
+        ExtractAttachmentsPreprocessor,
+        EscapePreprocessor(rewrite_links=False),
+    ],
+    template_name="md_executable",
+    extra_template_basedirs=[
+        os.path.join(os.path.dirname(__file__), "notebook_convert_templates")
+    ],
+)
+
 
 def convert_notebook(
     notebook_path: Path,
-) -> Path:
+    mode: Literal["markdown", "exec"] = "markdown",
+) -> str:
     with open(notebook_path) as f:
         nb = nbformat.read(f, as_version=4)
 
-    body, _ = exporter.from_notebook_node(nb)
+    nb.metadata.mode = mode
+    if mode == "markdown":
+        body, _ = exporter.from_notebook_node(nb)
+    else:
+        body, _ = md_executable.from_notebook_node(nb)
     return body
+
+
+HERE = Path(__file__).parent
+DOCS = HERE.parent / "docs"
+
+
+# Convert notebooks to markdown
+def _convert_notebooks(
+    *, output_dir: Optional[Path] = None, replace: bool = False
+) -> None:
+    """Converting notebooks."""
+    if not output_dir and not replace:
+        raise ValueError("Either --output_dir or --replace must be specified")
+
+    output_dir_path = DOCS if replace else Path(output_dir)
+    for notebook in DOCS.rglob("*.ipynb"):
+        markdown = convert_notebook(notebook, mode="exec")
+        markdown_path = output_dir_path / notebook.relative_to(DOCS).with_suffix(".md")
+        markdown_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(markdown_path, "w") as f:
+            f.write(markdown)
+        if replace:
+            notebook.unlink(missing_ok=False)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert notebooks to markdown")
+    parser.add_argument(
+        "--output_dir", default=None, help="Directory to output markdown files",
+    )
+    parser.add_argument(
+        "--replace",
+        action="store_true",
+        help="Replace original notebooks with markdown files",
+    )
+    args = parser.parse_args()
+    _convert_notebooks(replace=args.replace, output_dir=args.output_dir)
diff --git a/docs/_scripts/notebook_convert_templates/md_executable/conf.json b/docs/_scripts/notebook_convert_templates/md_executable/conf.json
@@ -0,0 +1,5 @@
+{
+  "mimetypes": {
+    "text/markdown": true
+  }
+}
diff --git a/docs/_scripts/notebook_convert_templates/md_executable/index.md.j2 b/docs/_scripts/notebook_convert_templates/md_executable/index.md.j2
@@ -0,0 +1,42 @@
+{#https://github.com/rdbisme/nbconvert/blob/master/share/jupyter/nbconvert/templates/markdown/index.md.j2#}
+{% extends 'markdown/index.md.j2' %}
+
+{% block input %}
+
+```
+{%- if 'magics_language' in cell.metadata  -%}
+    {{ cell.metadata.magics_language}}
+{%- elif 'name' in cell.metadata.get('language_info', {}) -%}
+    {%- if cell.metadata['language_info']['name'] == "python" -%}
+        {{ cell.metadata.language_info.name }} exec="1" source="below" result="ini"
+    {%- endif -%}
+{%- elif 'name' in nb.metadata.get('language_info', {}) -%}
+    {{ nb.metadata.language_info.name }} exec="1" source="below" result="ini"
+{%- endif %}
+{{ cell.source}}
+```
+
+{% endblock input %}
+
+{%- block traceback_line -%}
+{%- endblock traceback_line -%}
+
+{%- block stream -%}
+{%- endblock stream -%}
+
+{%- block data_text scoped -%}
+{%- endblock data_text -%}
+
+{%- block data_html scoped -%}
+```html
+{{ output.data['text/html'] | safe }} 
+```
+{%- endblock data_html -%}
+
+{%- block data_jpg scoped -%}
+![](data:image/jpg;base64,{{ output.data['image/jpeg'] }})
+{%- endblock data_jpg -%}
+
+{%- block data_png scoped -%}
+![](data:image/png;base64,{{ output.data['image/png'] }})
+{%- endblock data_png -%}
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -56,6 +56,7 @@ plugins:
   - search:
       separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])'
   - autorefs
+  - markdown-exec
   - mkdocstrings:
       handlers:
         python:

diff --git a/docs/tests/__init__.py b/docs/tests/__init__.py
diff --git a/docs/tests/test_notebook_convert.py b/docs/tests/test_notebook_convert.py
@@ -0,0 +1,46 @@
+import nbformat
+
+from _scripts.notebook_convert import exporter
+
+
+def _remove_consecutive_new_lines(s) -> str:
+    """Remove consecutive new lines from a string."""
+    return "\n".join([line for line in s.split("\n") if line.strip()])
+
+
+def test_convert_notebook():
+    # Test the convert_notebook function
+    # Create a new, minimal notebook programmatically
+    nb = nbformat.v4.new_notebook()
+    nb.metadata.kernelspec = {
+        "name": "python3",
+        "language": "python",
+        "display_name": "Python 3",
+    }
+    nb.metadata.language_info = {
+        "name": "python",
+        "mimetype": "text/x-python",
+        "codemirror_mode": {
+            "name": "ipython",
+            "version": 3,
+        },
+    }
+
+    # Add a markdown cell with a link to an .ipynb file
+    md_cell_source = "This is a [link](example_notebook.ipynb) in markdown."
+    nb.cells.append(nbformat.v4.new_markdown_cell(md_cell_source))
+
+    # Add a code cell with a noqa comment
+    code_cell_source = "print('hello')"
+    nb.cells.append(nbformat.v4.new_code_cell(code_cell_source))
+    nb.metadata.mode = "exec"
+
+    body, _ = exporter.from_notebook_node(nb)
+    assert (
+        _remove_consecutive_new_lines(body)
+        == """\
+This is a [link](example_notebook.ipynb) in markdown.
+```python exec="1" source="below" result="ini"
+print('hello')
+```"""
+    )