diff --git a/frontend/src/components/tiptap-extensions/image-extension.js b/frontend/src/components/tiptap-extensions/image-extension.js index f857dd6f..1b9146d6 100644 --- a/frontend/src/components/tiptap-extensions/image-extension.js +++ b/frontend/src/components/tiptap-extensions/image-extension.js @@ -31,7 +31,10 @@ const imageCaptionTokenizer = { tokenize(src, tokens, lexer) { // Match:  or  optionally followed by \n*caption* - const imagePattern = /^!\[([^\]]*)\]\(([^)"]+)(?:\s+"([^"]*)")?\)/; + // URL allows one level of balanced parens so Frappe filenames like + // `/files/image (24).png` survive (otherwise the inner `)` closes the markdown). + const imagePattern = + /^!\[([^\]]*)\]\(((?:[^()"\s]|\([^()"]*\))+)(?:\s+"([^"]*)")?\)/; const captionPattern = /^\n\*([^*]+)\*/; const imageMatch = imagePattern.exec(src); diff --git a/pyproject.toml b/pyproject.toml index 6dfd9940..2a4f55a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,8 @@ requires-python = ">=3.14" readme = "README.md" dynamic = ["version"] dependencies = [ - "mistune>=3.0", + "markdown-it-py>=3.0", + "mdit-py-plugins>=0.4", ] [project.urls] diff --git a/wiki/wiki/markdown.py b/wiki/wiki/markdown.py index 7b9a38a0..15869b54 100644 --- a/wiki/wiki/markdown.py +++ b/wiki/wiki/markdown.py @@ -1,10 +1,7 @@ """ -Custom Markdown Renderer with Callout/Aside Support +Wiki markdown → HTML renderer (markdown-it-py + custom callout/aside support). -This module provides a custom markdown-to-HTML renderer using Mistune, -with support for Astro Starlight-style callouts/asides. 
- -Syntax: +Callout syntax: :::note Content here ::: @@ -17,10 +14,11 @@ """ import re -from html import unescape -from urllib.parse import quote -import mistune +from markdown_it import MarkdownIt +from markdown_it.common.utils import escapeHtml +from mdit_py_plugins.footnote import footnote_plugin +from mdit_py_plugins.tasklists import tasklists_plugin def slugify(text: str) -> str: @@ -103,7 +101,6 @@ def _process_callouts_with_placeholders(content): and a list of callout data to be processed later. """ callouts = [] - # Use HTML comment-like placeholder that won't be parsed as markdown placeholder_prefix = "WIKICALLOUTPLACEHOLDER" def replacer(match): @@ -123,7 +120,6 @@ def replacer(match): "content": inner_content.strip(), } ) - # Return placeholder - use format that won't be parsed as markdown return f"\n\n{placeholder_prefix}{idx}END\n\n" # Process callouts (may be nested, so we process iteratively) @@ -135,12 +131,11 @@ def replacer(match): return content, callouts, placeholder_prefix -def _replace_callout_placeholders(html, callouts, placeholder_prefix, md_instance): +def _replace_callout_placeholders(html, callouts, placeholder_prefix, render_inner): """Replace callout placeholders with actual HTML after markdown rendering.""" for idx, callout in enumerate(callouts): placeholder = f"{placeholder_prefix}{idx}END" - # The placeholder might be wrapped in
tags, so handle both cases - inner_html = md_instance(callout["content"]) if callout["content"] else "" + inner_html = render_inner(callout["content"]) if callout["content"] else "" callout_html = _generate_callout_html(callout["type"], callout["title"], inner_html) # Replace placeholder (may be wrapped in
tags)
@@ -151,9 +146,10 @@ def _replace_callout_placeholders(html, callouts, placeholder_prefix, md_instanc
# Pattern to match markdown image syntax: ![alt](url) or ![alt](url "title")
-# Captures: alt text, URL, and optional title
+# URL allows one level of balanced parens so Frappe uploads named like
+# `/files/image (14).png` are matched whole.
IMAGE_PATTERN = re.compile(
- r'!\[([^\]]*)\]\(([^)"\s]+(?:\s[^)]*)?)\)',
+ r'!\[([^\]]*)\]\(((?:[^()"]|\([^()"]*\))+?)(?:\s+"([^"]*)")?\)',
)
VIDEO_EXTENSIONS = (
@@ -228,7 +224,6 @@ def replacer(match):
"title": match.group("title") or "",
}
)
- # Force paragraph break around video block
return f"\n\n{placeholder_prefix}{idx}END\n\n"
return VIDEO_MARKDOWN_PATTERN.sub(replacer, content), videos, placeholder_prefix
@@ -244,24 +239,38 @@ def _replace_video_placeholders(html: str, videos: list[dict], placeholder_prefi
return html
+def _encode_image_url_spaces(content: str) -> str:
+    """
+    Pre-process markdown to URL-encode literal spaces in image URLs.
+
+    CommonMark forbids unescaped whitespace in URLs, but Frappe uploads
+    routinely contain spaces (e.g. `/files/my image.png`). IMAGE_PATTERN
+    tolerates balanced parens, so `/files/image (14).png` is captured whole.
+    Returns `content` with each image rewritten; alt text and title are kept.
+    """
+
+    def encode_url(match):  # rebuild one image with its URL spaces encoded
+        alt_text = match.group(1)
+        url = match.group(2).strip().replace(" ", "%20")
+        title = match.group(3)
+
+        if title:
+            return f'![{alt_text}]({url} "{title}")'
+        return f"![{alt_text}]({url})"
+
+    return IMAGE_PATTERN.sub(encode_url, content)
+
+
# Private-use Unicode sentinel — stands in for `|` inside inline-code on table
-# rows during Mistune parsing, then gets swapped back after rendering. Chosen
-# from the PUA block so it cannot collide with authored markdown content.
+# rows during parsing, then gets swapped back after rendering. Both Mistune and
+# markdown-it-py count raw pipes per row in their table plugin and reject the
+# whole block on mismatch, dropping the table to a paragraph; hiding the inner
+# pipes behind a PUA sentinel keeps the column count honest.
_TABLE_CODE_PIPE_SENTINEL = ""
def _escape_table_inline_code_pipes(content: str) -> str:
- """
- Swap `|` characters inside inline-code spans on table-row lines for a
- sentinel, which is restored after Mistune renders.
-
- GFM-compliant parsers (marked, markdown-it) treat a backtick-delimited span
- like `` `dict | list` `` as a single code token, so its `|` is not a column
- separator. Mistune's table plugin instead counts raw pipes per row and,
- finding a mismatch, rejects the entire block — the table collapses to a
- paragraph. Hiding those pipes behind a sentinel makes the column count
- match, and a post-render replace restores the `|` inside ` tags)
- html = md(processed_content)
-
- # Step 5: Replace callout placeholders with actual callout HTML
- html = _replace_callout_placeholders(html, callouts, placeholder_prefix, md)
-
- # Step 6: Replace video placeholders with block video HTML
- html = _replace_video_placeholders(html, videos, video_placeholder_prefix)
-
- # Step 7: Restore pipes that were hidden from the table parser.
if _TABLE_CODE_PIPE_SENTINEL in html:
html = html.replace(_TABLE_CODE_PIPE_SENTINEL, "|")
- # Get the headings extracted during rendering
- headings = renderer.get_headings()
-
return html, headings
diff --git a/wiki/wiki/test_markdown.py b/wiki/wiki/test_markdown.py
index f8f78a87..91490dd6 100644
--- a/wiki/wiki/test_markdown.py
+++ b/wiki/wiki/test_markdown.py
@@ -413,6 +413,47 @@ def test_already_encoded_url_unchanged(self):
self.assertNotIn("%2520", result)
+class TestImageUrlWithParens(unittest.TestCase):
+ """Frappe uploads commonly produce names like `image (14).png`. CommonMark
+    allows balanced parens in link destinations, so the parser handles them
+ natively; only literal spaces still need pre-encoding."""
+
+ def test_image_with_literal_parens(self):
+ content = ".png)"
+ result = render_markdown(content)
+ self.assertIn('`.
- """
+ """Swap `|` inside inline-code spans on table-row lines for a sentinel."""
lines = content.split("\n")
in_fence = False
fence_marker: str | None = None
@@ -287,103 +296,41 @@ def replace_span(match: re.Match) -> str:
return "\n".join(lines)
-def _encode_image_url_spaces(content: str) -> str:
- """
- Pre-process markdown to URL-encode spaces in image URLs.
-
- Mistune (unlike markdown2) doesn't handle spaces in URLs, so we need to
- encode them before parsing. This function finds all image syntax and
- encodes spaces in the URL portion.
-
- Args:
- content: Markdown string
-
- Returns:
- Markdown string with spaces in image URLs encoded as %20
- """
-
- def encode_url(match):
- alt_text = match.group(1)
- url_part = match.group(2)
-
- # Split URL and optional title (title is in quotes after a space)
- # e.g., '/path/to/image.png "Image Title"'
- title_match = re.match(r'^([^"]+?)(?:\s+"([^"]*)")?$', url_part)
- if title_match:
- url = title_match.group(1).strip()
- title = title_match.group(2)
- else:
- url = url_part
- title = None
-
- # Only encode spaces, preserve other characters
- # quote() with safe='' would encode everything, but we only want spaces
- encoded_url = url.replace(" ", "%20")
-
- # Reconstruct the image syntax
- if title:
- return f''
- return f""
-
- return IMAGE_PATTERN.sub(encode_url, content)
-
-
-class WikiRenderer(mistune.HTMLRenderer):
- """Custom HTML renderer.
-
- Image captions use the Stack Overflow pattern:
- 
- *caption text*
-
- This renders as caption
.
- return super().block_code(code.rstrip() + "\n", info)
-
- def heading(self, text: str, level: int, **attrs) -> str:
- """Render heading with slugified ID for anchor links."""
- # Generate base slug from heading text
- slug = slugify(text)
-
- # Handle empty slugs
- if not slug:
- slug = "heading"
-
- # Ensure unique slugs by appending numbers for duplicates
- original_slug = slug
- counter = 1
- while slug in self._heading_slugs:
- slug = f"{original_slug}-{counter}"
- counter += 1
+ content = content.rstrip() + "\n"
+ cls = f' class="language-{escapeHtml(lang)}"' if lang else ""
+ return f"
\n"
- self._heading_slugs[slug] = True
+ def fence_rstrip(tokens, idx, options, env):
+ tok = tokens[idx]
+ lang = next(iter((tok.info or "").split()), "")
+ return _render_codeblock_html(tok.content, lang)
- # Track h2 and h3 headings for TOC
- if level in (2, 3):
- self._headings.append(
- {"id": slug, "text": unescape(re.sub(r"<[^>]+>", "", text)), "level": level}
- )
+ def code_block_rstrip(tokens, idx, options, env):
+ return _render_codeblock_html(tokens[idx].content)
- return f'{escapeHtml(content)}"
- def get_headings(self) -> list:
- """Return the list of h2/h3 headings extracted during rendering."""
- return self._headings
+ md.renderer.rules["image"] = image_render
+ return md
+
+
+def _apply_heading_slugs_and_toc(tokens, md: MarkdownIt) -> list[dict]:
+    """
+    Walk parsed tokens, assign unique slug IDs to every heading, and collect
+    h2/h3 entries for the table of contents.
+
+    Args:
+        tokens: Token list to scan; each `heading_open` token gets an `id`
+            attribute set in place.
+        md: MarkdownIt instance whose renderer and options are used to render
+            heading text as plain text.
+
+    Returns:
+        One `{"id", "text", "level"}` dict per h2/h3 heading, in document order.
+    """
+    used: set[str] = set()
+    headings: list[dict] = []
+
+    for i, tok in enumerate(tokens):
+        if tok.type != "heading_open":
+            continue
+        # The heading text is read from the token right after heading_open;
+        # treat anything else as an empty heading instead of dereferencing it.
+        following = tokens[i + 1] if i + 1 < len(tokens) else None
+        inline = following if following is not None and following.type == "inline" else None
+        raw_text = inline.content if inline is not None else ""
+
+        # Fall back to "heading" for empty slugs, then suffix -1, -2, ... on collision.
+        slug = base = slugify(raw_text) or "heading"
+        counter = 1
+        while slug in used:
+            slug = f"{base}-{counter}"
+            counter += 1
+        used.add(slug)
+        tok.attrSet("id", slug)
+
+        level = int(tok.tag[1])  # "h2" -> 2
+        if level in (2, 3):
+            # Render the inline as plain text so TOC entries drop markdown syntax
+            children = (inline.children or []) if inline is not None else []
+            text = md.renderer.renderInlineAsText(children, md.options, {})
+            headings.append({"id": slug, "text": text, "level": level})
+
+    return headings
def render_markdown_with_toc(content: str) -> tuple[str, list]:
@@ -415,48 +391,24 @@ def render_markdown_with_toc(content: str) -> tuple[str, list]:
if not content:
return "", []
- # Create a base Mistune markdown instance with custom renderer
- # Note: escape=False must be passed to the renderer, not create_markdown
- renderer = WikiRenderer(escape=False)
- md = mistune.create_markdown(
- renderer=renderer,
- plugins=[
- "strikethrough",
- "footnotes",
- "table",
- "task_lists",
- ],
- )
+ md = _build_markdown()
- # Step 1: URL-encode spaces in image URLs (mistune doesn't handle them)
processed_content = _encode_image_url_spaces(content)
-
- # Step 1b: Escape `|` inside inline-code spans on table rows so Mistune's
- # table plugin doesn't miscount columns and drop the table.
processed_content = _escape_table_inline_code_pipes(processed_content)
+ processed_content, callouts, callout_prefix = _process_callouts_with_placeholders(processed_content)
+ processed_content, videos, video_prefix = _process_videos_with_placeholders(processed_content)
- # Step 2: Extract callouts and replace with placeholders
- processed_content, callouts, placeholder_prefix = _process_callouts_with_placeholders(processed_content)
+ env: dict = {}
+ tokens = md.parse(processed_content, env)
+ headings = _apply_heading_slugs_and_toc(tokens, md)
+ html = md.renderer.render(tokens, md.options, env)
- # Step 3: Extract video blocks and replace with placeholders
- processed_content, videos, video_placeholder_prefix = _process_videos_with_placeholders(processed_content)
+ html = _replace_callout_placeholders(html, callouts, callout_prefix, md.render)
+ html = _replace_video_placeholders(html, videos, video_prefix)
- # Step 4: Render markdown (placeholders may be wrapped in
", result)
+
+ def test_image_with_encoded_space_and_literal_parens(self):
+ """The form Frappe actually emits: space encoded, parens literal."""
+ content = ".png)"
+ result = render_markdown(content)
+ self.assertIn('
", result)
+
+ def test_image_with_parens_and_alt_and_title(self):
+ content = '.png "App Logo")'
+ result = render_markdown(content)
+ self.assertIn('.png)