From fe35848490082f2ae0f48e3313560bfe3899458b Mon Sep 17 00:00:00 2001 From: Hussain Nagaria Date: Wed, 27 May 2026 18:20:59 +0530 Subject: [PATCH 1/3] fix: render image paths containing parentheses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The custom marked.js image tokenizer captured URLs with `[^)"]+`, which stops at the first `)` and breaks Frappe uploads named like `/files/image (24).png` (URL-encoded as `/files/image%20(24).png`). The inner `)` was treated as the markdown closer, leaving `.png)` as stray text and the image broken. Allow one level of balanced parens in the URL — matches CommonMark behavior that marked.js had before this custom tokenizer overrode it. Co-Authored-By: Claude Opus 4.7 (1M context) --- frontend/src/components/tiptap-extensions/image-extension.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/frontend/src/components/tiptap-extensions/image-extension.js b/frontend/src/components/tiptap-extensions/image-extension.js index f857dd6f..1b9146d6 100644 --- a/frontend/src/components/tiptap-extensions/image-extension.js +++ b/frontend/src/components/tiptap-extensions/image-extension.js @@ -31,7 +31,10 @@ const imageCaptionTokenizer = { tokenize(src, tokens, lexer) { // Match: ![alt](src) or ![alt](src "title") optionally followed by \n*caption* - const imagePattern = /^!\[([^\]]*)\]\(([^)"]+)(?:\s+"([^"]*)")?\)/; + // URL allows one level of balanced parens so Frappe filenames like + // `/files/image (24).png` survive (otherwise the inner `)` closes the markdown). + const imagePattern = + /^!\[([^\]]*)\]\(((?:[^()"\s]|\([^()"]*\))+)(?:\s+"([^"]*)")?\)/; const captionPattern = /^\n\*([^*]+)\*/; const imageMatch = imagePattern.exec(src); From 3fd4168862ff25d5c07b6f91920caa4750810b79 Mon Sep 17 00:00:00 2001 From: Hussain Nagaria Date: Wed, 27 May 2026 18:29:41 +0530 Subject: [PATCH 2/3] fix: render image paths containing parentheses (public renderer) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same paren bug as the previous commit, but in the server-side Mistune renderer that produces the public page HTML. Mistune 3.2's image parser stops at the first `)`, so `![](/files/image (14).png)` rendered as `.png)` — broken image plus stray text. Every image on the Frappe Helpdesk docs hit this. - Widen `IMAGE_PATTERN` to allow one level of balanced parens (and the spaces that Frappe URLs already needed) so the pre-encoder sees the whole URL. - Pre-encode `(` and `)` to `%28`/`%29` in image URLs alongside the existing space → `%20` step, since Mistune can't be coaxed into handling literal parens itself. - Rename `_encode_image_url_spaces` → `_encode_image_url_unsafe_chars` to match the broader job. Adds `TestImageUrlParensEncoding` covering literal parens, the encoded-space + literal-paren form Frappe actually emits, title and inline-paragraph variants, and multiple images per document. Co-Authored-By: Claude Opus 4.7 (1M context) --- wiki/wiki/markdown.py | 49 +++++++++++++++++--------------------- wiki/wiki/test_markdown.py | 44 ++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 27 deletions(-) diff --git a/wiki/wiki/markdown.py b/wiki/wiki/markdown.py index 7b9a38a0..eae5d67f 100644 --- a/wiki/wiki/markdown.py +++ b/wiki/wiki/markdown.py @@ -151,9 +151,12 @@ def _replace_callout_placeholders(html, callouts, placeholder_prefix, md_instanc # Pattern to match markdown image syntax: ![alt](url) or ![alt](url "title") -# Captures: alt text, URL, and optional title +# Captures: alt text, URL, and optional title. +# URL allows one level of balanced parens so Frappe uploads named like +# `/files/image (14).png` are matched whole (otherwise the inner `)` closes +# the markdown and the rest of the filename leaks out as plain text). IMAGE_PATTERN = re.compile( - r'!\[([^\]]*)\]\(([^)"\s]+(?:\s[^)]*)?)\)', + r'!\[([^\]]*)\]\(((?:[^()"]|\([^()"]*\))+?)(?:\s+"([^"]*)")?\)', ) VIDEO_EXTENSIONS = ( @@ -287,40 +290,31 @@ def replace_span(match: re.Match) -> str: return "\n".join(lines) -def _encode_image_url_spaces(content: str) -> str: +def _encode_image_url_unsafe_chars(content: str) -> str: """ - Pre-process markdown to URL-encode spaces in image URLs. + Pre-process markdown to URL-encode characters in image URLs that Mistune + mishandles. - Mistune (unlike markdown2) doesn't handle spaces in URLs, so we need to - encode them before parsing. This function finds all image syntax and - encodes spaces in the URL portion. + Mistune (unlike markdown2) doesn't allow spaces in URLs, and its image + parser stops at the first `)` even when parens are balanced — so a Frappe + upload like `/files/image (14).png` ends up with src=`/files/image%20(14` + and `.png)` leaking out as text. We pre-encode literal `(`, `)`, and ` ` + so Mistune sees a clean URL. Args: content: Markdown string Returns: - Markdown string with spaces in image URLs encoded as %20 + Markdown string with unsafe chars in image URLs percent-encoded """ def encode_url(match): alt_text = match.group(1) - url_part = match.group(2) - - # Split URL and optional title (title is in quotes after a space) - # e.g., '/path/to/image.png "Image Title"' - title_match = re.match(r'^([^"]+?)(?:\s+"([^"]*)")?$', url_part) - if title_match: - url = title_match.group(1).strip() - title = title_match.group(2) - else: - url = url_part - title = None - - # Only encode spaces, preserve other characters - # quote() with safe='' would encode everything, but we only want spaces - encoded_url = url.replace(" ", "%20") - - # Reconstruct the image syntax + url = match.group(2).strip() + title = match.group(3) + + encoded_url = url.replace("(", "%28").replace(")", "%29").replace(" ", "%20") + if title: return f'![{alt_text}]({encoded_url} "{title}")' return f"![{alt_text}]({encoded_url})" @@ -428,8 +422,9 @@ def render_markdown_with_toc(content: str) -> tuple[str, list]: ], ) - # Step 1: URL-encode spaces in image URLs (mistune doesn't handle them) - processed_content = _encode_image_url_spaces(content) + # Step 1: URL-encode unsafe chars (spaces, parens) in image URLs that + # Mistune would otherwise mishandle. + processed_content = _encode_image_url_unsafe_chars(content) # Step 1b: Escape `|` inside inline-code spans on table rows so Mistune's # table plugin doesn't miscount columns and drop the table. diff --git a/wiki/wiki/test_markdown.py b/wiki/wiki/test_markdown.py index f8f78a87..f82f4454 100644 --- a/wiki/wiki/test_markdown.py +++ b/wiki/wiki/test_markdown.py @@ -413,6 +413,50 @@ def test_already_encoded_url_unchanged(self): self.assertNotIn("%2520", result) +class TestImageUrlParensEncoding(unittest.TestCase): + """Frappe uploads commonly produce names like `image (14).png`. Mistune's + image parser stops at the first `)`, so the URL must be pre-encoded for + the public page to render correctly.""" + + def test_image_with_literal_parens(self): + content = "![](/files/image (14).png)" + result = render_markdown(content) + self.assertIn(' Date: Wed, 27 May 2026 19:08:01 +0530 Subject: [PATCH 3/3] refactor: replace Mistune with markdown-it-py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mistune 3.2's image parser stops at the first `)` in a URL even when the parens are balanced, which is what bit us on Frappe upload paths like `/files/image (14).png`. The previous commits worked around it with a paren pre-encoder; this commit removes the root cause by switching the public-page renderer to markdown-it-py, a CommonMark- compliant parser that handles balanced parens natively. Engine swap: - `mistune>=3.0` → `markdown-it-py>=3.0`, `mdit-py-plugins>=0.4` - `WikiRenderer` (mistune.HTMLRenderer subclass) replaced with render- rule overrides on the markdown-it-py renderer: `fence`/`code_block` (rstrip phantom trailing whitespace) and `image` (video-block detection + script-tag stripping on alt/title). - Heading slug + TOC extraction moved out of the renderer's hidden state into an explicit token walk before render. - Plugins: `footnote_plugin`, `tasklists_plugin`; `table` and `strikethrough` are core in markdown-it-py. Simplifications now that parens parse natively: - `_encode_image_url_unsafe_chars` → `_encode_image_url_spaces`, shrunk to a single `.replace(" ", "%20")`. Parens flow through. - Tests in `TestImageUrlWithParens` (was `TestImageUrlParensEncoding`) now assert the natural `(14)` form rather than `%2814%29`. - `html.unescape` import and call dropped (the mistune-era TOC text was pre-rendered HTML; `renderInlineAsText` returns plain text). Stays the same — these operate on raw markdown and are engine-agnostic: - Callout preprocessor (`:::note` blocks → placeholders → HTML) - Video preprocessor (full-line image markdown with video extension) - Image-URL space encoder (CommonMark forbids unescaped whitespace in URLs; Frappe uploads have them) - Inline-code pipe sentinel for table rows. Confirmed via repro that markdown-it-py's table plugin has the same bug as Mistune here — it counts raw `|` per row and rejects the block on mismatch. All 57 tests pass (52 existing + 5 paren cases added earlier in this PR, expectations flipped to match the new parser). Co-Authored-By: Claude Opus 4.7 (1M context) --- pyproject.toml | 3 +- wiki/wiki/markdown.py | 259 ++++++++++++++++--------------------- wiki/wiki/test_markdown.py | 33 +++-- 3 files changed, 125 insertions(+), 170 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6dfd9940..2a4f55a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,8 @@ requires-python = ">=3.14" readme = "README.md" dynamic = ["version"] dependencies = [ - "mistune>=3.0", + "markdown-it-py>=3.0", + "mdit-py-plugins>=0.4", ] [project.urls] diff --git a/wiki/wiki/markdown.py b/wiki/wiki/markdown.py index eae5d67f..15869b54 100644 --- a/wiki/wiki/markdown.py +++ b/wiki/wiki/markdown.py @@ -1,10 +1,7 @@ """ -Custom Markdown Renderer with Callout/Aside Support +Wiki markdown → HTML renderer (markdown-it-py + custom callout/aside support). -This module provides a custom markdown-to-HTML renderer using Mistune, -with support for Astro Starlight-style callouts/asides. - -Syntax: +Callout syntax: :::note Content here ::: @@ -17,10 +14,11 @@ """ import re -from html import unescape -from urllib.parse import quote -import mistune +from markdown_it import MarkdownIt +from markdown_it.common.utils import escapeHtml +from mdit_py_plugins.footnote import footnote_plugin +from mdit_py_plugins.tasklists import tasklists_plugin def slugify(text: str) -> str: @@ -103,7 +101,6 @@ def _process_callouts_with_placeholders(content): and a list of callout data to be processed later. """ callouts = [] - # Use HTML comment-like placeholder that won't be parsed as markdown placeholder_prefix = "WIKICALLOUTPLACEHOLDER" def replacer(match): @@ -123,7 +120,6 @@ def replacer(match): "content": inner_content.strip(), } ) - # Return placeholder - use format that won't be parsed as markdown return f"\n\n{placeholder_prefix}{idx}END\n\n" # Process callouts (may be nested, so we process iteratively) @@ -135,12 +131,11 @@ def replacer(match): return content, callouts, placeholder_prefix -def _replace_callout_placeholders(html, callouts, placeholder_prefix, md_instance): +def _replace_callout_placeholders(html, callouts, placeholder_prefix, render_inner): """Replace callout placeholders with actual HTML after markdown rendering.""" for idx, callout in enumerate(callouts): placeholder = f"{placeholder_prefix}{idx}END" - # The placeholder might be wrapped in

tags, so handle both cases - inner_html = md_instance(callout["content"]) if callout["content"] else "" + inner_html = render_inner(callout["content"]) if callout["content"] else "" callout_html = _generate_callout_html(callout["type"], callout["title"], inner_html) # Replace placeholder (may be wrapped in

tags) @@ -151,10 +146,8 @@ def _replace_callout_placeholders(html, callouts, placeholder_prefix, md_instanc # Pattern to match markdown image syntax: ![alt](url) or ![alt](url "title") -# Captures: alt text, URL, and optional title. # URL allows one level of balanced parens so Frappe uploads named like -# `/files/image (14).png` are matched whole (otherwise the inner `)` closes -# the markdown and the rest of the filename leaks out as plain text). +# `/files/image (14).png` are matched whole. IMAGE_PATTERN = re.compile( r'!\[([^\]]*)\]\(((?:[^()"]|\([^()"]*\))+?)(?:\s+"([^"]*)")?\)', ) @@ -231,7 +224,6 @@ def replacer(match): "title": match.group("title") or "", } ) - # Force paragraph break around video block return f"\n\n{placeholder_prefix}{idx}END\n\n" return VIDEO_MARKDOWN_PATTERN.sub(replacer, content), videos, placeholder_prefix @@ -247,24 +239,38 @@ def _replace_video_placeholders(html: str, videos: list[dict], placeholder_prefi return html +def _encode_image_url_spaces(content: str) -> str: + """ + Pre-process markdown to URL-encode literal spaces in image URLs. + + CommonMark forbids unescaped whitespace in URLs, but Frappe uploads + routinely contain spaces (e.g. `/files/my image.png`). The matching + regex tolerates balanced parens so URLs like `/files/image (14).png` + are captured whole — the parser handles those parens natively. + """ + + def encode_url(match): + alt_text = match.group(1) + url = match.group(2).strip().replace(" ", "%20") + title = match.group(3) + + if title: + return f'![{alt_text}]({url} "{title}")' + return f"![{alt_text}]({url})" + + return IMAGE_PATTERN.sub(encode_url, content) + + # Private-use Unicode sentinel — stands in for `|` inside inline-code on table -# rows during Mistune parsing, then gets swapped back after rendering. Chosen -# from the PUA block so it cannot collide with authored markdown content. +# rows during parsing, then gets swapped back after rendering. Both Mistune and +# markdown-it-py count raw pipes per row in their table plugin and reject the +# whole block on mismatch, dropping the table to a paragraph; hiding the inner +# pipes behind a PUA sentinel keeps the column count honest. _TABLE_CODE_PIPE_SENTINEL = "" def _escape_table_inline_code_pipes(content: str) -> str: - """ - Swap `|` characters inside inline-code spans on table-row lines for a - sentinel, which is restored after Mistune renders. - - GFM-compliant parsers (marked, markdown-it) treat a backtick-delimited span - like `` `dict | list` `` as a single code token, so its `|` is not a column - separator. Mistune's table plugin instead counts raw pipes per row and, - finding a mismatch, rejects the entire block — the table collapses to a - paragraph. Hiding those pipes behind a sentinel makes the column count - match, and a post-render replace restores the `|` inside ``. - """ + """Swap `|` inside inline-code spans on table-row lines for a sentinel.""" lines = content.split("\n") in_fence = False fence_marker: str | None = None @@ -290,94 +296,41 @@ def replace_span(match: re.Match) -> str: return "\n".join(lines) -def _encode_image_url_unsafe_chars(content: str) -> str: - """ - Pre-process markdown to URL-encode characters in image URLs that Mistune - mishandles. - - Mistune (unlike markdown2) doesn't allow spaces in URLs, and its image - parser stops at the first `)` even when parens are balanced — so a Frappe - upload like `/files/image (14).png` ends up with src=`/files/image%20(14` - and `.png)` leaking out as text. We pre-encode literal `(`, `)`, and ` ` - so Mistune sees a clean URL. - - Args: - content: Markdown string - - Returns: - Markdown string with unsafe chars in image URLs percent-encoded - """ - - def encode_url(match): - alt_text = match.group(1) - url = match.group(2).strip() - title = match.group(3) - - encoded_url = url.replace("(", "%28").replace(")", "%29").replace(" ", "%20") - - if title: - return f'![{alt_text}]({encoded_url} "{title}")' - return f"![{alt_text}]({encoded_url})" - - return IMAGE_PATTERN.sub(encode_url, content) - - -class WikiRenderer(mistune.HTMLRenderer): - """Custom HTML renderer. - - Image captions use the Stack Overflow pattern: - ![alt text](image.jpg) - *caption text* - - This renders as

caption

(no blank line between). - Style with CSS: img + em { ... } - Alt text remains for accessibility, caption is separate. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._heading_slugs = {} # Track used slugs to avoid duplicates - self._headings = [] # Track headings for TOC +def _build_markdown() -> MarkdownIt: + """Build a configured markdown-it-py instance with our render overrides.""" + md = ( + MarkdownIt("commonmark", {"html": True, "linkify": False, "typographer": False}) + .enable(["table", "strikethrough"]) + .use(footnote_plugin) + .use(tasklists_plugin, enabled=True) + ) - def block_code(self, code: str, info: str | None = None) -> str: + def _render_codeblock_html(content: str, lang: str = "") -> str: # Trim trailing whitespace the author left inside the fence — spaces, # tabs, and blank lines all render as phantom empty rows in
.
-		return super().block_code(code.rstrip() + "\n", info)
+		content = content.rstrip() + "\n"
+		cls = f' class="language-{escapeHtml(lang)}"' if lang else ""
+		return f"
{escapeHtml(content)}
\n" - def heading(self, text: str, level: int, **attrs) -> str: - """Render heading with slugified ID for anchor links.""" - # Generate base slug from heading text - slug = slugify(text) + def fence_rstrip(tokens, idx, options, env): + tok = tokens[idx] + lang = next(iter((tok.info or "").split()), "") + return _render_codeblock_html(tok.content, lang) - # Handle empty slugs - if not slug: - slug = "heading" + def code_block_rstrip(tokens, idx, options, env): + return _render_codeblock_html(tokens[idx].content) - # Ensure unique slugs by appending numbers for duplicates - original_slug = slug - counter = 1 - while slug in self._heading_slugs: - slug = f"{original_slug}-{counter}" - counter += 1 + md.renderer.rules["fence"] = fence_rstrip + md.renderer.rules["code_block"] = code_block_rstrip - self._heading_slugs[slug] = True + def image_render(tokens, idx, options, env): + tok = tokens[idx] + src = tok.attrGet("src") or "" + alt = _remove_script_tags(tok.content) + title = _remove_script_tags(tok.attrGet("title") or "") - # Track h2 and h3 headings for TOC - if level in (2, 3): - self._headings.append( - {"id": slug, "text": unescape(re.sub(r"<[^>]+>", "", text)), "level": level} - ) - - return f'{text}\n' - - def image(self, text: str, url: str, title: str | None = None) -> str: - """Render video URLs as HTML5 video blocks; others as normal images.""" - src = self.safe_url(url) - alt = _remove_script_tags(text) - safe_title = _remove_script_tags(title) - - if _is_video_url(url): - title_attr = f' title="{safe_title}"' if safe_title else "" + if _is_video_url(src): + title_attr = f' title="{title}"' if title else "" data_alt_attr = f' data-alt="{alt}"' if alt else "" return ( f'
' @@ -385,15 +338,44 @@ def image(self, text: str, url: str, title: str | None = None) -> str: f'' "
" ) - s = f'{alt}" - def get_headings(self) -> list: - """Return the list of h2/h3 headings extracted during rendering.""" - return self._headings + md.renderer.rules["image"] = image_render + return md + + +def _apply_heading_slugs_and_toc(tokens, md: MarkdownIt) -> list[dict]: + """ + Walk parsed tokens, assign unique slug IDs to every heading, and collect + h2/h3 entries for the table of contents. + """ + used: set[str] = set() + headings: list[dict] = [] + + for i, tok in enumerate(tokens): + if tok.type != "heading_open": + continue + inline = tokens[i + 1] if i + 1 < len(tokens) else None + raw_text = inline.content if inline and inline.type == "inline" else "" + + slug = base = slugify(raw_text) or "heading" + counter = 1 + while slug in used: + slug = f"{base}-{counter}" + counter += 1 + used.add(slug) + tok.attrSet("id", slug) + + level = int(tok.tag[1]) # "h2" -> 2 + if level in (2, 3): + # Render the inline as plain text so TOC entries drop markdown syntax + text = md.renderer.renderInlineAsText(inline.children or [], md.options, {}) + headings.append({"id": slug, "text": text, "level": level}) + + return headings def render_markdown_with_toc(content: str) -> tuple[str, list]: @@ -409,49 +391,24 @@ def render_markdown_with_toc(content: str) -> tuple[str, list]: if not content: return "", [] - # Create a base Mistune markdown instance with custom renderer - # Note: escape=False must be passed to the renderer, not create_markdown - renderer = WikiRenderer(escape=False) - md = mistune.create_markdown( - renderer=renderer, - plugins=[ - "strikethrough", - "footnotes", - "table", - "task_lists", - ], - ) - - # Step 1: URL-encode unsafe chars (spaces, parens) in image URLs that - # Mistune would otherwise mishandle. - processed_content = _encode_image_url_unsafe_chars(content) + md = _build_markdown() - # Step 1b: Escape `|` inside inline-code spans on table rows so Mistune's - # table plugin doesn't miscount columns and drop the table. + processed_content = _encode_image_url_spaces(content) processed_content = _escape_table_inline_code_pipes(processed_content) + processed_content, callouts, callout_prefix = _process_callouts_with_placeholders(processed_content) + processed_content, videos, video_prefix = _process_videos_with_placeholders(processed_content) - # Step 2: Extract callouts and replace with placeholders - processed_content, callouts, placeholder_prefix = _process_callouts_with_placeholders(processed_content) + env: dict = {} + tokens = md.parse(processed_content, env) + headings = _apply_heading_slugs_and_toc(tokens, md) + html = md.renderer.render(tokens, md.options, env) - # Step 3: Extract video blocks and replace with placeholders - processed_content, videos, video_placeholder_prefix = _process_videos_with_placeholders(processed_content) + html = _replace_callout_placeholders(html, callouts, callout_prefix, md.render) + html = _replace_video_placeholders(html, videos, video_prefix) - # Step 4: Render markdown (placeholders may be wrapped in

tags) - html = md(processed_content) - - # Step 5: Replace callout placeholders with actual callout HTML - html = _replace_callout_placeholders(html, callouts, placeholder_prefix, md) - - # Step 6: Replace video placeholders with block video HTML - html = _replace_video_placeholders(html, videos, video_placeholder_prefix) - - # Step 7: Restore pipes that were hidden from the table parser. if _TABLE_CODE_PIPE_SENTINEL in html: html = html.replace(_TABLE_CODE_PIPE_SENTINEL, "|") - # Get the headings extracted during rendering - headings = renderer.get_headings() - return html, headings diff --git a/wiki/wiki/test_markdown.py b/wiki/wiki/test_markdown.py index f82f4454..91490dd6 100644 --- a/wiki/wiki/test_markdown.py +++ b/wiki/wiki/test_markdown.py @@ -413,31 +413,28 @@ def test_already_encoded_url_unchanged(self): self.assertNotIn("%2520", result) -class TestImageUrlParensEncoding(unittest.TestCase): - """Frappe uploads commonly produce names like `image (14).png`. Mistune's - image parser stops at the first `)`, so the URL must be pre-encoded for - the public page to render correctly.""" +class TestImageUrlWithParens(unittest.TestCase): + """Frappe uploads commonly produce names like `image (14).png`. CommonMark + allows one level of balanced parens in URLs, so the parser handles them + natively; only literal spaces still need pre-encoding.""" def test_image_with_literal_parens(self): content = "![](/files/image (14).png)" result = render_markdown(content) - self.assertIn('", result) def test_image_with_encoded_space_and_literal_parens(self): - """The form actually emitted by Frappe: space encoded, parens literal. - - This is the case from the Helpdesk docs that triggered the bug. - """ + """The form Frappe actually emits: space encoded, parens literal.""" content = "![](/files/image%20(14).png)" result = render_markdown(content) - self.assertIn('", result) def test_image_with_parens_and_alt_and_title(self): content = '![logo](/files/image (24).png "App Logo")' result = render_markdown(content) - self.assertIn('