Improve codeblock tokenization inside other block tokens (#53)

* Optimize codeblock lexer inside other block tokens
* Update test snapshots
* Lint fixes
kiranparajuli589 · Oct 23, 2023 · 154dacc · 154dacc
1 parent 9617e19
commit 154dacc
Show file tree

Hide file tree

Showing 16 changed files with 205 additions and 2,968 deletions.
diff --git a/demo/src/styles/main.scss b/demo/src/styles/main.scss
@@ -91,6 +91,9 @@ main {
 						*:last-child {
 							margin-bottom: 0;
 						}
+						pre {
+							margin-top: .5rem;
+						}
 					}
 					input {
 						margin-right: .5rem;

diff --git a/lib/regex/index.js b/lib/regex/index.js
@@ -13,7 +13,7 @@ export const REGEX = {
 		UNDERLINE_1: /^\s*=+$/g,
 		UNDERLINE_2: /^\s*-+$/g
 	},
-	CODE_BLOCK: /^\s*`{3}(?<lang>[a-z]*)$/g,
+	CODE_BLOCK: /^\s*`{3}\s*(?<lang>[a-z]*)$/g,
 	LIST: {
 		CHECKBOX: /^\s*(?:[-~*]|\d+\.)\s\[(?<check>\s|x)]\s(?<value>.+)/g,
 		UNORDERED: /^\s*(?<mark>[-*+])\s(?<value>.+)/g,

diff --git a/lib/tokenizer/codeblock.js b/lib/tokenizer/codeblock.js
@@ -25,7 +25,7 @@ export class CodeBlock {
 		if (
 			!fromToken &&
 			indent >= 4 &&
-			(!lastLexer || lastLexer?.type === TOKENS.NEW_LINE)
+			(!lastLexer || lastLexer.type === TOKENS.NEW_LINE)
 		) {
 			return true
 		}
@@ -105,6 +105,9 @@ export class CodeBlock {
 
 		do {
 			nextLine = this.#lines[++this.#cursor]
+			if (nextLine === "") {
+				continue
+			}
 			nextLineIndent = (nextLine) ? Indent.get(nextLine) : null
 
 			if (nextLine !== undefined) {
@@ -217,7 +220,6 @@ export class CodeBlock {
 		} else {
 			skeleton = skeleton.replace("%s", Esc.everything(lexer.value))
 		}
-
 		return skeleton
 	}
 }
diff --git a/lib/tokenizer/list.js b/lib/tokenizer/list.js
@@ -195,8 +195,7 @@ export class List {
 		const listTag = (lexer.meta.ordered) ? "ol" : "ul"
 		let listBodyHtml = []
 		lexer.items.forEach(listItem => {
-			let listItemHtml = `
-<li>%s</li>`
+			let listItemHtml = "<li>%s</li>"
 			const lParser = new Parser(listItem.tokens, { from: TOKENS.LIST })
 			if (lexer.meta.checklist) {
 				const isChecked = (listItem.checked) ? " checked" : ""
@@ -215,8 +214,7 @@ export class List {
 			}
 			listBodyHtml.push(listItemHtml)
 		})
-		return `<${listTag}>${listBodyHtml.join("")}
-</${listTag}>`
+		return `<${listTag}>${listBodyHtml.join("")}</${listTag}>`
 	}
 
 	/**

diff --git a/lib/tokenizer/paragraph.js b/lib/tokenizer/paragraph.js
@@ -417,6 +417,7 @@ export class Paragraph {
 			} else if (token.type === TOKENS.ITALIC) {
 				parsed += `<em>${Paragraph.parse(token)}</em>`
 			} else if (token.type === TOKENS.CODE) {
+				token.value = Esc.unEscape(token.value)
 				parsed += `<code>${Esc.everything(token.value)}</code>`
 			}else if (token.type === TOKENS.STRIKE_THROUGH) {
 				parsed += `<s>${Paragraph.parse(token)}</s>`

diff --git a/tests/integration/__snapshots__/large.spec.js.snap b/tests/integration/__snapshots__/large.spec.js.snap
diff --git a/tests/integration/__snapshots__/small.spec.js.snap b/tests/integration/__snapshots__/small.spec.js.snap
@@ -2,31 +2,7 @@
 
 exports[`Small MD To HTML should parse the small markdown file content to html 1`] = `
 "<!-- https://regex101.com/r/C6SxZH/1--><h1>Heading One <code>CODE INSIDE</code></h1><h2>Heading Two</h2><h3>Heading Three</h3><h4>Heading Four</h4><h5>Heading Five</h5><h6>Heading Six</h6><p>a paragraph of words <code>first code</code> normal text here <code>code body</code> <em>first italics</em> here me crying <em>italic body</em> here me crying <strong>first bolds</strong> some normal again <strong>bold body</strong> <a href=\\"https://kiranparajuli.com.np\\">Kiran Parajuli</a> <s>strikes body</s> here some normal again at the last</p><blockquote>
-<p> some quote with <u>Underline</u></p></blockquote><ul>
-<li>list item 1</li>
-<li>list item 2</li>
-</ul><ul>
-<li>list item 3</li>
-<li>list item 4</li>
-</ul><ul>
-<li>list item 5</li>
-<li>list item 6</li>
-</ul><ul>
-<li><input type='checkbox'>checkbox empty</li>
-<li><input type='checkbox' checked>checkbox checked</li>
-</ul><ul>
-<li><input type='checkbox'>checkbox empty</li>
-<li><input type='checkbox' checked>checkbox checked</li>
-</ul><ul>
-<li><input type='checkbox'>checkbox empty</li>
-<li><input type='checkbox' checked>checkbox checked</li>
-</ul><ol>
-<li>numbered list item 1</li>
-<li>numbered list item 2</li>
-</ol><ol>
-<li><input type='checkbox'>numbered checkbox empty</li>
-<li><input type='checkbox' checked>numbered checkbox checked paragraph just below hmm, but with some fun :)</li>
-</ol><blockquote>
+<p> some quote with <u>Underline</u></p></blockquote><ul><li>list item 1</li><li>list item 2</li></ul><ul><li>list item 3</li><li>list item 4</li></ul><ul><li>list item 5</li><li>list item 6</li></ul><ul><li><input type='checkbox'>checkbox empty</li><li><input type='checkbox' checked>checkbox checked</li></ul><ul><li><input type='checkbox'>checkbox empty</li><li><input type='checkbox' checked>checkbox checked</li></ul><ul><li><input type='checkbox'>checkbox empty</li><li><input type='checkbox' checked>checkbox checked</li></ul><ol><li>numbered list item 1</li><li>numbered list item 2</li></ol><ol><li><input type='checkbox'>numbered checkbox empty</li><li><input type='checkbox' checked>numbered checkbox checked paragraph just below hmm, but with some fun :)</li></ol><blockquote>
 <p> quote 1</p><blockquote>
 <p> nested quote 2 <a href=\\"quote-link-url\\">quote-link</a></p><blockquote>
 <p> nested quote 3 with <em>italics</em> and <strong>bolds</strong></p></blockquote></blockquote></blockquote><pre><code class='language-js'>const a = 1;

diff --git a/tests/integration/large.spec.js b/tests/integration/large.spec.js
@@ -10,8 +10,7 @@ const mdp = new HtmlMark({
 describe("Large MD To HTML", () => {
 	it.each([
 		// eslint-disable-next-line no-undef
-		File.pathJoin(__dirname, "..", "fixtures/large_markdown.md"),
-		File.pathJoin(__dirname, "..", "fixtures/markdownit.md")
+		File.pathJoin(__dirname, "..", "fixtures/large_markdown.md")
 	])("should parse the large markdown file content to html", (path) => {
 		const html = mdp.parse(File.read(path))
 		expect(html).toMatchSnapshot()

diff --git a/tests/unit/lexer/__snapshots__/codeblock.spec.js.snap b/tests/unit/lexer/__snapshots__/codeblock.spec.js.snap
@@ -96,6 +96,95 @@ four",
 ]
 `;
 
+exports[`codeblock should be work just fine within a list item 1`] = `
+Array [
+  Object {
+    "indent": 0,
+    "items": Array [
+      Object {
+        "checked": null,
+        "count": "4",
+        "raw": "4.  What is the exact rule for determining when list items get
+	wrapped in \`<p>\` tags?  Can a list be partially \\"loose\\" and partially
+	\\"tight\\"?  What should we do with a list like this?
+
+		\`\`\` markdown
+		1. one
+
+		2. two
+		3. three
+		\`\`\`",
+        "tokens": Array [
+          Object {
+            "indent": 0,
+            "raw": "What is the exact rule for determining when list items get
+	wrapped in \`<p>\` tags?  Can a list be partially \\"loose\\" and partially
+	\\"tight\\"?  What should we do with a list like this?",
+            "tokens": Array [
+              Object {
+                "raw": "What is the exact rule for determining when list items get 	wrapped in ",
+                "type": "text",
+                "value": "What is the exact rule for determining when list items get 	wrapped in ",
+              },
+              Object {
+                "raw": "\`<p>\`",
+                "type": "code",
+                "value": "<p>",
+              },
+              Object {
+                "raw": " tags?  Can a list be partially \\"loose\\" and partially 	\\"tight\\"?  What should we do with a list like this?",
+                "type": "text",
+                "value": " tags?  Can a list be partially \\"loose\\" and partially 	\\"tight\\"?  What should we do with a list like this?",
+              },
+            ],
+            "type": "paragraph",
+            "value": "What is the exact rule for determining when list items get 	wrapped in \`<p>\` tags?  Can a list be partially \\"loose\\" and partially 	\\"tight\\"?  What should we do with a list like this?",
+          },
+          Object {
+            "type": "new-line",
+          },
+          Object {
+            "indent": 2,
+            "language": "markdown",
+            "raw": "		\`\`\` markdown
+		1. one
+
+		2. two
+		3. three
+		\`\`\`",
+            "type": "code-block",
+            "value": "1. one
+
+2. two
+3. three",
+          },
+        ],
+        "type": "list-item",
+      },
+    ],
+    "meta": Object {
+      "checklist": false,
+      "identifier": null,
+      "ordered": true,
+    },
+    "raw": "4.  What is the exact rule for determining when list items get
+	wrapped in \`<p>\` tags?  Can a list be partially \\"loose\\" and partially
+	\\"tight\\"?  What should we do with a list like this?
+
+		\`\`\` markdown
+		1. one
+
+		2. two
+		3. three
+		\`\`\`",
+    "type": "list",
+  },
+  Object {
+    "type": "new-line",
+  },
+]
+`;
+
 exports[`codeblock should cope with empty body 1`] = `
 Array [
   Object {

diff --git a/tests/unit/lexer/__snapshots__/paragraph.spec.js.snap b/tests/unit/lexer/__snapshots__/paragraph.spec.js.snap
@@ -82,6 +82,34 @@ Array [
 ]
 `;
 
+exports[`paragraph should allow escaped escape character 1`] = `
+Array [
+  Object {
+    "indent": 0,
+    "raw": "this is \`go\\\\ogle\` link",
+    "tokens": Array [
+      Object {
+        "raw": "this is ",
+        "type": "text",
+        "value": "this is ",
+      },
+      Object {
+        "raw": "\`go\\\\ogle\`",
+        "type": "code",
+        "value": "go\\\\ogle",
+      },
+      Object {
+        "raw": " link",
+        "type": "text",
+        "value": " link",
+      },
+    ],
+    "type": "paragraph",
+    "value": "this is \`go\\\\ogle\` link",
+  },
+]
+`;
+
 exports[`paragraph should be deep tokenized: a ****bold**** text 1`] = `
 Array [
   Object {

diff --git a/tests/unit/lexer/codeblock.spec.js b/tests/unit/lexer/codeblock.spec.js
@@ -161,4 +161,22 @@ describe("codeblock", () => {
 		const tokens = lexer.run()
 		expect(tokens).toMatchSnapshot()
 	})
+	it("should be work just fine within a list item", () => {
+		const testString = `
+4.  What is the exact rule for determining when list items get
+\twrapped in \`<p>\` tags?  Can a list be partially "loose" and partially
+\t"tight"?  What should we do with a list like this?
+
+\t\t\`\`\` markdown
+\t\t1. one
+
+\t\t2. two
+\t\t3. three
+\t\t\`\`\`
+`
+		const lines = testString.split("\n")
+		const lexer = new Lexer(lines)
+		const tokens = lexer.run()
+		expect(tokens).toMatchSnapshot()
+	})
 })
diff --git a/tests/unit/lexer/paragraph.spec.js b/tests/unit/lexer/paragraph.spec.js
@@ -96,4 +96,12 @@ describe("paragraph", () => {
 		const lexerData = lexer.run()
 		expect(lexerData).toMatchSnapshot()
 	})
+	it("should allow escaped escape character", () => {
+		const lines = [
+			"this is `go\\ogle` link"
+		]
+		const lexer = new Lexer(lines)
+		const lexerData = lexer.run()
+		expect(lexerData).toMatchSnapshot()
+	})
 })
diff --git a/tests/unit/parser/__snapshots__/codeblock.spec.js.snap b/tests/unit/parser/__snapshots__/codeblock.spec.js.snap
@@ -3,9 +3,11 @@
 exports[`CodeBlock Parsing should cope with multiple items 1`] = `"<pre><code class='language-js'>const a = 1</code></pre><p>some people are funny</p>"`;
 
 exports[`CodeBlock Parsing should parse indent codeblock 1`] = `
-"<p>This is a normal paragraph:</p><pre><code>This is a code block.</code></pre><p>Here is an example of AppleScript:</p><pre><code>tell application &quot;Foo&quot;
+"<p>This is a normal paragraph:</p><pre><code>This is a code block.
+</code></pre><p>Here is an example of AppleScript:</p><pre><code>tell application &quot;Foo&quot;
 beep
-end tell</code></pre><p>A code block continues until it reaches a line that is not indented (or the end of the article).</p>"
+end tell
+</code></pre><p>A code block continues until it reaches a line that is not indented (or the end of the article).</p>"
 `;
 
 exports[`CodeBlock Parsing should parse multiple consequetive codeblocks 1`] = `"<pre><code class='language-js'>const a = 1</code></pre><pre><code class='language-js'>const b = 2</code></pre>"`;