From 55349a926ad1f566abefd2bd763d0d10112d54da Mon Sep 17 00:00:00 2001 From: qwinsi <70425035+qwinsi@users.noreply.github.com> Date: Thu, 22 Aug 2024 00:50:23 +0800 Subject: [PATCH] simplify parser.ts --- package.json | 2 +- src/parser.ts | 212 +++++++++------------------------------------- src/types.ts | 18 +--- src/writer.ts | 103 ++++++++++++---------- test/main.test.ts | 4 +- 5 files changed, 104 insertions(+), 235 deletions(-) diff --git a/package.json b/package.json index 92e5140..753fc35 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "tex2typst", - "version": "0.2.0", + "version": "0.2.1", "description": "JavaScript library for converting TeX code to Typst", "type": "module", "main": "dist/index.js", diff --git a/src/parser.ts b/src/parser.ts index 9985374..dfcfbd0 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -1,4 +1,4 @@ -import { TexNode, LatexParseNode, TexSupsubData } from "./types"; +import { TexNode, TexSupsubData } from "./types"; const UNARY_COMMANDS = [ 'sqrt', @@ -252,7 +252,7 @@ function eat_primes(latex: string, start: number): number { } -class LatexParserError extends Error { +export class LatexParserError extends Error { constructor(message: string) { super(message); this.name = 'LatexParserError'; @@ -260,7 +260,7 @@ class LatexParserError extends Error { } -type ParseResult = [LatexParseNode, number]; +type ParseResult = [TexNode, number]; export class LatexParser { space_sensitive: boolean; @@ -271,8 +271,8 @@ export class LatexParser { this.newline_sensitive = newline_sensitive; } - parse(latex: string): LatexParseNode { - const results: LatexParseNode[] = []; + parse(latex: string): TexNode { + const results: TexNode[] = []; let pos = 0; while (pos < latex.length) { @@ -295,14 +295,14 @@ export class LatexParser { } else if (results.length === 1) { return results[0]; } else { - return { type: 'ordgroup', args: results }; + return { type: 'ordgroup', content: '', args: results }; } } parseNextExpr(latex: string, start: number): ParseResult { let [base, pos] = this.parseNextExprWithoutSupSub(latex, start); - let sub: LatexParseNode | null = null; - let sup: LatexParseNode | null = null; + let sub: TexNode | null = null; + let sup: TexNode | null = null; let num_prime = 0; num_prime += eat_primes(latex, pos); @@ -331,14 +331,14 @@ export class LatexParser { } if (sub !== null || sup !== null || num_prime > 0) { - const res = { type: 'supsub', base } as LatexParseNode; + const res: TexSupsubData = { base }; if (sub) { res.sub = sub; } if (num_prime > 0) { - res.sup = { type: 'ordgroup', args: [] }; + res.sup = { type: 'ordgroup', content: '', args: [] }; for (let i = 0; i < num_prime; i++) { - res.sup.args!.push({ type: 'command', content: 'prime' }); + res.sup.args!.push({ type: 'symbol', content: '\\prime' }); } if (sup) { res.sup.args!.push(sup); @@ -349,7 +349,7 @@ export class LatexParser { } else if (sup) { res.sup = sup; } - return [res, pos]; + return [{type: 'supsub', content: '', data: res }, pos]; } else { return [base, pos]; } @@ -369,7 +369,7 @@ export class LatexParser { if (firstTwoChars === '\\\\') { return [{ type: 'control', content: '\\\\' }, start + 2]; } else if (firstTwoChars === '\\{' || firstTwoChars === '\\}') { - return [{ type: 'token-parenthesis', content: firstTwoChars }, start + 2]; + return [{ type: 'token', content: firstTwoChars }, start + 2]; } else if (['\\%', '\\$', '\\&', '\\#', '\\_'].includes(firstTwoChars)) { return [{ type: 'token', content: firstTwoChars }, start + 2]; } else if (latex.slice(start).startsWith('\\begin{')) { @@ -390,29 +390,31 @@ export class LatexParser { while (pos < latex.length && isdigit(latex[pos])) { pos += 1; } - return [{ type: 'token-number', content: latex.slice(start, pos) }, pos]; + return [{ type: 'token', content: latex.slice(start, pos) }, pos]; } else if (isalpha(firstChar)) { - return [{ type: 'token-letter-var', content: firstChar }, start + 1]; + return [{ type: 'token', content: firstChar }, start + 1]; } else if ('+-*/=<>!'.includes(firstChar)) { - return [{ type: 'token-operator', content: firstChar }, start + 1]; + return [{ type: 'token', content: firstChar }, start + 1]; } else if ('.,;?'.includes(firstChar)) { return [{ type: 'atom', content: firstChar }, start + 1]; } else if ('()[]'.includes(firstChar)) { - return [{ type: 'token-parenthesis', content: firstChar }, start + 1]; + return [{ type: 'token', content: firstChar }, start + 1]; } else if (firstChar === '_') { let [sub, pos] = this.parseNextExpr(latex, start + 1); - let sup: LatexParseNode | undefined = undefined; + let sup: TexNode | undefined = undefined; if (pos < latex.length && latex[pos] === '^') { [sup, pos] = this.parseNextExpr(latex, pos + 1); } - return [{ type: 'supsub', base: EMPTY_NODE, sub, sup }, pos]; + const data = { base: EMPTY_NODE, sub, sup }; + return [{ type: 'supsub', content: '', data: data }, pos]; } else if (firstChar === '^') { let [sup, pos] = this.parseNextExpr(latex, start + 1); - let sub: LatexParseNode | undefined = undefined; + let sub: TexNode | undefined = undefined; if (pos < latex.length && latex[pos] === '_') { [sub, pos] = this.parseNextExpr(latex, pos + 1); } - return [{ type: 'supsub', base: EMPTY_NODE, sub, sup }, pos]; + const data = { base: EMPTY_NODE, sub, sup }; + return [{ type: 'supsub', content: '', data: data }, pos]; } else if (firstChar === ' ') { let pos = start; while (pos < latex.length && latex[pos] === ' ') { @@ -441,7 +443,7 @@ export class LatexParser { pos += command.length; const paramNum = get_command_param_num(command); if (paramNum === 0) { - return [{ type: 'command', content: command }, pos]; + return [{ type: 'symbol', content: '\\' + command }, pos]; } else if (paramNum === 1) { if (command === 'sqrt' && pos < latex.length && latex[pos] === '[') { const posLeftSquareBracket = pos; @@ -449,7 +451,7 @@ export class LatexParser { const exprInside = latex.slice(posLeftSquareBracket + 1, posRightSquareBracket); const exponent = this.parse(exprInside); const [arg1, newPos] = this.parseNextExprWithoutSupSub(latex, posRightSquareBracket + 1); - return [{ type: 'command', content: command, arg1, exponent }, newPos]; + return [{ type: 'unaryFunc', content: '\\' + command, args: [arg1], data: exponent }, newPos]; } else if (command === 'text') { assert(latex[pos] === '{'); const posClosingBracket = find_closing_curly_bracket(latex, pos); @@ -457,12 +459,12 @@ export class LatexParser { return [{ type: 'text', content: text }, posClosingBracket + 1]; } else { let [arg1, newPos] = this.parseNextExprWithoutSupSub(latex, pos); - return [{ type: 'command', content: command, arg1 }, newPos]; + return [{ type: 'unaryFunc', content: '\\' + command, args: [arg1] }, newPos]; } } else if (paramNum === 2) { const [arg1, pos1] = this.parseNextExprWithoutSupSub(latex, pos); const [arg2, pos2] = this.parseNextExprWithoutSupSub(latex, pos1); - return [{ type: 'command', content: command, arg1, arg2 }, pos2]; + return [{ type: 'binaryFunc', content: '\\' + command, args: [arg1, arg2] }, pos2]; } else { throw new Error( 'Invalid number of parameters'); } @@ -498,7 +500,12 @@ export class LatexParser { pos += rightDelimiter.length; const exprInside = latex.slice(exprInsideStart, exprInsideEnd); const body = this.parse(exprInside); - const res = { type: 'leftright', left: leftDelimiter, right: rightDelimiter, body }; + const args = [ + { type: 'token', content: leftDelimiter }, + body, + { type: 'token', content: rightDelimiter } + ] + const res = { type: 'leftright', content: '', args: args }; return [res, pos]; } @@ -530,16 +537,16 @@ export class LatexParser { let exprInside = latex.slice(exprInsideStart, exprInsideEnd); exprInside = exprInside.trimEnd(); // ignore whitespaces and '\n' before \end{envName} const body = this.parseAligned(exprInside); - const res = { type: 'beginend', content: envName, body }; + const res = { type: 'beginend', content: envName, data: body }; return [res, closingIdx + 1]; } - parseAligned(latex: string): LatexParseNode[][] { + parseAligned(latex: string): TexNode[][] { let pos = 0; - const allRows: LatexParseNode[][] = []; - let row: LatexParseNode[] = []; + const allRows: TexNode[][] = []; + let row: TexNode[] = []; allRows.push(row); - let group: LatexParseNode = { type: 'ordgroup', args: [] }; + let group: TexNode = { type: 'ordgroup', content: '', args: [] }; row.push(group); while (pos < latex.length) { @@ -551,11 +558,11 @@ export class LatexParser { continue; } else if (res.type === 'control' && res.content === '\\\\') { row = []; - group = { type: 'ordgroup', args: [] }; + group = { type: 'ordgroup', content: '', args: [] }; row.push(group); allRows.push(row); } else if (res.type === 'control' && res.content === '&') { - group = { type: 'ordgroup', args: [] }; + group = { type: 'ordgroup', content: '', args: [] }; row.push(group); } else { group.args!.push(res); @@ -567,147 +574,10 @@ export class LatexParser { } -export class LatexNodeToTexNodeError extends Error { - node: LatexParseNode; - - constructor(message: string, node: LatexParseNode) { - super(message); - this.name = "LatexNodeToTexNodeError"; - this.node = node; - } -} - -function latexNodeToTexNode(node: LatexParseNode): TexNode { - let res = {} as TexNode; - switch (node.type) { - case 'ordgroup': - res.type = 'ordgroup'; - res.args = (node.args as LatexParseNode[]).map((n: LatexParseNode) => latexNodeToTexNode(n)); - if (res.args!.length === 1) { - res = res.args![0] as TexNode; - } - break; - case 'empty': - res.type = 'empty'; - res.content = ''; - break; - case 'atom': - res.type = 'atom'; - res.content = node.content!; - break; - case 'token': - case 'token-letter-var': - case 'token-number': - case 'token-operator': - case 'token-parenthesis': - res.type = 'symbol'; - res.content = node.content!; - break; - case 'supsub': - res.type = 'supsub'; - res.irregularData = {} as TexSupsubData; - if (node['base']) { - res.irregularData.base = latexNodeToTexNode(node['base']); - } - if (node['sup']) { - res.irregularData.sup = latexNodeToTexNode(node['sup']); - } - if (node['sub']) { - res.irregularData.sub = latexNodeToTexNode(node['sub']); - } - break; - case 'leftright': - res.type = 'leftright'; - - const body = latexNodeToTexNode(node.body as LatexParseNode); - - let left: string = node['left']!; - if (left === "\\{") { - left = "{"; - } - let right: string = node['right']!; - if (right === "\\}") { - right = "}"; - } - const is_atom = (str:string) => (['(', ')', '[', ']', '{', '}'].includes(str)); - res.args = [ - { type: is_atom(left)? 'atom': 'symbol', content: left }, - body, - { type: is_atom(right)? 'atom': 'symbol', content: right} - ]; - break; - case 'beginend': - if (node.content?.startsWith('align')) { - // align, align*, alignat, alignat*, aligned, etc. - res.type = 'align'; - } else { - res.type = 'matrix'; - } - res.content = node.content!; - res.irregularData = (node.body as LatexParseNode[][]).map((row: LatexParseNode[]) => { - return row.map((n: LatexParseNode) => latexNodeToTexNode(n)); - }); - break; - case 'command': - const num_args = get_command_param_num(node.content!); - res.content = '\\' + node.content!; - if (num_args === 0) { - res.type = 'symbol'; - } else if (num_args === 1) { - res.type = 'unaryFunc'; - res.args = [ - latexNodeToTexNode(node.arg1 as LatexParseNode) - ] - if (node.content === 'sqrt') { - if (node.exponent) { - res.irregularData = latexNodeToTexNode(node.exponent) as TexNode; - } - } - } else if (num_args === 2) { - res.type = 'binaryFunc'; - res.args = [ - latexNodeToTexNode(node.arg1 as LatexParseNode), - latexNodeToTexNode(node.arg2 as LatexParseNode) - ] - } else { - throw new LatexNodeToTexNodeError('Invalid number of arguments', node); - } - break; - case 'text': - res.type = 'text'; - res.content = node.content!; - break; - case 'comment': - res.type = 'comment'; - res.content = node.content!; - break; - case 'whitespace': - res.type = 'empty'; - break; - case 'newline': - res.type = 'newline'; - res.content = '\n'; - break; - case 'control': - if (node.content === '\\\\') { - res.type = 'symbol'; - res.content = node.content!; - break; - } else { - throw new LatexNodeToTexNodeError(`Unknown control sequence: ${node.content}`, node); - } - break; - default: - throw new LatexNodeToTexNodeError(`Unknown node type: ${node.type}`, node); - } - return res as TexNode; -} - export function parseTex(tex: string, customTexMacros: {[key: string]: string}): TexNode { const parser = new LatexParser(); for (const [macro, replacement] of Object.entries(customTexMacros)) { tex = tex.replaceAll(macro, replacement); } - const node = parser.parse(tex); - return latexNodeToTexNode(node); + return parser.parse(tex) as TexNode; } diff --git a/src/types.ts b/src/types.ts index b20b21b..9c16883 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,17 +1,3 @@ -export interface LatexParseNode { - type: string; - content?: string; - arg1?: LatexParseNode; - arg2?: LatexParseNode; - args?: LatexParseNode[]; - base?: LatexParseNode; - sub?: LatexParseNode; - sup?: LatexParseNode; - exponent?: LatexParseNode; - body?: LatexParseNode | LatexParseNode[] | LatexParseNode[][]; -} - - export interface TexSupsubData { base: TexNode; sup?: TexNode; @@ -30,11 +16,11 @@ export interface TexNode { // For type="sqrt", it's additional argument wrapped square bracket. e.g. 3 in \sqrt[3]{x} // For type="supsub", it's base, sup, and sub. // For type="array", it's the 2-dimensional matrix. - irregularData?: TexSqrtData | TexSupsubData | TexArrayData; + data?: TexSqrtData | TexSupsubData | TexArrayData; } export interface TypstNode { - type: 'atom' | 'symbol' | 'text' | 'softSpace' | 'comment' | 'newline', + type: 'atom' | 'token' | 'symbol' | 'text' | 'softSpace' | 'comment' | 'newline', content: string; args?: TypstNode[]; } diff --git a/src/writer.ts b/src/writer.ts index d87cc2d..3bbc1d9 100644 --- a/src/writer.ts +++ b/src/writer.ts @@ -74,7 +74,7 @@ export class TypstWriter { } public append(node: TexNode) { - if (node.type === 'empty') { + if (node.type === 'empty' || node.type === 'whitespace') { return; } else if (node.type === 'ordgroup') { // const index = this.startBlock(); @@ -86,12 +86,14 @@ export class TypstWriter { content = 'comma'; } this.queue.push({ type: 'atom', content: content }); + } else if (node.type === 'token') { + this.queue.push({ type: 'token', content: node.content }); } else if (node.type === 'symbol') { this.queue.push({ type: 'symbol', content: node.content }); } else if (node.type === 'text') { this.queue.push(node as TypstNode) } else if (node.type === 'supsub') { - let { base, sup, sub } = node.irregularData as TexSupsubData; + let { base, sup, sub } = node.data as TexSupsubData; // Special logic for overbrace if (base && base.type === 'unaryFunc' && base.content === '\\overbrace' && sup) { @@ -134,7 +136,7 @@ export class TypstWriter { } else if (node.type === 'leftright') { const [left, body, right] = node.args!; // These pairs will be handled by Typst compiler by default. No need to add lr() - if (["[]", "()", "{}", "\\lfloor\\rfloor", "\\lceil\\rceil"].includes(left.content + right.content)) { + if (["[]", "()", "\\{\\}", "\\lfloor\\rfloor", "\\lceil\\rceil"].includes(left.content + right.content)) { this.append(left); this.append(body); this.append(right); @@ -163,12 +165,12 @@ export class TypstWriter { } else if (node.type === 'unaryFunc') { const func_symbol: TypstNode = { type: 'symbol', content: node.content }; const arg0 = node.args![0]; - if (node.content === '\\sqrt' && node.irregularData) { + if (node.content === '\\sqrt' && node.data) { func_symbol.content = 'root'; this.queue.push(func_symbol); this.insideFunctionDepth ++; this.queue.push({ type: 'atom', content: '('}); - this.append(node.irregularData as TexSqrtData); // the number of times to take the root + this.append(node.data as TexSqrtData); // the number of times to take the root this.queue.push({ type: 'atom', content: ','}); this.append(arg0); this.queue.push({ type: 'atom', content: ')'}); @@ -189,7 +191,7 @@ export class TypstWriter { return; } else if (node.content === '\\mathbb') { const body = node.args![0]; - if (body.type === 'symbol' && /^[A-Z]$/.test(body.content)) { + if (body.type === 'token' && /^[A-Z]$/.test(body.content)) { // \mathbb{R} -> RR this.queue.push({ type: 'symbol', content: body.content + body.content}); return; @@ -228,54 +230,64 @@ export class TypstWriter { } else if (node.type === 'newline') { this.queue.push({ type: 'newline', content: '\n'}); return; - } else if (node.type === 'align') { - const matrix = node.irregularData as TexNode[][]; - matrix.forEach((row, i) => { - row.forEach((cell, j) => { - if (j > 0) { - this.queue.push({ type: 'atom', content: '&' }); + } else if (node.type === 'beginend') { + if (node.content!.startsWith('align')) { + // align, align*, alignat, alignat*, aligned, etc. + const matrix = node.data as TexNode[][]; + matrix.forEach((row, i) => { + row.forEach((cell, j) => { + if (j > 0) { + this.queue.push({ type: 'atom', content: '&' }); + } + this.append(cell); + }); + if (i < matrix.length - 1) { + this.queue.push({ type: 'symbol', content: '\\\\' }); } - this.append(cell); }); - if (i < matrix.length - 1) { - this.queue.push({ type: 'symbol', content: '\\\\' }); - } - }); - } else if (node.type === 'matrix') { - const matrix = node.irregularData as TexNode[][]; - this.queue.push({ type: 'symbol', content: 'mat' }); - this.insideFunctionDepth ++; - this.queue.push({ type: 'atom', content: '('}); - this.queue.push({type: 'symbol', content: 'delim: #none, '}); - matrix.forEach((row, i) => { - row.forEach((cell, j) => { - // There is a leading & in row - if (cell.type === 'ordgroup' && cell.args!.length === 0) { - this.queue.push({ type: 'atom', content: ',' }); - return; - } - // if (j == 0 && cell.type === 'newline' && cell.content === '\n') { - // return; - // } - this.append(cell); - // cell.args!.forEach((n) => this.append(n)); - if (j < row.length - 1) { - this.queue.push({ type: 'atom', content: ',' }); - } else { - if (i < matrix.length - 1) { - this.queue.push({ type: 'atom', content: ';' }); + } else { + const matrix = node.data as TexNode[][]; + this.queue.push({ type: 'symbol', content: 'mat' }); + this.insideFunctionDepth ++; + this.queue.push({ type: 'atom', content: '('}); + this.queue.push({type: 'symbol', content: 'delim: #none, '}); + matrix.forEach((row, i) => { + row.forEach((cell, j) => { + // There is a leading & in row + if (cell.type === 'ordgroup' && cell.args!.length === 0) { + this.queue.push({ type: 'atom', content: ',' }); + return; } - } + // if (j == 0 && cell.type === 'newline' && cell.content === '\n') { + // return; + // } + this.append(cell); + // cell.args!.forEach((n) => this.append(n)); + if (j < row.length - 1) { + this.queue.push({ type: 'atom', content: ',' }); + } else { + if (i < matrix.length - 1) { + this.queue.push({ type: 'atom', content: ';' }); + } + } + }); }); - }); - this.queue.push({ type: 'atom', content: ')'}); - this.insideFunctionDepth --; + this.queue.push({ type: 'atom', content: ')'}); + this.insideFunctionDepth --; + } + } else if (node.type === 'matrix') { } else if (node.type === 'unknownMacro') { if (this.nonStrict) { this.queue.push({ type: 'symbol', content: node.content }); } else { throw new TypstWriterError(`Unknown macro: ${node.content}`, node); } + } else if (node.type === 'control') { + if (node.content === '\\\\') { + this.queue.push({ type: 'symbol', content: node.content }); + } else { + throw new TypstWriterError(`Unknown control sequence: ${node.content}`, node); + } } else if (node.type === 'comment') { this.queue.push({ type: 'comment', content: node.content }); } else { @@ -291,6 +303,7 @@ export class TypstWriter { str = node.content; break; case 'symbol': + case 'token': str = convertToken(node.content); break; case 'text': @@ -320,7 +333,7 @@ export class TypstWriter { const is_single_atom = (node.type === 'atom'); const is_single_function = (node.type === 'unaryFunc' || node.type === 'binaryFunc' || node.type === 'leftright'); - const is_single = ['atom', 'symbol', 'unaryFunc', 'binaryFunc', 'leftright'].includes(node.type); + const is_single = ['atom', 'symbol', 'token', 'unaryFunc', 'binaryFunc', 'leftright'].includes(node.type); if (is_single) { this.append(node); } else { diff --git a/test/main.test.ts b/test/main.test.ts index 5eff7e7..8149832 100644 --- a/test/main.test.ts +++ b/test/main.test.ts @@ -2,7 +2,7 @@ import { describe, it, test, expect } from 'vitest'; import yaml from 'js-yaml'; import path from 'node:path'; import fs from 'node:fs'; -import { parseTex, LatexNodeToTexNodeError } from '../src/parser'; +import { parseTex, LatexParserError } from '../src/parser'; import { tex2typst } from '../src/index'; import { TypstWriterError } from '../src/writer'; import { Tex2TypstOptions, TexNode } from '../src/types'; @@ -52,7 +52,7 @@ caseFiles.forEach(({ title, cases }) => { expect(result).toBe(typst); } catch (e) { console.log(`====== 😭 Error ======`); - if (e instanceof LatexNodeToTexNodeError || e instanceof TypstWriterError) { + if (e instanceof LatexParserError || e instanceof TypstWriterError) { console.log(e.node); } if (tex_node !== null) {