Skip to content

Commit

Permalink
make type of Token::type enum
Browse files Browse the repository at this point in the history
  • Loading branch information
qwinsi committed Sep 7, 2024
1 parent f80bdfa commit 3da187f
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 55 deletions.
105 changes: 52 additions & 53 deletions src/parser.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { TexNode, TexSupsubData } from "./types";
import { TexNode, TexSupsubData, Token, TokenType } from "./types";


const UNARY_COMMANDS = [
Expand Down Expand Up @@ -60,8 +60,8 @@ function get_command_param_num(command: string): number {
}
}

const LEFT_CURLY_BRACKET: Token = {type: 'control', value: '{'};
const RIGHT_CURLY_BRACKET: Token = {type: 'control', value: '}'};
const LEFT_CURLY_BRACKET: Token = {type: TokenType.CONTROL, value: '{'};
const RIGHT_CURLY_BRACKET: Token = {type: TokenType.CONTROL, value: '}'};

function find_closing_curly_bracket(tokens: Token[], start: number): number {
assert(token_eq(tokens[start], LEFT_CURLY_BRACKET));
Expand All @@ -83,8 +83,8 @@ function find_closing_curly_bracket(tokens: Token[], start: number): number {
return pos - 1;
}

const LEFT_SQUARE_BRACKET: Token = {type: 'element', value: '['};
const RIGHT_SQUARE_BRACKET: Token = {type: 'element', value: ']'};
const LEFT_SQUARE_BRACKET: Token = {type: TokenType.ELEMENT, value: '['};
const RIGHT_SQUARE_BRACKET: Token = {type: TokenType.ELEMENT, value: ']'};

function find_closing_square_bracket(tokens: Token[], start: number): number {
assert(token_eq(tokens[start], LEFT_SQUARE_BRACKET));
Expand Down Expand Up @@ -117,7 +117,7 @@ function isdigit(char: string): boolean {

function eat_whitespaces(tokens: Token[], start: number): Token[] {
let pos = start;
while (pos < tokens.length && ['whitespace', 'newline'].includes(tokens[pos].type)) {
while (pos < tokens.length && [TokenType.WHITESPACE, TokenType.NEWLINE].includes(tokens[pos].type)) {
pos++;
}
return tokens.slice(start, pos);
Expand All @@ -126,9 +126,9 @@ function eat_whitespaces(tokens: Token[], start: number): Token[] {

function eat_parenthesis(tokens: Token[], start: number): Token | null {
const firstToken = tokens[start];
if (firstToken.type === 'element' && ['(', ')', '[', ']', '|', '\\{', '\\}'].includes(firstToken.value)) {
if (firstToken.type === TokenType.ELEMENT && ['(', ')', '[', ']', '|', '\\{', '\\}'].includes(firstToken.value)) {
return firstToken;
} else if (firstToken.type === 'command' && ['lfloor', 'rfloor', 'lceil', 'rceil', 'langle', 'rangle'].includes(firstToken.value.slice(1))) {
} else if (firstToken.type === TokenType.COMMAND && ['lfloor', 'rfloor', 'lceil', 'rceil', 'langle', 'rangle'].includes(firstToken.value.slice(1))) {
return firstToken;
} else {
return null;
Expand All @@ -137,7 +137,7 @@ function eat_parenthesis(tokens: Token[], start: number): Token | null {

function eat_primes(tokens: Token[], start: number): number {
let pos = start;
while (pos < tokens.length && token_eq(tokens[pos], { type: 'element', value: "'" })) {
while (pos < tokens.length && token_eq(tokens[pos], { type: TokenType.ELEMENT, value: "'" })) {
pos += 1;
}
return pos - start;
Expand All @@ -155,8 +155,8 @@ function eat_command_name(latex: string, start: number): string {



const LEFT_COMMAND: Token = { type: 'command', value: '\\left' };
const RIGHT_COMMAND: Token = { type: 'command', value: '\\right' };
const LEFT_COMMAND: Token = { type: TokenType.COMMAND, value: '\\left' };
const RIGHT_COMMAND: Token = { type: TokenType.COMMAND, value: '\\right' };

function find_closing_right_command(tokens: Token[], start: number): number {
let count = 1;
Expand All @@ -178,8 +178,8 @@ function find_closing_right_command(tokens: Token[], start: number): number {
}


const BEGIN_COMMAND: Token = { type: 'command', value: '\\begin' };
const END_COMMAND: Token = { type: 'command', value: '\\end' };
const BEGIN_COMMAND: Token = { type: TokenType.COMMAND, value: '\\begin' };
const END_COMMAND: Token = { type: TokenType.COMMAND, value: '\\end' };


function find_closing_end_command(tokens: Token[], start: number): number {
Expand Down Expand Up @@ -226,11 +226,6 @@ function find_closing_curly_bracket_char(latex: string, start: number): number {
}


export interface Token {
type: 'element' | 'command' | 'text' | 'comment' | 'whitespace' | 'newline' | 'control' | 'unknown';
value: string;
}

export function tokenize(latex: string): Token[] {
const tokens: Token[] = [];
let pos = 0;
Expand All @@ -244,7 +239,7 @@ export function tokenize(latex: string): Token[] {
while (newPos < latex.length && latex[newPos] !== '\n') {
newPos += 1;
}
token = { type: 'comment', value: latex.slice(pos + 1, newPos) };
token = { type: TokenType.COMMENT, value: latex.slice(pos + 1, newPos) };
pos = newPos;
break;
}
Expand All @@ -253,19 +248,19 @@ export function tokenize(latex: string): Token[] {
case '_':
case '^':
case '&':
token = { type: 'control', value: firstChar};
token = { type: TokenType.CONTROL, value: firstChar};
pos++;
break;
case '\n':
token = { type: 'newline', value: firstChar};
token = { type: TokenType.NEWLINE, value: firstChar};
pos++;
break;
case '\r': {
if (pos + 1 < latex.length && latex[pos + 1] === '\n') {
token = { type: 'newline', value: '\n' };
token = { type: TokenType.NEWLINE, value: '\n' };
pos += 2;
} else {
token = { type: 'newline', value: '\n' };
token = { type: TokenType.NEWLINE, value: '\n' };
pos ++;
}
break;
Expand All @@ -275,7 +270,7 @@ export function tokenize(latex: string): Token[] {
while (newPos < latex.length && latex[newPos] === ' ') {
newPos += 1;
}
token = {type: 'whitespace', value: latex.slice(pos, newPos)};
token = {type: TokenType.WHITESPACE, value: latex.slice(pos, newPos)};
pos = newPos;
break;
}
Expand All @@ -285,12 +280,12 @@ export function tokenize(latex: string): Token[] {
}
const firstTwoChars = latex.slice(pos, pos + 2);
if (['\\\\', '\\,'].includes(firstTwoChars)) {
token = { type: 'control', value: firstTwoChars };
token = { type: TokenType.CONTROL, value: firstTwoChars };
} else if (['\\{','\\}', '\\%', '\\$', '\\&', '\\#', '\\_'].includes(firstTwoChars)) {
token = { type: 'element', value: firstTwoChars };
token = { type: TokenType.ELEMENT, value: firstTwoChars };
} else {
const command = eat_command_name(latex, pos + 1);
token = { type: 'command', value: '\\' + command};
token = { type: TokenType.COMMAND, value: '\\' + command};
}
pos += token.value.length;
break;
Expand All @@ -301,25 +296,25 @@ export function tokenize(latex: string): Token[] {
while (newPos < latex.length && isdigit(latex[newPos])) {
newPos += 1;
}
token = { type: 'element', value: latex.slice(pos, newPos) }
token = { type: TokenType.ELEMENT, value: latex.slice(pos, newPos) }
} else if (isalpha(firstChar)) {
token = { type: 'element', value: firstChar };
token = { type: TokenType.ELEMENT, value: firstChar };
} else if ('+-*/=\'<>!.,;?()[]|'.includes(firstChar)) {
token = { type: 'element', value: firstChar }
token = { type: TokenType.ELEMENT, value: firstChar }
} else {
token = { type: 'unknown', value: firstChar };
token = { type: TokenType.UNKNOWN, value: firstChar };
}
pos += token.value.length;
}
}

tokens.push(token);

if (token.type === 'command' && ['\\text', '\\begin', '\\end'].includes(token.value)) {
if (token.type === TokenType.COMMAND && ['\\text', '\\begin', '\\end'].includes(token.value)) {
if (pos >= latex.length || latex[pos] !== '{') {
throw new LatexParserError(`No content for ${token.value} command`);
}
tokens.push({ type: 'control', value: '{' });
tokens.push({ type: TokenType.CONTROL, value: '{' });
const posClosingBracket = find_closing_curly_bracket_char(latex, pos);
pos++;
let textInside = latex.slice(pos, posClosingBracket);
Expand All @@ -328,8 +323,8 @@ export function tokenize(latex: string): Token[] {
for (const char of chars) {
textInside = textInside.replaceAll('\\' + char, char);
}
tokens.push({ type: 'text', value: textInside });
tokens.push({ type: 'control', value: '}' });
tokens.push({ type: TokenType.TEXT, value: textInside });
tokens.push({ type: TokenType.CONTROL, value: '}' });
pos = posClosingBracket + 1;
}
}
Expand All @@ -351,8 +346,8 @@ export class LatexParserError extends Error {

type ParseResult = [TexNode, number];

const SUB_SYMBOL:Token = { type: 'control', value: '_' };
const SUP_SYMBOL:Token = { type: 'control', value: '^' };
const SUB_SYMBOL:Token = { type: TokenType.CONTROL, value: '_' };
const SUP_SYMBOL:Token = { type: TokenType.CONTROL, value: '^' };

export class LatexParser {
space_sensitive: boolean;
Expand Down Expand Up @@ -464,21 +459,25 @@ export class LatexParser {
const firstToken = tokens[start];
const tokenType = firstToken.type;
switch (tokenType) {
case 'element':
case 'text':
case 'comment':
case 'whitespace':
case 'newline':
return [{ type: tokenType, content: firstToken.value }, start + 1];
case 'command':
case TokenType.ELEMENT:
return [{ type: 'element', content: firstToken.value }, start + 1];
case TokenType.TEXT:
return [{ type: 'text', content: firstToken.value }, start + 1];
case TokenType.COMMENT:
return [{ type: 'comment', content: firstToken.value }, start + 1];
case TokenType.WHITESPACE:
return [{ type: 'whitespace', content: firstToken.value }, start + 1];
case TokenType.NEWLINE:
return [{ type: 'newline', content: firstToken.value }, start + 1];
case TokenType.COMMAND:
if (token_eq(firstToken, BEGIN_COMMAND)) {
return this.parseBeginEndExpr(tokens, start);
} else if (token_eq(firstToken, LEFT_COMMAND)) {
return this.parseLeftRightExpr(tokens, start);
} else {
return this.parseCommandExpr(tokens, start);
}
case 'control':
case TokenType.CONTROL:
const controlChar = firstToken.value;
switch (controlChar) {
case '{':
Expand Down Expand Up @@ -508,7 +507,7 @@ export class LatexParser {
}

parseCommandExpr(tokens: Token[], start: number): ParseResult {
assert(tokens[start].type === 'command');
assert(tokens[start].type === TokenType.COMMAND);

const command = tokens[start].value; // command name starts with a \

Expand All @@ -534,7 +533,7 @@ export class LatexParser {
throw new LatexParserError('Expecting content for \\text command');
}
assert(token_eq(tokens[pos], LEFT_CURLY_BRACKET));
assert(tokens[pos + 1].type === 'text');
assert(tokens[pos + 1].type === TokenType.TEXT);
assert(token_eq(tokens[pos + 2], RIGHT_CURLY_BRACKET));
const text = tokens[pos + 1].value;
return [{ type: 'text', content: text }, pos + 3];
Expand Down Expand Up @@ -600,7 +599,7 @@ export class LatexParser {

let pos = start + 1;
assert(token_eq(tokens[pos], LEFT_CURLY_BRACKET));
assert(tokens[pos + 1].type === 'text');
assert(tokens[pos + 1].type === TokenType.TEXT);
assert(token_eq(tokens[pos + 2], RIGHT_CURLY_BRACKET));
const envName = tokens[pos + 1].value;
pos += 3;
Expand All @@ -617,7 +616,7 @@ export class LatexParser {
pos = endIdx + 1;

assert(token_eq(tokens[pos], LEFT_CURLY_BRACKET));
assert(tokens[pos + 1].type === 'text');
assert(tokens[pos + 1].type === TokenType.TEXT);
assert(token_eq(tokens[pos + 2], RIGHT_CURLY_BRACKET));
if (tokens[pos + 1].value !== envName) {
throw new LatexParserError('Mismatched \\begin and \\end environments');
Expand All @@ -626,7 +625,7 @@ export class LatexParser {

const exprInside = tokens.slice(exprInsideStart, exprInsideEnd);
// ignore whitespaces and '\n' before \end{envName}
while(exprInside.length > 0 && ['whitespace', 'newline'].includes(exprInside[exprInside.length - 1].type)) {
while(exprInside.length > 0 && [TokenType.WHITESPACE, TokenType.NEWLINE].includes(exprInside[exprInside.length - 1].type)) {
exprInside.pop();
}
const body = this.parseAligned(exprInside);
Expand Down Expand Up @@ -670,10 +669,10 @@ function passIgnoreWhitespaceBeforeScriptMark(tokens: Token[]): Token[] {
const is_script_mark = (token: Token) => token_eq(token, SUB_SYMBOL) || token_eq(token, SUP_SYMBOL);
let out_tokens: Token[] = [];
for (let i = 0; i < tokens.length; i++) {
if (tokens[i].type === 'whitespace' && i + 1 < tokens.length && is_script_mark(tokens[i + 1])) {
if (tokens[i].type === TokenType.WHITESPACE && i + 1 < tokens.length && is_script_mark(tokens[i + 1])) {
continue;
}
if (tokens[i].type === 'whitespace' && i - 1 >= 0 && is_script_mark(tokens[i - 1])) {
if (tokens[i].type === TokenType.WHITESPACE && i - 1 >= 0 && is_script_mark(tokens[i - 1])) {
continue;
}
out_tokens.push(tokens[i]);
Expand All @@ -685,7 +684,7 @@ function passIgnoreWhitespaceBeforeScriptMark(tokens: Token[]): Token[] {
function passExpandCustomTexMacros(tokens: Token[], customTexMacros: {[key: string]: string}): Token[] {
let out_tokens: Token[] = [];
for (const token of tokens) {
if (token.type === 'command' && customTexMacros[token.value]) {
if (token.type === TokenType.COMMAND && customTexMacros[token.value]) {
const expanded_tokens = tokenize(customTexMacros[token.value]);
out_tokens = out_tokens.concat(expanded_tokens);
} else {
Expand Down
17 changes: 17 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
/**
 * Kind of a lexical token produced by the LaTeX tokenizer.
 *
 * Implemented as a *string* enum rather than the default numeric enum:
 * string enums have no surprising reverse mapping (`TokenType[0]`),
 * and the values are self-describing when a Token is logged or
 * serialized during debugging. Members are still compared by identity
 * (`token.type === TokenType.ELEMENT`), so all existing call sites
 * keep working unchanged.
 */
export enum TokenType {
    /** A printable math element: letter, digit run, operator, or escaped char like `\{`. */
    ELEMENT = 'element',
    /** A backslash command, stored with its leading backslash (e.g. `\frac`). */
    COMMAND = 'command',
    /** Literal text content, e.g. the body of `\text{...}`. */
    TEXT = 'text',
    /** A `%`-comment (value excludes the `%` and the trailing newline). */
    COMMENT = 'comment',
    /** A run of consecutive space characters. */
    WHITESPACE = 'whitespace',
    /** A line break (`\n`, with `\r\n` normalized to `\n`). */
    NEWLINE = 'newline',
    /** Structural syntax: `{`, `}`, `_`, `^`, `&`, `\\`, `\,`. */
    CONTROL = 'control',
    /** Any character the tokenizer does not recognize. */
    UNKNOWN = 'unknown',
}

/**
 * A single lexical token emitted by the LaTeX tokenizer.
 */
export interface Token {
    /** The token's category (element, command, control, ...). */
    type: TokenType;
    /**
     * The raw source text of the token. For COMMAND tokens this includes
     * the leading backslash (e.g. `\frac`); for COMMENT tokens it excludes
     * the `%` marker; for TEXT tokens it is the unescaped text content.
     */
    value: string;
}


export interface TexSupsubData {
base: TexNode;
sup?: TexNode;
Expand Down
4 changes: 2 additions & 2 deletions test/main.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@ import { describe, it, test, expect } from 'vitest';
import yaml from 'js-yaml';
import path from 'node:path';
import fs from 'node:fs';
import { parseTex, LatexParserError, Token, tokenize } from '../src/parser';
import { parseTex, tokenize } from '../src/parser';
import { tex2typst } from '../src/index';
import { TypstWriterError } from '../src/writer';
import { Tex2TypstOptions, TexNode } from '../src/types';
import { Tex2TypstOptions, TexNode, Token } from '../src/types';

type TestCase = {
title: string;
Expand Down

0 comments on commit 3da187f

Please sign in to comment.