import Context from '#internal/context';
import Logger from '#internal/logger';

function resolve (value) {
  if (typeof value === 'function') { return value(); }
  return value;
}

function determineIfFinal (rule, context) {
  let final = rule.final;
  if (typeof final === 'boolean') { return final; }
  else if (typeof final === 'function') {
    return final(context);
  } else if (!final) {
    return false;
  } else {
    throw new TypeError(`Invalid value for rule.final: ${final}`);
  }
}

class LexerError extends Error {
  constructor (message) {
    super(message);
    this.name = 'LexerError';
  }
}
/**
 * A string fragment with contextual information. It has a name, content, and
 * an `index` value that corresponds to where it begins in the original
 * string. A token's content can be either a string or an array of tokens.
 *
 * Tokens are created automatically by lexers.
 */
class Token {
  constructor (name, content, index, lengthConsumed, options = {}) {
    /**
     * The name of the token. Determined by the name of the lexer rule that
     * created this token.
     * @type {string}
     */
    this.name = name;

    /**
     * The content of the token. Either a string or an array of `Token`s.
     * @type {string|Token[]}
     */
    this.content = content;

    /**
     * The index in the original string at which this token begins.
     * @type {number}
     */
    // “Length consumed” refers to the number of characters that have already
    // been processed in the original source string. All our indices should
    // be relative to this value.
    this.index = lengthConsumed + index;

    /**
     * The scope name (or names) attached to this token, if any.
     * @type {string|undefined}
     */
    this.scopes = options.scopes;
  }
}
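
// Illustrative example: tokens are ordinarily produced by `Lexer#run`, but
// constructing one by hand shows the shape. The rule name `string` and the
// scope value below are hypothetical.
//
//   let token = new Token('string', '"hi"', 3, 10, { scopes: 'string' });
//   token.name;    // => 'string'
//   token.content; // => '"hi"'
//   token.index;   // => 13 (10 characters already consumed + match index 3)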
/**
 * A lexer that consumes a string based on a series of rules. Can include
 * other lexers or designate “sub-lexers” for certain rules.
 *
 * @param {Object[]} rules A collection of rules. Each rule object must
 *   satisfy the “rule” contract.
 * @param {String} [name=''] A name for the lexer. Optional; used for
 *   debugging.
 * @param {Object} [options={}] A series of options.
 * @param {string} [options.scopes] Scope name(s) to report on the results of
 *   this lexer's runs.
 * @param {boolean} [options.silent=false] Whether the lexer should run
 *   silently.
 * @param {Function} [options.highlight] A callback invoked on the token list
 *   when `run` is called with `highlight: true`.
 */
class Lexer {
  constructor (rules, name = '', options = {}) {
    this.rules = rules;
    // A lexer can optionally have a name; this is mainly for debugging
    // purposes.
    this.name = name;
    this.scopes = options.scopes;
    this.silent = !!options.silent;
    this.highlight = options.highlight;
    this.logger = new Logger(`Lexer ${this.name}`);
  }
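
  // Illustrative example (rule names and patterns are hypothetical):
  //
  //   let stringLexer = new Lexer([
  //     { name: 'escape', pattern: /\\./ },
  //     { name: 'string-end', pattern: /"/, final: true }
  //   ], 'string');
  //
  // Rules are tried against the remaining text, and the match closest to the
  // beginning of the string wins; see `run` below.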
  /**
   * Add rules to the lexer.
   *
   * @param {Object[]|Lexer} rules An array of rules or an instance of
   *   `Lexer`. Each rule object must satisfy the “rule” contract.
   */
  addRules (rules) {
    this.rules.push(...rules);
  }

  // To iterate through a lexer is to iterate through its rules.
  [Symbol.iterator] () {
    let allRules = [];
    for (let rule of this.rules) {
      if (rule.include) {
        // An `include` rule pulls in all of another lexer's rules. The
        // reference may be wrapped in a function to defer its resolution.
        let lexer = resolve(rule.include);
        allRules.push(...lexer);
      } else {
        allRules.push(rule);
      }
    }
    return allRules.values();
  }
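
  // Illustrative example: `include` rules let one lexer reuse another's
  // rules. Wrapping the reference in a function (resolved lazily via
  // `resolve`) helps when two lexers refer to each other. The lexer names
  // here are hypothetical.
  //
  //   let htmlLexer = new Lexer([
  //     { include: () => scriptLexer },
  //     { name: 'tag', pattern: /<\/?[a-z]+>/ }
  //   ], 'html');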
  /**
   * Attempt to consume part or all of a string with this lexer.
   *
   * Will use its rules to break the string into tokens, matching as close to
   * the beginning of the string as possible with each successive matching
   * attempt.
   *
   * @param {string} text The text to consume.
   * @param {Context} [context=null] A context object to reuse. If one is not
   *   given, a fresh context will be created. All invocations of sub-lexers
   *   by this lexer will share the same context instance.
   * @param {Object} [options={}] A series of options.
   * @param {number} [options.startIndex=0] The index at which to start
   *   lexing the string. Defaults to 0.
   * @param {boolean} [options.highlight=false] Whether the results of this
   *   lexing run will be used for syntax highlighting. If `true`, will call
   *   a lexer's `highlight` callback (if present) so that applicable ranges
   *   are returned as tokenized HTML rather than the typical tree of
   *   `Token`s.
   *
   * @returns {Object} An object whose `tokens` key (also aliased as
   *   `content`) refers to an array of `Token`s and raw strings, whose
   *   `text` key contains whatever fragment of the string could not be
   *   parsed, and whose `lengthConsumed` key reports how many characters
   *   were consumed. The lexer's `name` and `scopes` are included as well.
   */
  run (text, context = null, { startIndex = 0, highlight = false } = {}) {
    let loggerWasEnabled = this.logger.enabled;
    if (context) {
      this.logger.toggle(context.logging);
    }
    let isRoot = context === null;
    let tokens = [];
    if (!context) {
      context = new Context();
    }
    let lastText = null;
    let lengthConsumed = startIndex;
    while (text) {
      // console.groupCollapsed(`${this.name ? `[${this.name}] ` : ''}Trying new match:`, text);
      // `cMatch` and `cRule` refer to a candidate match and a candidate
      // rule, respectively.
      let rule, match, cMatch;
      for (let cRule of this) {
        cRule.pattern.lastIndex = 0;
        cMatch = cRule.pattern.exec(text);
        if (cMatch && cRule.test) {
          let result = false;
          // Will this pattern allow us to keep going?
          if (cRule.pattern.lastIndex > cMatch.index) {
            // If this test passes, our regex has a `g` flag and will keep
            // updating its `lastIndex`. That allows us to keep iterating
            // until we get a positive result from this rule, or until there
            // are no more possible matches to test.
            while (cMatch) {
              result = cRule.test(cMatch, text, context, cRule.pattern);
              if (result) { break; }
              cMatch = cRule.pattern.exec(text);
            }
          } else {
            // A `test` rule, if defined, imposes further criteria on this
            // rule. If it returns truthy, we proceed. If it returns falsy,
            // the rule doesn't pass after all.
            result = cRule.test(cMatch, text, context, cRule.pattern);
            if (!result) { cMatch = null; }
          }
        }
        if (cMatch) {
          // This rule matched, but it's still only a _candidate_ for the
          // right match. We choose the one that matches as near to the
          // beginning of the string as possible.
          if (cMatch.index === 0) {
            // This pattern matched without skipping any text. It's the
            // winner.
            match = cMatch;
            rule = cRule;
            break;
          } else if (!match || (cMatch.index < match.index)) {
            // This is the match that has skipped the least text so far. But
            // keep going to see if we can find a better one.
            match = cMatch;
            rule = cRule;
          }
        }
      }
      if (!match) {
        // Failing to match anything will cause the lexer to return and
        // report its results.
        this.logger.debug(this.name, 'No match!');
        // console.groupEnd();
        break;
      } else {
        this.logger.debug(this.name, 'Match:', rule.name, match[0], match, rule);
        this.logger.debug(' found at global index:', match.index + lengthConsumed);
      }
      if (rule.win) {
        // Give the winning rule a chance to set context.
        rule.win(match, text, context);
      }
      let matchIndex = match.index;
      let newStartIndex = match.index + match[0].length;
      if (match.index > 0) {
        let skipped = text.slice(0, match.index);
        tokens.push(skipped);
        lengthConsumed += skipped.length;
        // Now that we've consumed all the skipped text, the match's index
        // should be reset to zero, as if that skipped text had already been
        // processed before the call to `exec`.
        matchIndex = 0;
      }
      text = text.slice(newStartIndex);
      // A rule with `final: true` will cause the lexer to stop parsing once
      // the current match has been processed.
      //
      // If we're inside a sub-lexer, this means that it will cede to its
      // parent, and the parent will continue parsing with its own rules.
      //
      // If we're not inside a sub-lexer, this means that it will cede to the
      // code that called it, even if the entire string hasn't been parsed
      // yet.
      //
      // The `final` property can be a boolean or a function. If it's a
      // function, it'll get called with the current `context` as its only
      // argument. This lets us decide dynamically whether the lexer should
      // terminate based on state.
      //
      // An additional property, `skipSubRulesIfFinal`, determines what
      // happens when a final rule also has a sub-lexer (via the `after` or
      // `inside` properties). If `false` (the default), `final` acts _after_
      // the sub-lexer has a chance to parse the remaining portion of the
      // string. If `true`, the lexer skips the sub-lexing phase even if
      // `inside` or `after` is present.
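      //
      // Illustrative example of a dynamic `final` value (this assumes the
      // `Context` store exposes a `get` method; the rule and key names are
      // hypothetical):
      //
      //   {
      //     name: 'block-end',
      //     pattern: /\}/,
      //     final: (context) => context.get('depth') === 0
      //   }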
      let ruleIsFinal = determineIfFinal(rule, context);
      let shouldSkipSubRules = ruleIsFinal && rule.skipSubRulesIfFinal;
      if (rule.trim) {
        if (rule.raw) {
          throw new LexerError(`Options "trim" and "raw" cannot coexist on a rule!`);
        }
        let originalMatch = match[0];
        // If `trim: true` is present, the rule wants us to strip leading
        // whitespace — or, rather, pull it out of the match and prepend it
        // as a raw token.
        let leadingMatch = /^\s+/.exec(originalMatch);
        if (leadingMatch) {
          tokens.push(leadingMatch[0]);
          match[0] = match[0].slice(leadingMatch[0].length);
          lengthConsumed += leadingMatch[0].length;
        }
      }
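      // Illustrative rule shapes (names and patterns are hypothetical):
      //
      //   // Emit the match as a raw string instead of a `Token`:
      //   { name: 'noise', pattern: /\bend\b/, raw: true }
      //
      //   // Detach leading whitespace from the match and emit it first:
      //   { name: 'keyword', pattern: /\s*\bdef\b/, trim: true }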
      if (rule.raw) {
        // Sometimes we write rules to match a string just to prevent it from
        // being matched by another rule. In these cases, we can use `raw:
        // true` to pass along the raw string rather than wrap it in a Token.
        tokens.push(match[0]);
        lengthConsumed += match[0].length;
      } else if ((rule.inside || rule.after) && !shouldSkipSubRules) {
        let lexerName, lexer, mode;
        // Often, when we encounter a certain pattern, we want to have a
        // different lexer parse some successive portion of the string before
        // we act again. This is how we apply context-aware parsing rules.
        //
        // We specify these "sub-lexers" via `inside` or `after`. The two
        // properties have identical shapes: they take a `name` property and
        // a `lexer` property, the values of which are a string and a Lexer
        // instance, respectively.
        //
        // Sub-lexing produces a token whose content is an array, rather than
        // a string, and which contains all the Tokens parsed by the
        // sub-lexer, in order. Its name will be the `name` value you
        // specified in your `after` or `inside` rule.
        //
        // The difference between `inside` and `after` is whether the rule
        // that prompted the sub-lexing is placed _within_ that Token (as the
        // first item in its `content` array) or just _after_ that token.
        //
        // Any `context` set inside your lexer will also be available to
        // sub-lexers. It's up to the sub-lexer to decide when to stop
        // parsing, but that decision can be made dynamically depending on
        // the values inside of the `context` store.
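        //
        // Illustrative example: when a `{` is matched, hand the remaining
        // text to a (hypothetical) `blockLexer` until it hits a final rule:
        //
        //   {
        //     name: 'block-start',
        //     pattern: /\{/,
        //     inside: { name: 'block', lexer: blockLexer }
        //   }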
        if (rule.inside) {
          mode = 'inside';
          lexerName = rule.inside.name;
          lexer = resolve(rule.inside.lexer);
        } else {
          mode = 'after';
          lexerName = rule.after.name;
          lexer = resolve(rule.after.lexer);
        }
        if (!lexer || !(lexer instanceof Lexer)) {
          throw new LexerError(`Invalid lexer!`);
        }
        let initialToken = new Token(
          rule.name,
          match[0],
          matchIndex,
          lengthConsumed,
          { scopes: rule.scopes || rule.name }
        );
        let subLexerStartIndex = lengthConsumed + match[0].length - matchIndex;
        let subTokens = [];
        if (mode === 'inside') {
          // In 'inside' mode, this initial token is part of the subtokens
          // collection.
          subTokens.push(initialToken);
        } else {
          // In 'after' mode, this initial token is not part of the subtokens
          // collection.
          tokens.push(initialToken);
        }
        this.logger.debug('Lexer START', lexerName, 'at index:', subLexerStartIndex, text);
        // To ensure accurate `index` values on Tokens, we need to tell the
        // sub-lexer how much of the string we've already consumed.
        let lexerResult = lexer.run(text, context, {
          startIndex: subLexerStartIndex,
          highlight
        });
        this.logger.debug('Lexer END', lexerName, 'consumed:', lexerResult.lengthConsumed);
        subTokens.push(...lexerResult.tokens);
        // Build the container token.
        let token = new Token(
          lexerName,
          subTokens,
          matchIndex,
          lengthConsumed,
          { scopes: lexerResult.scopes }
        );
        tokens.push(token);
        lengthConsumed = lexerResult.lengthConsumed;
        text = lexerResult.text;
      } else {
        // Ordinary rule with no `inside` or `after` property.
        let token = new Token(
          rule.name,
          match[0],
          matchIndex,
          lengthConsumed,
          { scopes: rule.scopes || rule.name }
        );
        tokens.push(token);
        lengthConsumed += match[0].length;
      }
      if (ruleIsFinal) {
        this.logger.debug(this.name, 'Last rule! Done.');
        // console.groupEnd();
        break;
      }
      // If we get this far without having consumed any more of the string
      // than the last iteration, then we should consider ourselves to be
      // done. This is in place to prevent accidental infinite loops, though
      // it does prevent us from using patterns that end up consuming zero
      // characters. In the future I might want this to behave differently.
      if (text === lastText) {
        this.logger.debug(this.name, `No further matches! Done.`);
        // console.groupEnd();
        break;
      }
      lastText = text;
      // console.groupEnd();
    } // end the gigantic `while` loop.
    if (highlight && this.highlight) {
      tokens = this.highlight(tokens, context);
    }
    // Report the results: the array of tokens (under both `tokens` and
    // `content`), the leftover text that could not be parsed (if any), the
    // number of characters that were consumed, and this lexer's `name` and
    // `scopes`.
    this.logger.toggle(loggerWasEnabled);
    return {
      name: this.name,
      scopes: this.scopes,
      content: tokens,
      tokens,
      text,
      lengthConsumed
    };
  }
}
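
// Illustrative usage (rule names and patterns are hypothetical;
// `Lexer#run` is the entry point):
//
//   let lexer = new Lexer([
//     { name: 'number', pattern: /\d+/ },
//     { name: 'word', pattern: /[a-z]+/ }
//   ], 'example');
//
//   let { tokens, text, lengthConsumed } = lexer.run('abc 123');
//   // `tokens` mixes `Token` instances with raw skipped strings (here, the
//   // space); `text` is whatever could not be parsed; `lengthConsumed`
//   // counts the characters that were.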
export {
  Token,
  Lexer as default
};