Skip to content

Commit 81da13f

Browse files
sleepy-monax authored and Louciole committed
vaev-css: Spec compliant string lexing.
1 parent b834738 commit 81da13f

File tree

4 files changed

+110
-52
lines changed

4 files changed

+110
-52
lines changed

src/vaev-engine/css/lexer.cpp

Lines changed: 102 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -211,37 +211,6 @@ static auto const RE_NUMBER = Re::chain(
211211
)
212212
);
213213

214-
// string token description
215-
static auto const RE_STRING =
216-
Re::either(
217-
Re::chain(
218-
'"'_re,
219-
Re::zeroOrMore(
220-
Re::either(
221-
RE_ESCAPE,
222-
Re::negate(
223-
'\\'_re | RE_NEWLINE | '\"'_re
224-
),
225-
'\\'_re & RE_NEWLINE
226-
)
227-
),
228-
'"'_re
229-
),
230-
Re::chain(
231-
'\''_re,
232-
Re::zeroOrMore(
233-
Re::either(
234-
RE_ESCAPE,
235-
Re::negate(
236-
'\\'_re | RE_NEWLINE | '\''_re
237-
),
238-
'\\'_re & RE_NEWLINE
239-
)
240-
),
241-
'\''_re
242-
)
243-
);
244-
245214
export struct Lexer {
246215
Io::SScan _scan;
247216
Token _curr;
@@ -274,6 +243,105 @@ export struct Lexer {
274243
return {Token::FUNCTION, s.end()};
275244
}
276245

246+
// https://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape
247+
bool _checkValidEscape(Io::SScan& s) {
248+
if (s.rem() < 2)
249+
return false;
250+
// If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
251+
if (s.peek() != '\\')
252+
return false;
253+
// Otherwise, if the second code point is a newline, return false.
254+
else if (s.ahead('\\'_re & RE_NEWLINE))
255+
return false;
256+
// Otherwise, return true.
257+
return true;
258+
}
259+
260+
// https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
261+
Rune _consumeEscapeCodepoint(Io::SScan& s) const {
262+
if (not s.skip('\\'))
263+
return U'';
264+
265+
// hex digit
266+
if (auto hex = s.token(Re::nOrN(1, 5, Re::xdigit()))) {
267+
// Consume as many hex digits as possible, but no more than 5.
268+
// NOTE: This means 1-6 hex digits have been consumed in total.
269+
270+
// If the next input code point is whitespace, consume it as well.
271+
s.skip(RE_WHITESPACE);
272+
273+
// Interpret the hex digits as a hexadecimal number.
274+
auto num = Io::atou(hex, {.base = 16}).unwrap();
275+
276+
// If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point
277+
if (0xD800 <= num and num <= 0xDFFF)
278+
return U'';
279+
if (num == 0 or num > 0x10FFFF)
280+
// return U+FFFD REPLACEMENT CHARACTER (�).
281+
return U'';
282+
283+
// Otherwise, return the code point with that value.
284+
return num;
285+
}
286+
// EOF
287+
else if (s.ended()) {
288+
// This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
289+
logWarn("Unexpected EOF");
290+
return U'';
291+
}
292+
293+
// anything else
294+
else {
295+
return s.next();
296+
}
297+
}
298+
299+
// https://www.w3.org/TR/css-syntax-3/#consume-string-token
300+
Token _consumeStringToken(Io::SScan& s, Opt<Rune> endingCodepoint = NONE) const {
301+
// Initially create a <string-token> with its value set to the empty string.
302+
StringBuilder sb;
303+
if (not endingCodepoint)
304+
endingCodepoint = s.next();
305+
306+
// Repeatedly consume the next input code point from the stream:
307+
while (not s.ended()) {
308+
// ending code point
309+
if (s.peek() == endingCodepoint) {
310+
// Return the <string-token>.
311+
s.next();
312+
return {Token::STRING, sb.take()};
313+
}
314+
// newline
315+
else if (s.ahead(RE_NEWLINE)) {
316+
// This is a parse error. Reconsume the current input code point, create a <bad-string-token>, and return it.
317+
logWarn("tokenizing bad string due to newline");
318+
return {Token::BAD_STRING, sb.take()};
319+
}
320+
// U+005C REVERSE SOLIDUS (\)
321+
else if (s.peek() == '\\') {
322+
// If the next input code point is EOF, do nothing.
323+
if (s.rem() == 1)
324+
s.next();
325+
// Otherwise, if the next input code point is a newline, consume it.
326+
else if (s.skip('\\'_re & RE_NEWLINE))
327+
/* consumed by skip() */;
328+
// Otherwise, (the stream starts with a valid escape) consume an escaped code point and append the returned code point to the <string-token>’s value.
329+
else
330+
sb.append(_consumeEscapeCodepoint(s));
331+
}
332+
// anything else
333+
else {
334+
// Append the current input code point to the <string-token>’s value.
335+
sb.append(s.next());
336+
}
337+
}
338+
339+
// EOF
340+
// This is a parse error. Return the <string-token>.
341+
logWarn("unexpected EOF");
342+
return {Token::STRING, sb.take()};
343+
}
344+
277345
Token _next(Io::SScan& s) const {
278346
s.begin();
279347
if (s.ended()) {
@@ -321,13 +389,12 @@ export struct Lexer {
321389
return _nextIdent(s);
322390
} else if (s.skip(RE_AT_KEYWORD)) {
323391
return {Token::AT_KEYWORD, s.end()};
324-
} else if (s.skip(RE_STRING)) {
325-
// https://www.w3.org/TR/css-syntax-3/#consume-string-token
326-
return {Token::STRING, s.end()};
392+
} else if (s.peek() == '"' or s.peek() == '\'') {
393+
return _consumeStringToken(s);
327394
} else if (s.skip(RE_DELIM)) {
328395
return {Token::DELIM, s.end()};
329396
} else {
330-
logWarn("unrecognized token: {}", s.end());
397+
logWarn("unrecognized token: {}", s.remStr());
331398
s.next();
332399
return {Token::OTHER, s.end()};
333400
}

src/vaev-engine/css/tests/test-lexer.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,14 @@ test$("vaev-css-lex-hash") {
5252
test$("vaev-css-lex-strings") {
5353
auto t = lex("''");
5454
expectEq$(t.type, Token::STRING);
55+
expectEq$(t.data, "");
5556

5657
t = lex("\"\"");
5758
expectEq$(t.type, Token::STRING);
59+
expectEq$(t.data, "");
60+
61+
t = lex(R"("\"")");
62+
expectEq$(t.type, Token::STRING);
5863

5964
t = lex("\"abc\"");
6065
expectEq$(t.type, Token::STRING);

src/vaev-engine/loader/loader-impl.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import Karm.Ref;
1313
import Karm.Logger;
1414
import Karm.Gfx;
1515
import Karm.Sys;
16+
import Karm.Font;
1617

1718
import :dom.window;
1819
import :layout.values;

src/vaev-engine/values/primitives.cpp

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -73,23 +73,8 @@ struct ValueParser<String> {
7373
static Res<String> parse(Cursor<Css::Sst>& c) {
7474
if (c.ended())
7575
return Error::invalidData("unexpected end of input");
76-
77-
if (c.peek() == Css::Token::STRING) {
78-
// TODO: Handle escape sequences
79-
Io::SScan s = c.next().token.data.str();
80-
StringBuilder sb{s.rem()};
81-
auto quote = s.next();
82-
while (not s.skip(quote) and not s.ended()) {
83-
if (s.skip('\\') and not s.ended()) {
84-
if (s.skip('\\'))
85-
sb.append(s.next());
86-
} else {
87-
sb.append(s.next());
88-
}
89-
}
90-
return Ok(sb.take());
91-
}
92-
76+
if (c.peek() == Css::Token::STRING)
77+
return Ok(c.next().token.data);
9378
return Error::invalidData("expected string");
9479
}
9580
};

0 commit comments

Comments
 (0)