@@ -211,37 +211,6 @@ static auto const RE_NUMBER = Re::chain(
211211 )
212212);
213213
214- // String token pattern, one Re::chain per quote style (double quote first, then single quote).
// Between the opening and closing quote each chain accepts zero or more of: RE_ESCAPE,
// Re::negate(backslash | RE_NEWLINE | that chain's quote), or a backslash combined with
// RE_NEWLINE via `&` (presumably an escaped line break; confirm against Re's operators).
// NOTE(review): these lines carry the patch's '-' marker, and the later hunk swaps
// `s.skip(RE_STRING)` for `_consumeStringToken(s)`, so this pattern appears to be retired.
215- static auto const RE_STRING =
216- Re::either (
217- Re::chain (
218- ' "' _re,
219- Re::zeroOrMore (
220- Re::either (
221- RE_ESCAPE,
222- Re::negate (
223- ' \\ ' _re | RE_NEWLINE | ' \" ' _re
224- ),
225- '\\'_re & RE_NEWLINE
226- )
227- ),
228- '"'_re
229- ),
230- Re::chain(
231- ' \' ' _re,
232- Re::zeroOrMore (
233- Re::either (
234- RE_ESCAPE,
235- Re::negate (
236- ' \\ ' _re | RE_NEWLINE | ' \' ' _re
237- ),
238- '\\'_re & RE_NEWLINE
239- )
240- ),
241- '\''_re
242- )
243- );
244-
245214export struct Lexer {
246215 Io::SScan _scan;
247216 Token _curr;
@@ -274,6 +243,105 @@ export struct Lexer {
274243 return {Token::FUNCTION, s.end ()};
275244 }
276245
246+ // https://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape
247+ bool _checkValidEscape (Io::SScan& s) {
248+ if (s.rem () < 2 )
249+ return false ;
250+ // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
251+ if (s.peek () != ' \\ ' )
252+ return false ;
253+ // Otherwise, if the second code point is a newline, return false.
254+ else if (s.ahead (' \\ ' _re & RE_NEWLINE))
255+ return false ;
256+ // Otherwise, return true.
257+ return true ;
258+ }
259+
260+ // https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
261+ Rune _consumeEscapeCodepoint (Io::SScan& s) const {
262+ if (not s.skip (' \\ ' ))
263+ return U' �' ;
264+
265+ // hex digit
266+ if (auto hex = s.token (Re::nOrN (1 , 5 , Re::xdigit ()))) {
267+ // Consume as many hex digits as possible, but no more than 5.
268+ // NOTE: This means 1-6 hex digits have been consumed in total.
269+
270+ // If the next input code point is whitespace, consume it as well.
271+ s.skip (RE_WHITESPACE);
272+
273+ // Interpret the hex digits as a hexadecimal number.
274+ auto num = Io::atou (hex, {.base = 16 }).unwrap ();
275+
276+ // If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point
277+ if (0xD800 <= num and num <= 0xDFFF )
278+ return U' �' ;
279+ if (num == 0 or num > 0x10FFFF )
280+ // return U+FFFD REPLACEMENT CHARACTER (�).
281+ return U' �' ;
282+
283+ // Otherwise, return the code point with that value.
284+ return num;
285+ }
286+ // EOF
287+ else if (s.ended ()) {
288+ // This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
289+ logWarn (" Unexpected EOF" );
290+ return U' �' ;
291+ }
292+
293+ // anything else
294+ else {
295+ return s.next ();
296+ }
297+ }
298+
299+ // https://www.w3.org/TR/css-syntax-3/#consume-string-token
300+ Token _consumeStringToken (Io::SScan& s, Opt<Rune> endingCodepoint = NONE) const {
301+ // Initially create a <string-token> with its value set to the empty string.
302+ StringBuilder sb;
303+ if (not endingCodepoint)
304+ endingCodepoint = s.next ();
305+
306+ // Repeatedly consume the next input code point from the stream:
307+ while (not s.ended ()) {
308+ // ending code point
309+ if (s.peek () == endingCodepoint) {
310+ // Return the <string-token>.
311+ s.next ();
312+ return {Token::STRING, sb.take ()};
313+ }
314+ // newline
315+ else if (s.ahead (RE_NEWLINE)) {
316+ // This is a parse error. Reconsume the current input code point, create a <bad-string-token>, and return it.
317+ logWarn (" tokenizing bad string due to newline" );
318+ return {Token::BAD_STRING, sb.take ()};
319+ }
320+ // U+005C REVERSE SOLIDUS (\)
321+ else if (s.peek () == ' \\ ' ) {
322+ // If the next input code point is EOF, do nothing.
323+ if (s.rem () == 1 )
324+ s.next ();
325+ // Otherwise, if the next input code point is a newline, consume it.
326+ else if (s.skip (' \\ ' _re & RE_NEWLINE))
327+ /* consumed by skip() */ ;
328+ // Otherwise, (the stream starts with a valid escape) consume an escaped code point and append the returned code point to the <string-token>’s value.
329+ else
330+ sb.append (_consumeEscapeCodepoint (s));
331+ }
332+ // anything else
333+ else {
334+ // Append the current input code point to the <string-token>’s value.
335+ sb.append (s.next ());
336+ }
337+ }
338+
339+ // EOF
340+ // This is a parse error. Return the <string-token>.
341+ logWarn (" unexpected EOF" );
342+ return {Token::STRING, sb.take ()};
343+ }
344+
277345 Token _next (Io::SScan& s) const {
278346 s.begin ();
279347 if (s.ended ()) {
@@ -321,13 +389,12 @@ export struct Lexer {
321389 return _nextIdent (s);
322390 } else if (s.skip (RE_AT_KEYWORD)) {
323391 return {Token::AT_KEYWORD, s.end ()};
324- } else if (s.skip (RE_STRING)) {
325- // https://www.w3.org/TR/css-syntax-3/#consume-string-token
326- return {Token::STRING, s.end ()};
392+ } else if (s.peek () == ' "' or s.peek () == ' \' ' ) {
393+ return _consumeStringToken (s);
327394 } else if (s.skip (RE_DELIM)) {
328395 return {Token::DELIM, s.end ()};
329396 } else {
330- logWarn (" unrecognized token: {}" , s.end ());
397+ logWarn (" unrecognized token: {}" , s.remStr ());
331398 s.next ();
332399 return {Token::OTHER, s.end ()};
333400 }
0 commit comments