From f3affa903596f3fce134252fef83587c4f360f0a Mon Sep 17 00:00:00 2001 From: Nicolas Chevobbe Date: Fri, 22 Mar 2024 11:37:20 +0100 Subject: [PATCH 1/2] Add ParserState method to get current utf16 position This adds a `utf16_position` method on `ParserState`, exposing a `current_position` field that we compute. The implementation is closely following what was done to compute the utf16 column position. --- src/parser.rs | 7 +++++++ src/size_of_tests.rs | 6 +++--- src/tests.rs | 37 ++++++++++++++++++++----------------- src/tokenizer.rs | 21 +++++++++++++++++++-- 4 files changed, 49 insertions(+), 22 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 77a21008..00052df2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -18,6 +18,7 @@ use std::ops::Range; pub struct ParserState { pub(crate) position: usize, pub(crate) current_line_start_position: usize, + pub(crate) current_position: usize, pub(crate) current_line_number: u32, pub(crate) at_start_of: Option, } @@ -37,6 +38,12 @@ impl ParserState { column: (self.position - self.current_line_start_position + 1) as u32, } } + + /// The position from the start of the input, counted in UTF-16 code units + #[inline] + pub fn utf16_position(&self) -> u32 { + self.current_position as u32 + } } /// When parsing until a given token, sometimes the caller knows that parsing is going to restart diff --git a/src/size_of_tests.rs b/src/size_of_tests.rs index edd2b439..ae9fd01b 100644 --- a/src/size_of_tests.rs +++ b/src/size_of_tests.rs @@ -42,11 +42,11 @@ size_of_test!(token, Token, 32); size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32); size_of_test!(cow_rc_str, CowRcStr, 16); -size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72); -size_of_test!(parser_input, crate::parser::ParserInput, 136); +size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80); +size_of_test!(parser_input, crate::parser::ParserInput, 152); size_of_test!(parser, crate::parser::Parser, 16); size_of_test!(source_position, 
crate::SourcePosition, 8); -size_of_test!(parser_state, crate::ParserState, 24); +size_of_test!(parser_state, crate::ParserState, 32); size_of_test!(basic_parse_error, crate::BasicParseError, 40, 48); size_of_test!(parse_error_lower_bound, crate::ParseError<()>, 40, 48); diff --git a/src/tests.rs b/src/tests.rs index 98c51f61..386471f6 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1267,7 +1267,7 @@ fn roundtrip_percentage_token() { } #[test] -fn utf16_columns() { +fn utf16_columns_and_positions() { // This particular test serves two purposes. First, it checks // that the column number computations are correct. Second, it // checks that tokenizer code paths correctly differentiate @@ -1278,24 +1278,26 @@ fn utf16_columns() { // the column is in units of UTF-16, the 4-byte sequence results // in two columns. let tests = vec![ - ("", 1), - ("ascii", 6), - ("/*Qฮกโœˆ๐Ÿ†’*/", 10), - ("'Qฮกโœˆ๐Ÿ†’*'", 9), - ("\"\\\"'Qฮกโœˆ๐Ÿ†’*'", 12), - ("\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 10), - ("Qฮกโœˆ๐Ÿ†’", 6), - ("Qฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 15), - ("newline\r\nQฮกโœˆ๐Ÿ†’", 6), - ("url(Qฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’)", 20), - ("url(Qฮกโœˆ๐Ÿ†’)", 11), - ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’)", 16), - ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 15), - ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’ x", 17), - ("Qฮกโœˆ๐Ÿ†’()", 8), + ("", 1, 0), + ("ascii", 6, 5), + ("/*Qฮกโœˆ๐Ÿ†’*/", 10, 9), + ("/*Qฮกโœˆ\r\n๐Ÿ†’*/", 5, 11), + ("'Qฮกโœˆ๐Ÿ†’*'", 9, 8), + ("\"\\\"'Qฮกโœˆ๐Ÿ†’*'", 12, 11), + ("\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 10, 9), + ("Qฮกโœˆ๐Ÿ†’", 6, 5), + ("Qฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 15, 14), + ("newline\r\nQฮกโœˆ๐Ÿ†’", 6, 14), + ("url(Qฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’)", 20, 19), + ("url(Qฮกโœˆ๐Ÿ†’)", 11, 10), + ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’)", 16, 21), + ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 15, 20), + ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’ x", 17, 22), + ("url( \tQ)", 10, 9), + ("Qฮกโœˆ๐Ÿ†’()", 8, 7), // Test that under/over-flow of current_line_start_position is // handled properly; see the special case 
in consume_4byte_intro. - ("๐Ÿ†’", 3), + ("๐Ÿ†’", 3, 2), ]; for test in tests { @@ -1321,6 +1323,7 @@ fn utf16_columns() { // Check the resulting column. assert_eq!(parser.current_source_location().column, test.1); + assert_eq!(parser.state().utf16_position(), test.2, "test: {}", test.0); } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index f1716c71..06487588 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -214,6 +214,7 @@ pub struct Tokenizer<'a> { /// ensure that computing the column will give the result in units /// of UTF-16 characters. current_line_start_position: usize, + current_position: usize, current_line_number: u32, var_or_env_functions: SeenStatus, source_map_url: Option<&'a str>, @@ -234,6 +235,7 @@ impl<'a> Tokenizer<'a> { input, position: 0, current_line_start_position: 0, + current_position: 0, current_line_number: 0, var_or_env_functions: SeenStatus::DontCare, source_map_url: None, @@ -296,6 +298,7 @@ impl<'a> Tokenizer<'a> { ParserState { position: self.position, current_line_start_position: self.current_line_start_position, + current_position: self.current_position, current_line_number: self.current_line_number, at_start_of: None, } @@ -305,6 +308,7 @@ impl<'a> Tokenizer<'a> { pub fn reset(&mut self, state: &ParserState) { self.position = state.position; self.current_line_start_position = state.current_line_start_position; + self.current_position = state.current_position; self.current_line_number = state.current_line_number; } @@ -370,6 +374,7 @@ impl<'a> Tokenizer<'a> { debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C'); } } + self.current_position = self.current_position.wrapping_add(n); self.position += n } @@ -392,6 +397,7 @@ impl<'a> Tokenizer<'a> { // This takes two UTF-16 characters to represent, so we // actually have an undercount. 
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1); + self.current_position = self.current_position.wrapping_add(2); self.position += 1; } @@ -417,10 +423,13 @@ impl<'a> Tokenizer<'a> { // This takes two UTF-16 characters to represent, so we // actually have an undercount. self.current_line_start_position = self.current_line_start_position.wrapping_sub(1); + self.current_position = self.current_position.wrapping_add(2); } else if byte & 0xC0 == 0x80 { // Note that due to the special case for the 4-byte // sequence intro, we must use wrapping add here. self.current_line_start_position = self.current_line_start_position.wrapping_add(1); + } else { + self.current_position = self.current_position.wrapping_add(1); } } @@ -439,8 +448,10 @@ impl<'a> Tokenizer<'a> { let byte = self.next_byte_unchecked(); debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C'); self.position += 1; + self.current_position = self.current_position.wrapping_add(1); if byte == b'\r' && self.next_byte() == Some(b'\n') { self.position += 1; + self.current_position = self.current_position.wrapping_add(1); } self.current_line_start_position = self.position; self.current_line_number += 1; @@ -459,9 +470,11 @@ impl<'a> Tokenizer<'a> { self.position += len_utf8; // Note that due to the special case for the 4-byte sequence // intro, we must use wrapping add here. + let len_utf16 = c.len_utf16(); self.current_line_start_position = self .current_line_start_position - .wrapping_add(len_utf8 - c.len_utf16()); + .wrapping_add(len_utf8 - len_utf16); + self.current_position = self.current_position.wrapping_add(len_utf16); c } @@ -1151,12 +1164,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result, } }; match_byte! 
{ b, - b' ' | b'\t' => {}, + b' ' | b'\t' => { + tokenizer.current_position = tokenizer.current_position.wrapping_add(1); + }, b'\n' | b'\x0C' => { newlines += 1; last_newline = offset; + tokenizer.current_position = tokenizer.current_position.wrapping_add(1); } b'\r' => { + tokenizer.current_position = tokenizer.current_position.wrapping_add(1); if from_start.as_bytes().get(offset + 1) != Some(&b'\n') { newlines += 1; last_newline = offset; From 75ecffafea6d5ca6a664bf343821ad2d8c565150 Mon Sep 17 00:00:00 2001 From: Nicolas Chevobbe Date: Wed, 10 Apr 2024 10:23:33 +0200 Subject: [PATCH 2/2] Store utf8/utf16 position difference instead of counting utf16 positions --- src/parser.rs | 11 ++++++++--- src/tokenizer.rs | 50 +++++++++++++++++++++++------------------------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 00052df2..dc9a9d16 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -18,7 +18,8 @@ use std::ops::Range; pub struct ParserState { pub(crate) position: usize, pub(crate) current_line_start_position: usize, - pub(crate) current_position: usize, + pub(crate) current_line_start_difference: u16, + pub(crate) position_difference: u16, pub(crate) current_line_number: u32, pub(crate) at_start_of: Option, } @@ -35,14 +36,18 @@ impl ParserState { pub fn source_location(&self) -> SourceLocation { SourceLocation { line: self.current_line_number, - column: (self.position - self.current_line_start_position + 1) as u32, + column: ( + self.position - self.current_line_start_position - + (self.position_difference - self.current_line_start_difference) as usize + + 1 + ) as u32, } } /// The position from the start of the input, counted in UTF-16 code units #[inline] pub fn utf16_position(&self) -> u32 { - self.current_position as u32 + (self.position - self.position_difference as usize) as u32 } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 06487588..f5d143c0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs 
@@ -214,7 +214,8 @@ pub struct Tokenizer<'a> { /// ensure that computing the column will give the result in units /// of UTF-16 characters. current_line_start_position: usize, - current_position: usize, + position_difference: u16, + current_line_start_difference: u16, current_line_number: u32, var_or_env_functions: SeenStatus, source_map_url: Option<&'a str>, @@ -235,8 +236,9 @@ impl<'a> Tokenizer<'a> { input, position: 0, current_line_start_position: 0, - current_position: 0, + current_line_start_difference: 0, current_line_number: 0, + position_difference: 0, var_or_env_functions: SeenStatus::DontCare, source_map_url: None, source_url: None, @@ -279,7 +281,12 @@ impl<'a> Tokenizer<'a> { pub fn current_source_location(&self) -> SourceLocation { SourceLocation { line: self.current_line_number, - column: (self.position - self.current_line_start_position + 1) as u32, + column: ( + self.position - + self.current_line_start_position - + (self.position_difference - self.current_line_start_difference) as usize + + 1 + ) as u32, } } @@ -298,7 +305,8 @@ impl<'a> Tokenizer<'a> { ParserState { position: self.position, current_line_start_position: self.current_line_start_position, - current_position: self.current_position, + current_line_start_difference: self.current_line_start_difference, + position_difference: self.position_difference, current_line_number: self.current_line_number, at_start_of: None, } @@ -308,7 +316,8 @@ impl<'a> Tokenizer<'a> { pub fn reset(&mut self, state: &ParserState) { self.position = state.position; self.current_line_start_position = state.current_line_start_position; - self.current_position = state.current_position; + self.current_line_start_difference = state.current_line_start_difference; + self.position_difference = state.position_difference; self.current_line_number = state.current_line_number; } @@ -374,7 +383,6 @@ impl<'a> Tokenizer<'a> { debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C'); } } - self.current_position = 
self.current_position.wrapping_add(n); self.position += n } @@ -396,8 +404,7 @@ impl<'a> Tokenizer<'a> { debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0); // This takes two UTF-16 characters to represent, so we // actually have an undercount. - self.current_line_start_position = self.current_line_start_position.wrapping_sub(1); - self.current_position = self.current_position.wrapping_add(2); + self.position_difference = self.position_difference.wrapping_sub(1); self.position += 1; } @@ -409,7 +416,7 @@ impl<'a> Tokenizer<'a> { // Continuation bytes contribute to column overcount. Note // that due to the special case for the 4-byte sequence intro, // we must use wrapping add here. - self.current_line_start_position = self.current_line_start_position.wrapping_add(1); + self.position_difference = self.position_difference.wrapping_add(1); self.position += 1; } @@ -422,14 +429,11 @@ impl<'a> Tokenizer<'a> { if byte & 0xF0 == 0xF0 { // This takes two UTF-16 characters to represent, so we // actually have an undercount. - self.current_line_start_position = self.current_line_start_position.wrapping_sub(1); - self.current_position = self.current_position.wrapping_add(2); + self.position_difference = self.position_difference.wrapping_sub(1); } else if byte & 0xC0 == 0x80 { // Note that due to the special case for the 4-byte // sequence intro, we must use wrapping add here. 
- self.current_line_start_position = self.current_line_start_position.wrapping_add(1); - } else { - self.current_position = self.current_position.wrapping_add(1); + self.position_difference = self.position_difference.wrapping_add(1); } } @@ -448,12 +452,11 @@ impl<'a> Tokenizer<'a> { let byte = self.next_byte_unchecked(); debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C'); self.position += 1; - self.current_position = self.current_position.wrapping_add(1); if byte == b'\r' && self.next_byte() == Some(b'\n') { self.position += 1; - self.current_position = self.current_position.wrapping_add(1); } self.current_line_start_position = self.position; + self.current_line_start_difference = self.position_difference; self.current_line_number += 1; } @@ -467,14 +470,13 @@ impl<'a> Tokenizer<'a> { fn consume_char(&mut self) -> char { let c = self.next_char(); let len_utf8 = c.len_utf8(); + let len_utf16 = c.len_utf16(); self.position += len_utf8; // Note that due to the special case for the 4-byte sequence // intro, we must use wrapping add here. - let len_utf16 = c.len_utf16(); - self.current_line_start_position = self - .current_line_start_position - .wrapping_add(len_utf8 - len_utf16); - self.current_position = self.current_position.wrapping_add(len_utf16); + self.position_difference = self + .position_difference + .wrapping_add((len_utf8 - len_utf16) as u16); c } @@ -1164,16 +1166,12 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result, } }; match_byte! { b, - b' ' | b'\t' => { - tokenizer.current_position = tokenizer.current_position.wrapping_add(1); - }, + b' ' | b'\t' => {}, b'\n' | b'\x0C' => { newlines += 1; last_newline = offset; - tokenizer.current_position = tokenizer.current_position.wrapping_add(1); } b'\r' => { - tokenizer.current_position = tokenizer.current_position.wrapping_add(1); if from_start.as_bytes().get(offset + 1) != Some(&b'\n') { newlines += 1; last_newline = offset;