diff --git a/src/parser.rs b/src/parser.rs
index 77a21008..dc9a9d16 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -18,6 +18,8 @@ use std::ops::Range;
 pub struct ParserState {
     pub(crate) position: usize,
     pub(crate) current_line_start_position: usize,
+    pub(crate) current_line_start_difference: u16,
+    pub(crate) position_difference: u16,
     pub(crate) current_line_number: u32,
     pub(crate) at_start_of: Option<BlockType>,
 }
@@ -34,9 +36,19 @@ impl ParserState {
     pub fn source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position - self.current_line_start_position -
+                (self.position_difference - self.current_line_start_difference) as usize
+                + 1
+            ) as u32,
         }
     }
+
+    /// The position from the start of the input, counted in UTF-16 code units
+    #[inline]
+    pub fn utf16_position(&self) -> u32 {
+        (self.position - self.position_difference as usize) as u32
+    }
 }
 
 /// When parsing until a given token, sometimes the caller knows that parsing is going to restart
diff --git a/src/size_of_tests.rs b/src/size_of_tests.rs
index edd2b439..ae9fd01b 100644
--- a/src/size_of_tests.rs
+++ b/src/size_of_tests.rs
@@ -42,11 +42,11 @@
 size_of_test!(token, Token, 32);
 size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32);
 size_of_test!(cow_rc_str, CowRcStr, 16);
 
-size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72);
-size_of_test!(parser_input, crate::parser::ParserInput, 136);
+size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80);
+size_of_test!(parser_input, crate::parser::ParserInput, 152);
 size_of_test!(parser, crate::parser::Parser, 16);
 size_of_test!(source_position, crate::SourcePosition, 8);
-size_of_test!(parser_state, crate::ParserState, 24);
+size_of_test!(parser_state, crate::ParserState, 32);
 size_of_test!(basic_parse_error, crate::BasicParseError, 40, 48);
 size_of_test!(parse_error_lower_bound, crate::ParseError<()>, 40, 48);
diff --git a/src/tests.rs b/src/tests.rs
index 98c51f61..386471f6 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -1267,7 +1267,7 @@ fn roundtrip_percentage_token() {
 }
 
 #[test]
-fn utf16_columns() {
+fn utf16_columns_and_positions() {
     // This particular test serves two purposes. First, it checks
     // that the column number computations are correct. Second, it
     // checks that tokenizer code paths correctly differentiate
@@ -1278,24 +1278,26 @@
     // the column is in units of UTF-16, the 4-byte sequence results
     // in two columns.
     let tests = vec![
-        ("", 1),
-        ("ascii", 6),
-        ("/*QΡ✈🆒*/", 10),
-        ("'QΡ✈🆒*'", 9),
-        ("\"\\\"'QΡ✈🆒*'", 12),
-        ("\\Q\\Ρ\\✈\\🆒", 10),
-        ("QΡ✈🆒", 6),
-        ("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15),
-        ("newline\r\nQΡ✈🆒", 6),
-        ("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 20),
-        ("url(QΡ✈🆒)", 11),
-        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 16),
-        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15),
-        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 17),
-        ("QΡ✈🆒()", 8),
+        ("", 1, 0),
+        ("ascii", 6, 5),
+        ("/*QΡ✈🆒*/", 10, 9),
+        ("/*QΡ✈\r\n🆒*/", 5, 11),
+        ("'QΡ✈🆒*'", 9, 8),
+        ("\"\\\"'QΡ✈🆒*'", 12, 11),
+        ("\\Q\\Ρ\\✈\\🆒", 10, 9),
+        ("QΡ✈🆒", 6, 5),
+        ("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15, 14),
+        ("newline\r\nQΡ✈🆒", 6, 14),
+        ("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 20, 19),
+        ("url(QΡ✈🆒)", 11, 10),
+        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 16, 21),
+        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15, 20),
+        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 17, 22),
+        ("url( \tQ)", 10, 9),
+        ("QΡ✈🆒()", 8, 7),
         // Test that under/over-flow of current_line_start_position is
         // handled properly; see the special case in consume_4byte_intro.
-        ("🆒", 3),
+        ("🆒", 3, 2),
     ];
 
     for test in tests {
@@ -1321,6 +1323,7 @@
 
         // Check the resulting column.
         assert_eq!(parser.current_source_location().column, test.1);
+        assert_eq!(parser.state().utf16_position(), test.2, "test: {}", test.0);
     }
 }
 
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index f1716c71..f5d143c0 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -214,6 +214,8 @@ pub struct Tokenizer<'a> {
     /// ensure that computing the column will give the result in units
     /// of UTF-16 characters.
     current_line_start_position: usize,
+    position_difference: u16,
+    current_line_start_difference: u16,
     current_line_number: u32,
     var_or_env_functions: SeenStatus,
     source_map_url: Option<&'a str>,
@@ -234,7 +236,9 @@ impl<'a> Tokenizer<'a> {
             input,
             position: 0,
             current_line_start_position: 0,
+            current_line_start_difference: 0,
             current_line_number: 0,
+            position_difference: 0,
             var_or_env_functions: SeenStatus::DontCare,
             source_map_url: None,
             source_url: None,
@@ -277,7 +281,12 @@ impl<'a> Tokenizer<'a> {
     pub fn current_source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position -
+                self.current_line_start_position -
+                (self.position_difference - self.current_line_start_difference) as usize
+                + 1
+            ) as u32,
         }
     }
 
@@ -296,6 +305,8 @@ impl<'a> Tokenizer<'a> {
         ParserState {
             position: self.position,
             current_line_start_position: self.current_line_start_position,
+            current_line_start_difference: self.current_line_start_difference,
+            position_difference: self.position_difference,
             current_line_number: self.current_line_number,
             at_start_of: None,
         }
     }
@@ -305,6 +316,8 @@
     pub fn reset(&mut self, state: &ParserState) {
         self.position = state.position;
         self.current_line_start_position = state.current_line_start_position;
+        self.current_line_start_difference = state.current_line_start_difference;
+        self.position_difference = state.position_difference;
         self.current_line_number = state.current_line_number;
     }
 
@@ -391,7 +404,7 @@ impl<'a> Tokenizer<'a> {
         debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
         // This takes two UTF-16 characters to represent, so we
         // actually have an undercount.
-        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
+        self.position_difference = self.position_difference.wrapping_sub(1);
         self.position += 1;
     }
 
@@ -403,7 +416,7 @@ impl<'a> Tokenizer<'a> {
         // Continuation bytes contribute to column overcount. Note
         // that due to the special case for the 4-byte sequence intro,
         // we must use wrapping add here.
-        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+        self.position_difference = self.position_difference.wrapping_add(1);
         self.position += 1;
     }
 
@@ -416,11 +429,11 @@ impl<'a> Tokenizer<'a> {
         if byte & 0xF0 == 0xF0 {
             // This takes two UTF-16 characters to represent, so we
             // actually have an undercount.
-            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
+            self.position_difference = self.position_difference.wrapping_sub(1);
         } else if byte & 0xC0 == 0x80 {
             // Note that due to the special case for the 4-byte
             // sequence intro, we must use wrapping add here.
-            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+            self.position_difference = self.position_difference.wrapping_add(1);
         }
     }
 
@@ -443,6 +456,7 @@ impl<'a> Tokenizer<'a> {
             self.position += 1;
         }
         self.current_line_start_position = self.position;
+        self.current_line_start_difference = self.position_difference;
         self.current_line_number += 1;
     }
 
@@ -456,12 +470,13 @@ impl<'a> Tokenizer<'a> {
     fn consume_char(&mut self) -> char {
         let c = self.next_char();
         let len_utf8 = c.len_utf8();
+        let len_utf16 = c.len_utf16();
         self.position += len_utf8;
         // Note that due to the special case for the 4-byte sequence
         // intro, we must use wrapping add here.
-        self.current_line_start_position = self
-            .current_line_start_position
-            .wrapping_add(len_utf8 - c.len_utf16());
+        self.position_difference = self
+            .position_difference
+            .wrapping_add((len_utf8 - len_utf16) as u16);
         c
     }
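As a usage sketch, assuming a build with this patch applied, here is how the new accessor behaves; it mirrors the `("QΡ✈🆒", 6, 5)` row from the test table above:

```rust
use cssparser::{Parser, ParserInput};

fn main() {
    // "QΡ✈🆒" covers 1-, 2-, 3- and 4-byte UTF-8 sequences: 10 bytes,
    // but only 5 UTF-16 code units, since 🆒 needs a surrogate pair.
    let mut input = ParserInput::new("QΡ✈🆒");
    let mut parser = Parser::new(&mut input);

    // Drive the tokenizer to the end of the input.
    while parser.next_including_whitespace_and_comments().is_ok() {}

    // Columns stay 1-based and counted in UTF-16 code units,
    // so this reads 6 rather than the byte-based 11.
    assert_eq!(parser.current_source_location().column, 6);

    // The new accessor: 0-based offset from the start of the input,
    // also in UTF-16 code units.
    assert_eq!(parser.state().utf16_position(), 5);
}
```

The `position_difference` fields accumulate how far the byte count has drifted from the UTF-16 count, so both the column and `utf16_position()` reduce to a subtraction; the `wrapping_sub`/`wrapping_add` pairs keep the transient undercount from the 4-byte intro byte from underflowing (and panicking in debug builds) before the continuation bytes add it back.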