Skip to content

Add ParserState method to get current utf16 position #374

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Open
This pull request wants to merge 2 commits into the base branch `main` (choose a base branch) from the contributor's branch.
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion src/parser.rs
Original file line number Diff line number Diff line change
@@ -18,6 +18,8 @@ use std::ops::Range;
pub struct ParserState {
pub(crate) position: usize,
pub(crate) current_line_start_position: usize,
pub(crate) current_line_start_difference: u16,
pub(crate) position_difference: u16,
pub(crate) current_line_number: u32,
pub(crate) at_start_of: Option<BlockType>,
}
@@ -34,9 +36,19 @@ impl ParserState {
pub fn source_location(&self) -> SourceLocation {
SourceLocation {
line: self.current_line_number,
column: (self.position - self.current_line_start_position + 1) as u32,
column: (
self.position - self.current_line_start_position -
(self.position_difference - self.current_line_start_difference) as usize +
1
) as u32,
}
}

/// The position from the start of the input, counted in UTF-16 code units
#[inline]
pub fn utf16_position(&self) -> u32 {
(self.position - self.position_difference as usize) as u32
}
}

/// When parsing until a given token, sometimes the caller knows that parsing is going to restart
6 changes: 3 additions & 3 deletions src/size_of_tests.rs
Original file line number Diff line number Diff line change
@@ -42,11 +42,11 @@ size_of_test!(token, Token, 32);
size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32);
size_of_test!(cow_rc_str, CowRcStr, 16);

size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72);
size_of_test!(parser_input, crate::parser::ParserInput, 136);
size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80);
size_of_test!(parser_input, crate::parser::ParserInput, 152);
size_of_test!(parser, crate::parser::Parser, 16);
size_of_test!(source_position, crate::SourcePosition, 8);
size_of_test!(parser_state, crate::ParserState, 24);
size_of_test!(parser_state, crate::ParserState, 32);

size_of_test!(basic_parse_error, crate::BasicParseError, 40, 48);
size_of_test!(parse_error_lower_bound, crate::ParseError<()>, 40, 48);
37 changes: 20 additions & 17 deletions src/tests.rs
Original file line number Diff line number Diff line change
@@ -1267,7 +1267,7 @@ fn roundtrip_percentage_token() {
}

#[test]
fn utf16_columns() {
fn utf16_columns_and_positions() {
// This particular test serves two purposes. First, it checks
// that the column number computations are correct. Second, it
// checks that tokenizer code paths correctly differentiate
@@ -1278,24 +1278,26 @@ fn utf16_columns() {
// the column is in units of UTF-16, the 4-byte sequence results
// in two columns.
let tests = vec![
("", 1),
("ascii", 6),
("/*QΡ✈🆒*/", 10),
("'QΡ✈🆒*'", 9),
("\"\\\"'QΡ✈🆒*'", 12),
("\\Q\\Ρ\\✈\\🆒", 10),
("QΡ✈🆒", 6),
("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15),
("newline\r\nQΡ✈🆒", 6),
("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 20),
("url(QΡ✈🆒)", 11),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 16),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 17),
("QΡ✈🆒()", 8),
("", 1, 0),
("ascii", 6, 5),
("/*QΡ✈🆒*/", 10, 9),
("/*QΡ✈\r\n🆒*/", 5, 11),
("'QΡ✈🆒*'", 9, 8),
("\"\\\"'QΡ✈🆒*'", 12, 11),
("\\Q\\Ρ\\✈\\🆒", 10, 9),
("QΡ✈🆒", 6, 5),
("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15, 14),
("newline\r\nQΡ✈🆒", 6, 14),
("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 20, 19),
("url(QΡ✈🆒)", 11, 10),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 16, 21),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15, 20),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 17, 22),
("url( \tQ)", 10, 9),
("QΡ✈🆒()", 8, 7),
// Test that under/over-flow of current_line_start_position is
// handled properly; see the special case in consume_4byte_intro.
("🆒", 3),
("🆒", 3, 2),
];

for test in tests {
@@ -1321,6 +1323,7 @@ fn utf16_columns() {

// Check the resulting column.
assert_eq!(parser.current_source_location().column, test.1);
assert_eq!(parser.state().utf16_position(), test.2, "test: {}", test.0);
}
}

31 changes: 23 additions & 8 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -214,6 +214,8 @@ pub struct Tokenizer<'a> {
/// ensure that computing the column will give the result in units
/// of UTF-16 characters.
current_line_start_position: usize,
position_difference: u16,
current_line_start_difference: u16,
current_line_number: u32,
var_or_env_functions: SeenStatus,
source_map_url: Option<&'a str>,
@@ -234,7 +236,9 @@ impl<'a> Tokenizer<'a> {
input,
position: 0,
current_line_start_position: 0,
current_line_start_difference: 0,
current_line_number: 0,
position_difference: 0,
var_or_env_functions: SeenStatus::DontCare,
source_map_url: None,
source_url: None,
@@ -277,7 +281,12 @@ impl<'a> Tokenizer<'a> {
pub fn current_source_location(&self) -> SourceLocation {
SourceLocation {
line: self.current_line_number,
column: (self.position - self.current_line_start_position + 1) as u32,
column: (
self.position -
self.current_line_start_position -
(self.position_difference - self.current_line_start_difference) as usize
+ 1
) as u32,
}
}

@@ -296,6 +305,8 @@ impl<'a> Tokenizer<'a> {
ParserState {
position: self.position,
current_line_start_position: self.current_line_start_position,
current_line_start_difference: self.current_line_start_difference,
position_difference: self.position_difference,
current_line_number: self.current_line_number,
at_start_of: None,
}
@@ -305,6 +316,8 @@ impl<'a> Tokenizer<'a> {
pub fn reset(&mut self, state: &ParserState) {
self.position = state.position;
self.current_line_start_position = state.current_line_start_position;
self.current_line_start_difference = state.current_line_start_difference;
self.position_difference = state.position_difference;
self.current_line_number = state.current_line_number;
}

@@ -391,7 +404,7 @@ impl<'a> Tokenizer<'a> {
debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
self.position_difference = self.position_difference.wrapping_sub(1);
self.position += 1;
}

@@ -403,7 +416,7 @@ impl<'a> Tokenizer<'a> {
// Continuation bytes contribute to column overcount. Note
// that due to the special case for the 4-byte sequence intro,
// we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
self.position_difference = self.position_difference.wrapping_add(1);
self.position += 1;
}

@@ -416,11 +429,11 @@ impl<'a> Tokenizer<'a> {
if byte & 0xF0 == 0xF0 {
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
self.position_difference = self.position_difference.wrapping_sub(1);
} else if byte & 0xC0 == 0x80 {
// Note that due to the special case for the 4-byte
// sequence intro, we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
self.position_difference = self.position_difference.wrapping_add(1);
}
}

@@ -443,6 +456,7 @@ impl<'a> Tokenizer<'a> {
self.position += 1;
}
self.current_line_start_position = self.position;
self.current_line_start_difference = self.position_difference;
self.current_line_number += 1;
}

@@ -456,12 +470,13 @@ impl<'a> Tokenizer<'a> {
fn consume_char(&mut self) -> char {
let c = self.next_char();
let len_utf8 = c.len_utf8();
let len_utf16 = c.len_utf16();
self.position += len_utf8;
// Note that due to the special case for the 4-byte sequence
// intro, we must use wrapping add here.
self.current_line_start_position = self
.current_line_start_position
.wrapping_add(len_utf8 - c.len_utf16());
self.position_difference = self
.position_difference
.wrapping_add((len_utf8 - len_utf16) as u16);
c
}