From f3affa903596f3fce134252fef83587c4f360f0a Mon Sep 17 00:00:00 2001 From: Nicolas Chevobbe Date: Fri, 22 Mar 2024 11:37:20 +0100 Subject: [PATCH 1/2] Add ParserState method to get current utf16 position This adds a `utf16_position` method on `ParserState`, exposing a `current_position` field that we compute. The implementation is closely following what was done to compute the utf16 column position. --- src/parser.rs | 7 +++++++ src/size_of_tests.rs | 6 +++--- src/tests.rs | 37 ++++++++++++++++++++----------------- src/tokenizer.rs | 21 +++++++++++++++++++-- 4 files changed, 49 insertions(+), 22 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 77a21008..00052df2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -18,6 +18,7 @@ use std::ops::Range; pub struct ParserState { pub(crate) position: usize, pub(crate) current_line_start_position: usize, + pub(crate) current_position: usize, pub(crate) current_line_number: u32, pub(crate) at_start_of: Option, } @@ -37,6 +38,12 @@ impl ParserState { column: (self.position - self.current_line_start_position + 1) as u32, } } + + /// The position from the start of the input, counted in UTF-16 code units + #[inline] + pub fn utf16_position(&self) -> u32 { + self.current_position as u32 + } } /// When parsing until a given token, sometimes the caller knows that parsing is going to restart diff --git a/src/size_of_tests.rs b/src/size_of_tests.rs index edd2b439..ae9fd01b 100644 --- a/src/size_of_tests.rs +++ b/src/size_of_tests.rs @@ -42,11 +42,11 @@ size_of_test!(token, Token, 32); size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32); size_of_test!(cow_rc_str, CowRcStr, 16); -size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72); -size_of_test!(parser_input, crate::parser::ParserInput, 136); +size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80); +size_of_test!(parser_input, crate::parser::ParserInput, 152); size_of_test!(parser, crate::parser::Parser, 16); size_of_test!(source_position, 
crate::SourcePosition, 8); -size_of_test!(parser_state, crate::ParserState, 24); +size_of_test!(parser_state, crate::ParserState, 32); size_of_test!(basic_parse_error, crate::BasicParseError, 40, 48); size_of_test!(parse_error_lower_bound, crate::ParseError<()>, 40, 48); diff --git a/src/tests.rs b/src/tests.rs index 98c51f61..386471f6 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1267,7 +1267,7 @@ fn roundtrip_percentage_token() { } #[test] -fn utf16_columns() { +fn utf16_columns_and_positions() { // This particular test serves two purposes. First, it checks // that the column number computations are correct. Second, it // checks that tokenizer code paths correctly differentiate @@ -1278,24 +1278,26 @@ fn utf16_columns() { // the column is in units of UTF-16, the 4-byte sequence results // in two columns. let tests = vec![ - ("", 1), - ("ascii", 6), - ("/*Qฮกโœˆ๐Ÿ†’*/", 10), - ("'Qฮกโœˆ๐Ÿ†’*'", 9), - ("\"\\\"'Qฮกโœˆ๐Ÿ†’*'", 12), - ("\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 10), - ("Qฮกโœˆ๐Ÿ†’", 6), - ("Qฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 15), - ("newline\r\nQฮกโœˆ๐Ÿ†’", 6), - ("url(Qฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’)", 20), - ("url(Qฮกโœˆ๐Ÿ†’)", 11), - ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’)", 16), - ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 15), - ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’ x", 17), - ("Qฮกโœˆ๐Ÿ†’()", 8), + ("", 1, 0), + ("ascii", 6, 5), + ("/*Qฮกโœˆ๐Ÿ†’*/", 10, 9), + ("/*Qฮกโœˆ\r\n๐Ÿ†’*/", 5, 11), + ("'Qฮกโœˆ๐Ÿ†’*'", 9, 8), + ("\"\\\"'Qฮกโœˆ๐Ÿ†’*'", 12, 11), + ("\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 10, 9), + ("Qฮกโœˆ๐Ÿ†’", 6, 5), + ("Qฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 15, 14), + ("newline\r\nQฮกโœˆ๐Ÿ†’", 6, 14), + ("url(Qฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’)", 20, 19), + ("url(Qฮกโœˆ๐Ÿ†’)", 11, 10), + ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’)", 16, 21), + ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’", 15, 20), + ("url(\r\nQฮกโœˆ๐Ÿ†’\\Q\\ฮก\\โœˆ\\๐Ÿ†’ x", 17, 22), + ("url( \tQ)", 10, 9), + ("Qฮกโœˆ๐Ÿ†’()", 8, 7), // Test that under/over-flow of current_line_start_position is // handled properly; see the special case 
in consume_4byte_intro. - ("๐Ÿ†’", 3), + ("๐Ÿ†’", 3, 2), ]; for test in tests { @@ -1321,6 +1323,7 @@ fn utf16_columns() { // Check the resulting column. assert_eq!(parser.current_source_location().column, test.1); + assert_eq!(parser.state().utf16_position(), test.2, "test: {}", test.0); } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index f1716c71..06487588 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -214,6 +214,7 @@ pub struct Tokenizer<'a> { /// ensure that computing the column will give the result in units /// of UTF-16 characters. current_line_start_position: usize, + current_position: usize, current_line_number: u32, var_or_env_functions: SeenStatus, source_map_url: Option<&'a str>, @@ -234,6 +235,7 @@ impl<'a> Tokenizer<'a> { input, position: 0, current_line_start_position: 0, + current_position: 0, current_line_number: 0, var_or_env_functions: SeenStatus::DontCare, source_map_url: None, @@ -296,6 +298,7 @@ impl<'a> Tokenizer<'a> { ParserState { position: self.position, current_line_start_position: self.current_line_start_position, + current_position: self.current_position, current_line_number: self.current_line_number, at_start_of: None, } @@ -305,6 +308,7 @@ impl<'a> Tokenizer<'a> { pub fn reset(&mut self, state: &ParserState) { self.position = state.position; self.current_line_start_position = state.current_line_start_position; + self.current_position = state.current_position; self.current_line_number = state.current_line_number; } @@ -370,6 +374,7 @@ impl<'a> Tokenizer<'a> { debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C'); } } + self.current_position = self.current_position.wrapping_add(n); self.position += n } @@ -392,6 +397,7 @@ impl<'a> Tokenizer<'a> { // This takes two UTF-16 characters to represent, so we // actually have an undercount. 
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1); + self.current_position = self.current_position.wrapping_add(2); self.position += 1; } @@ -417,10 +423,13 @@ impl<'a> Tokenizer<'a> { // This takes two UTF-16 characters to represent, so we // actually have an undercount. self.current_line_start_position = self.current_line_start_position.wrapping_sub(1); + self.current_position = self.current_position.wrapping_add(2); } else if byte & 0xC0 == 0x80 { // Note that due to the special case for the 4-byte // sequence intro, we must use wrapping add here. self.current_line_start_position = self.current_line_start_position.wrapping_add(1); + } else { + self.current_position = self.current_position.wrapping_add(1); } } @@ -439,8 +448,10 @@ impl<'a> Tokenizer<'a> { let byte = self.next_byte_unchecked(); debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C'); self.position += 1; + self.current_position = self.current_position.wrapping_add(1); if byte == b'\r' && self.next_byte() == Some(b'\n') { self.position += 1; + self.current_position = self.current_position.wrapping_add(1); } self.current_line_start_position = self.position; self.current_line_number += 1; @@ -459,9 +470,11 @@ impl<'a> Tokenizer<'a> { self.position += len_utf8; // Note that due to the special case for the 4-byte sequence // intro, we must use wrapping add here. + let len_utf16 = c.len_utf16(); self.current_line_start_position = self .current_line_start_position - .wrapping_add(len_utf8 - c.len_utf16()); + .wrapping_add(len_utf8 - len_utf16); + self.current_position = self.current_position.wrapping_add(len_utf16); c } @@ -1151,12 +1164,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result, } }; match_byte! 
{ b, - b' ' | b'\t' => {}, + b' ' | b'\t' => { + tokenizer.current_position = tokenizer.current_position.wrapping_add(1); + }, b'\n' | b'\x0C' => { newlines += 1; last_newline = offset; + tokenizer.current_position = tokenizer.current_position.wrapping_add(1); } b'\r' => { + tokenizer.current_position = tokenizer.current_position.wrapping_add(1); if from_start.as_bytes().get(offset + 1) != Some(&b'\n') { newlines += 1; last_newline = offset; From 75ecffafea6d5ca6a664bf343821ad2d8c565150 Mon Sep 17 00:00:00 2001 From: Nicolas Chevobbe Date: Wed, 10 Apr 2024 10:23:33 +0200 Subject: [PATCH 2/2] Store utf8/utf16 position difference instead of counting utf16 positions --- src/parser.rs | 11 ++++++++--- src/tokenizer.rs | 50 +++++++++++++++++++++++------------------------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 00052df2..dc9a9d16 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -18,7 +18,8 @@ use std::ops::Range; pub struct ParserState { pub(crate) position: usize, pub(crate) current_line_start_position: usize, - pub(crate) current_position: usize, + pub(crate) current_line_start_difference: u16, + pub(crate) position_difference: u16, pub(crate) current_line_number: u32, pub(crate) at_start_of: Option, } @@ -35,14 +36,18 @@ impl ParserState { pub fn source_location(&self) -> SourceLocation { SourceLocation { line: self.current_line_number, - column: (self.position - self.current_line_start_position + 1) as u32, + column: ( + self.position - self.current_line_start_position - + (self.position_difference - self.current_line_start_difference) as usize + + 1 + ) as u32, } } /// The position from the start of the input, counted in UTF-16 code units #[inline] pub fn utf16_position(&self) -> u32 { - self.current_position as u32 + (self.position - self.position_difference as usize) as u32 } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 06487588..f5d143c0 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs 
@@ -214,7 +214,8 @@ pub struct Tokenizer<'a> { /// ensure that computing the column will give the result in units /// of UTF-16 characters. current_line_start_position: usize, - current_position: usize, + position_difference: u16, + current_line_start_difference: u16, current_line_number: u32, var_or_env_functions: SeenStatus, source_map_url: Option<&'a str>, @@ -235,8 +236,9 @@ impl<'a> Tokenizer<'a> { input, position: 0, current_line_start_position: 0, - current_position: 0, + current_line_start_difference: 0, current_line_number: 0, + position_difference: 0, var_or_env_functions: SeenStatus::DontCare, source_map_url: None, source_url: None, @@ -279,7 +281,12 @@ impl<'a> Tokenizer<'a> { pub fn current_source_location(&self) -> SourceLocation { SourceLocation { line: self.current_line_number, - column: (self.position - self.current_line_start_position + 1) as u32, + column: ( + self.position - + self.current_line_start_position - + (self.position_difference - self.current_line_start_difference) as usize + + 1 + ) as u32, } } @@ -298,7 +305,8 @@ impl<'a> Tokenizer<'a> { ParserState { position: self.position, current_line_start_position: self.current_line_start_position, - current_position: self.current_position, + current_line_start_difference: self.current_line_start_difference, + position_difference: self.position_difference, current_line_number: self.current_line_number, at_start_of: None, } @@ -308,7 +316,8 @@ impl<'a> Tokenizer<'a> { pub fn reset(&mut self, state: &ParserState) { self.position = state.position; self.current_line_start_position = state.current_line_start_position; - self.current_position = state.current_position; + self.current_line_start_difference = state.current_line_start_difference; + self.position_difference = state.position_difference; self.current_line_number = state.current_line_number; } @@ -374,7 +383,6 @@ impl<'a> Tokenizer<'a> { debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C'); } } - self.current_position = 
self.current_position.wrapping_add(n); self.position += n } @@ -396,8 +404,7 @@ impl<'a> Tokenizer<'a> { debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0); // This takes two UTF-16 characters to represent, so we // actually have an undercount. - self.current_line_start_position = self.current_line_start_position.wrapping_sub(1); - self.current_position = self.current_position.wrapping_add(2); + self.position_difference = self.position_difference.wrapping_sub(1); self.position += 1; } @@ -409,7 +416,7 @@ impl<'a> Tokenizer<'a> { // Continuation bytes contribute to column overcount. Note // that due to the special case for the 4-byte sequence intro, // we must use wrapping add here. - self.current_line_start_position = self.current_line_start_position.wrapping_add(1); + self.position_difference = self.position_difference.wrapping_add(1); self.position += 1; } @@ -422,14 +429,11 @@ impl<'a> Tokenizer<'a> { if byte & 0xF0 == 0xF0 { // This takes two UTF-16 characters to represent, so we // actually have an undercount. - self.current_line_start_position = self.current_line_start_position.wrapping_sub(1); - self.current_position = self.current_position.wrapping_add(2); + self.position_difference = self.position_difference.wrapping_sub(1); } else if byte & 0xC0 == 0x80 { // Note that due to the special case for the 4-byte // sequence intro, we must use wrapping add here. 
- self.current_line_start_position = self.current_line_start_position.wrapping_add(1); - } else { - self.current_position = self.current_position.wrapping_add(1); + self.position_difference = self.position_difference.wrapping_add(1); } } @@ -448,12 +452,11 @@ impl<'a> Tokenizer<'a> { let byte = self.next_byte_unchecked(); debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C'); self.position += 1; - self.current_position = self.current_position.wrapping_add(1); if byte == b'\r' && self.next_byte() == Some(b'\n') { self.position += 1; - self.current_position = self.current_position.wrapping_add(1); } self.current_line_start_position = self.position; + self.current_line_start_difference = self.position_difference; self.current_line_number += 1; } @@ -467,14 +470,13 @@ impl<'a> Tokenizer<'a> { fn consume_char(&mut self) -> char { let c = self.next_char(); let len_utf8 = c.len_utf8(); + let len_utf16 = c.len_utf16(); self.position += len_utf8; // Note that due to the special case for the 4-byte sequence // intro, we must use wrapping add here. - let len_utf16 = c.len_utf16(); - self.current_line_start_position = self - .current_line_start_position - .wrapping_add(len_utf8 - len_utf16); - self.current_position = self.current_position.wrapping_add(len_utf16); + self.position_difference = self + .position_difference + .wrapping_add((len_utf8 - len_utf16) as u16); c } @@ -1164,16 +1166,12 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result, } }; match_byte! { b, - b' ' | b'\t' => { - tokenizer.current_position = tokenizer.current_position.wrapping_add(1); - }, + b' ' | b'\t' => {}, b'\n' | b'\x0C' => { newlines += 1; last_newline = offset; - tokenizer.current_position = tokenizer.current_position.wrapping_add(1); } b'\r' => { - tokenizer.current_position = tokenizer.current_position.wrapping_add(1); if from_start.as_bytes().get(offset + 1) != Some(&b'\n') { newlines += 1; last_newline = offset;