Skip to content

Add ParserState method to get current utf16 position #374

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Open
This pull request wants to merge 2 commits into the base branch `main` (choose a base branch) from the contributor's branch.
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion src/parser.rs
Original file line number Diff line number Diff line change
@@ -18,6 +18,8 @@ use std::ops::Range;
pub struct ParserState {
pub(crate) position: usize,
pub(crate) current_line_start_position: usize,
pub(crate) current_line_start_difference: u16,
pub(crate) position_difference: u16,
pub(crate) current_line_number: u32,
pub(crate) at_start_of: Option<BlockType>,
}
@@ -34,9 +36,19 @@ impl ParserState {
pub fn source_location(&self) -> SourceLocation {
SourceLocation {
line: self.current_line_number,
column: (self.position - self.current_line_start_position + 1) as u32,
column: (
self.position - self.current_line_start_position -
(self.position_difference - self.current_line_start_difference) as usize +
1
) as u32,
}
}

/// The position from the start of the input, counted in UTF-16 code units
#[inline]
pub fn utf16_position(&self) -> u32 {
(self.position - self.position_difference as usize) as u32
}
}

/// When parsing until a given token, sometimes the caller knows that parsing is going to restart
6 changes: 3 additions & 3 deletions src/size_of_tests.rs
Original file line number Diff line number Diff line change
@@ -42,11 +42,11 @@ size_of_test!(token, Token, 32);
size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32);
size_of_test!(cow_rc_str, CowRcStr, 16);

size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72);
size_of_test!(parser_input, crate::parser::ParserInput, 136);
size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80);
size_of_test!(parser_input, crate::parser::ParserInput, 152);
size_of_test!(parser, crate::parser::Parser, 16);
size_of_test!(source_position, crate::SourcePosition, 8);
size_of_test!(parser_state, crate::ParserState, 24);
size_of_test!(parser_state, crate::ParserState, 32);

size_of_test!(basic_parse_error, crate::BasicParseError, 40, 48);
size_of_test!(parse_error_lower_bound, crate::ParseError<()>, 40, 48);
37 changes: 20 additions & 17 deletions src/tests.rs
Original file line number Diff line number Diff line change
@@ -1267,7 +1267,7 @@ fn roundtrip_percentage_token() {
}

#[test]
fn utf16_columns() {
fn utf16_columns_and_positions() {
// This particular test serves two purposes. First, it checks
// that the column number computations are correct. Second, it
// checks that tokenizer code paths correctly differentiate
@@ -1278,24 +1278,26 @@ fn utf16_columns() {
// the column is in units of UTF-16, the 4-byte sequence results
// in two columns.
let tests = vec![
("", 1),
("ascii", 6),
("/*QΡ✈🆒*/", 10),
("'QΡ✈🆒*'", 9),
("\"\\\"'QΡ✈🆒*'", 12),
("\\Q\\Ρ\\✈\\🆒", 10),
("QΡ✈🆒", 6),
("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15),
("newline\r\nQΡ✈🆒", 6),
("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 20),
("url(QΡ✈🆒)", 11),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 16),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 17),
("QΡ✈🆒()", 8),
("", 1, 0),
("ascii", 6, 5),
("/*QΡ✈🆒*/", 10, 9),
("/*QΡ✈\r\n🆒*/", 5, 11),
("'QΡ✈🆒*'", 9, 8),
("\"\\\"'QΡ✈🆒*'", 12, 11),
("\\Q\\Ρ\\✈\\🆒", 10, 9),
("QΡ✈🆒", 6, 5),
("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15, 14),
("newline\r\nQΡ✈🆒", 6, 14),
("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 20, 19),
("url(QΡ✈🆒)", 11, 10),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 16, 21),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15, 20),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 17, 22),
("url( \tQ)", 10, 9),
("QΡ✈🆒()", 8, 7),
// Test that under/over-flow of current_line_start_position is
// handled properly; see the special case in consume_4byte_intro.
("🆒", 3),
("🆒", 3, 2),
];

for test in tests {
@@ -1321,6 +1323,7 @@ fn utf16_columns() {

// Check the resulting column.
assert_eq!(parser.current_source_location().column, test.1);
assert_eq!(parser.state().utf16_position(), test.2, "test: {}", test.0);
}
}

31 changes: 23 additions & 8 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -214,6 +214,8 @@ pub struct Tokenizer<'a> {
/// ensure that computing the column will give the result in units
/// of UTF-16 characters.
current_line_start_position: usize,
position_difference: u16,
current_line_start_difference: u16,
current_line_number: u32,
var_or_env_functions: SeenStatus,
source_map_url: Option<&'a str>,
@@ -234,7 +236,9 @@ impl<'a> Tokenizer<'a> {
input,
position: 0,
current_line_start_position: 0,
current_line_start_difference: 0,
current_line_number: 0,
position_difference: 0,
var_or_env_functions: SeenStatus::DontCare,
source_map_url: None,
source_url: None,
@@ -277,7 +281,12 @@ impl<'a> Tokenizer<'a> {
pub fn current_source_location(&self) -> SourceLocation {
SourceLocation {
line: self.current_line_number,
column: (self.position - self.current_line_start_position + 1) as u32,
column: (
self.position -
self.current_line_start_position -
(self.position_difference - self.current_line_start_difference) as usize
+ 1
) as u32,
}
}

@@ -296,6 +305,8 @@ impl<'a> Tokenizer<'a> {
ParserState {
position: self.position,
current_line_start_position: self.current_line_start_position,
current_line_start_difference: self.current_line_start_difference,
position_difference: self.position_difference,
current_line_number: self.current_line_number,
at_start_of: None,
}
@@ -305,6 +316,8 @@ impl<'a> Tokenizer<'a> {
pub fn reset(&mut self, state: &ParserState) {
self.position = state.position;
self.current_line_start_position = state.current_line_start_position;
self.current_line_start_difference = state.current_line_start_difference;
self.position_difference = state.position_difference;
self.current_line_number = state.current_line_number;
}

@@ -391,7 +404,7 @@ impl<'a> Tokenizer<'a> {
debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
self.position_difference = self.position_difference.wrapping_sub(1);
self.position += 1;
}

@@ -403,7 +416,7 @@ impl<'a> Tokenizer<'a> {
// Continuation bytes contribute to column overcount. Note
// that due to the special case for the 4-byte sequence intro,
// we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
self.position_difference = self.position_difference.wrapping_add(1);
self.position += 1;
}

@@ -416,11 +429,11 @@ impl<'a> Tokenizer<'a> {
if byte & 0xF0 == 0xF0 {
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
self.position_difference = self.position_difference.wrapping_sub(1);
} else if byte & 0xC0 == 0x80 {
// Note that due to the special case for the 4-byte
// sequence intro, we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
self.position_difference = self.position_difference.wrapping_add(1);
}
}

@@ -443,6 +456,7 @@ impl<'a> Tokenizer<'a> {
self.position += 1;
}
self.current_line_start_position = self.position;
self.current_line_start_difference = self.position_difference;
self.current_line_number += 1;
}

@@ -456,12 +470,13 @@ impl<'a> Tokenizer<'a> {
fn consume_char(&mut self) -> char {
let c = self.next_char();
let len_utf8 = c.len_utf8();
let len_utf16 = c.len_utf16();
self.position += len_utf8;
// Note that due to the special case for the 4-byte sequence
// intro, we must use wrapping add here.
self.current_line_start_position = self
.current_line_start_position
.wrapping_add(len_utf8 - c.len_utf16());
self.position_difference = self
.position_difference
.wrapping_add((len_utf8 - len_utf16) as u16);
c
}