Commit 516e3f5

Store utf8/utf16 position difference instead of counting utf16 positions
1 parent c844076 commit 516e3f5

File tree

src/parser.rs
src/tokenizer.rs

2 files changed: +32 -29 lines changed


src/parser.rs (+8 -3)

@@ -18,7 +18,8 @@ use std::ops::Range;
 pub struct ParserState {
     pub(crate) position: usize,
     pub(crate) current_line_start_position: usize,
-    pub(crate) current_position: usize,
+    pub(crate) current_line_start_difference: u16,
+    pub(crate) position_difference: u16,
     pub(crate) current_line_number: u32,
     pub(crate) at_start_of: Option<BlockType>,
 }
@@ -35,14 +36,18 @@ impl ParserState {
     pub fn source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position - self.current_line_start_position -
+                (self.position_difference - self.current_line_start_difference) as usize +
+                1
+            ) as u32,
         }
     }

     /// The position from the start of the input, counted in UTF-16 code units
     #[inline]
     pub fn utf16_position(&self) -> u32 {
-        self.current_position as u32
+        (self.position - self.position_difference as usize) as u32
     }
 }

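The idea of the change, in isolation: rather than maintaining a separate running count of UTF-16 code units (the old current_position field), the parser state now stores only the accumulated difference between the UTF-8 and UTF-16 lengths of the text consumed so far, plus a snapshot of that difference at the start of the current line. A UTF-16 position is then the byte position minus the difference, and a UTF-16 column is the byte offset into the line minus the difference accumulated since the line started, plus one. Below is a minimal, self-contained sketch of that arithmetic; the field names and the two formulas mirror the diff above, while the Positions struct, the utf16_column helper and main are illustrative only and not part of the crate.

// Minimal sketch of the bookkeeping, not the crate's actual types.
struct Positions {
    position: usize,                    // byte offset from the start of the input
    position_difference: u16,           // accumulated utf8_len - utf16_len of consumed text
    current_line_start_position: usize, // byte offset of the current line start
    current_line_start_difference: u16, // value of position_difference at the line start
}

impl Positions {
    fn consume_char(&mut self, c: char) {
        self.position += c.len_utf8();
        // A char is never shorter in UTF-8 than in UTF-16, so the difference
        // only grows: 'a' adds 0, 'é' adds 2 - 1 = 1, '🦀' adds 4 - 2 = 2.
        self.position_difference += (c.len_utf8() - c.len_utf16()) as u16;
    }

    // UTF-16 code-unit offset from the start of the input (cf. utf16_position above).
    fn utf16_position(&self) -> u32 {
        (self.position - self.position_difference as usize) as u32
    }

    // 1-based column in UTF-16 code units (cf. source_location above).
    fn utf16_column(&self) -> u32 {
        (self.position - self.current_line_start_position
            - (self.position_difference - self.current_line_start_difference) as usize
            + 1) as u32
    }
}

fn main() {
    let mut p = Positions {
        position: 0,
        position_difference: 0,
        current_line_start_position: 0,
        current_line_start_difference: 0,
    };
    for c in "aé🦀".chars() {
        p.consume_char(c);
    }
    // 1 + 2 + 4 = 7 bytes consumed, 1 + 1 + 2 = 4 UTF-16 code units.
    assert_eq!(p.utf16_position(), 4);
    assert_eq!(p.utf16_column(), 5);
}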
src/tokenizer.rs (+24 -26)

@@ -214,7 +214,8 @@ pub struct Tokenizer<'a> {
     /// ensure that computing the column will give the result in units
     /// of UTF-16 characters.
     current_line_start_position: usize,
-    current_position: usize,
+    position_difference: u16,
+    current_line_start_difference: u16,
     current_line_number: u32,
     var_or_env_functions: SeenStatus,
     source_map_url: Option<&'a str>,
@@ -235,8 +236,9 @@ impl<'a> Tokenizer<'a> {
             input,
             position: 0,
             current_line_start_position: 0,
-            current_position: 0,
+            current_line_start_difference: 0,
             current_line_number: 0,
+            position_difference: 0,
             var_or_env_functions: SeenStatus::DontCare,
             source_map_url: None,
             source_url: None,
@@ -279,7 +281,12 @@ impl<'a> Tokenizer<'a> {
     pub fn current_source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (
+                self.position -
+                self.current_line_start_position -
+                (self.position_difference - self.current_line_start_difference) as usize
+                + 1
+            ) as u32,
         }
     }

@@ -298,7 +305,8 @@ impl<'a> Tokenizer<'a> {
         ParserState {
             position: self.position,
             current_line_start_position: self.current_line_start_position,
-            current_position: self.current_position,
+            current_line_start_difference: self.current_line_start_difference,
+            position_difference: self.position_difference,
             current_line_number: self.current_line_number,
             at_start_of: None,
         }
@@ -308,7 +316,8 @@ impl<'a> Tokenizer<'a> {
     pub fn reset(&mut self, state: &ParserState) {
         self.position = state.position;
         self.current_line_start_position = state.current_line_start_position;
-        self.current_position = state.current_position;
+        self.current_line_start_difference = state.current_line_start_difference;
+        self.position_difference = state.position_difference;
         self.current_line_number = state.current_line_number;
     }

@@ -372,7 +381,6 @@ impl<'a> Tokenizer<'a> {
                 debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
             }
         }
-        self.current_position = self.current_position.wrapping_add(n);
         self.position += n
     }

@@ -394,8 +402,7 @@ impl<'a> Tokenizer<'a> {
         debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
         // This takes two UTF-16 characters to represent, so we
         // actually have an undercount.
-        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
-        self.current_position = self.current_position.wrapping_add(2);
+        self.position_difference = self.position_difference.wrapping_sub(1);
         self.position += 1;
     }

@@ -407,7 +414,7 @@ impl<'a> Tokenizer<'a> {
         // Continuation bytes contribute to column overcount. Note
         // that due to the special case for the 4-byte sequence intro,
         // we must use wrapping add here.
-        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+        self.position_difference = self.position_difference.wrapping_add(1);
         self.position += 1;
     }

@@ -420,14 +427,11 @@ impl<'a> Tokenizer<'a> {
         if byte & 0xF0 == 0xF0 {
             // This takes two UTF-16 characters to represent, so we
             // actually have an undercount.
-            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
-            self.current_position = self.current_position.wrapping_add(2);
+            self.position_difference = self.position_difference.wrapping_sub(1);
         } else if byte & 0xC0 == 0x80 {
             // Note that due to the special case for the 4-byte
             // sequence intro, we must use wrapping add here.
-            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
-        } else {
-            self.current_position = self.current_position.wrapping_add(1);
+            self.position_difference = self.position_difference.wrapping_add(1);
         }
     }

@@ -443,12 +447,11 @@ impl<'a> Tokenizer<'a> {
         let byte = self.next_byte_unchecked();
         debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
         self.position += 1;
-        self.current_position = self.current_position.wrapping_add(1);
         if byte == b'\r' && self.next_byte() == Some(b'\n') {
             self.position += 1;
-            self.current_position = self.current_position.wrapping_add(1);
         }
         self.current_line_start_position = self.position;
+        self.current_line_start_difference = self.position_difference;
         self.current_line_number += 1;
     }

@@ -462,14 +465,13 @@ impl<'a> Tokenizer<'a> {
     fn consume_char(&mut self) -> char {
         let c = self.next_char();
         let len_utf8 = c.len_utf8();
+        let len_utf16 = c.len_utf16();
         self.position += len_utf8;
         // Note that due to the special case for the 4-byte sequence
         // intro, we must use wrapping add here.
-        let len_utf16 = c.len_utf16();
-        self.current_line_start_position = self
-            .current_line_start_position
-            .wrapping_add(len_utf8 - len_utf16);
-        self.current_position = self.current_position.wrapping_add(len_utf16);
+        self.position_difference = self
+            .position_difference
+            .wrapping_add((len_utf8 - len_utf16) as u16);
         c
     }

@@ -1160,16 +1162,12 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
         };
         match_byte! { b,
-            b' ' | b'\t' => {
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
-            },
+            b' ' | b'\t' => {},
             b'\n' | b'\x0C' => {
                 newlines += 1;
                 last_newline = offset;
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
             }
            b'\r' => {
-                tokenizer.current_position = tokenizer.current_position.wrapping_add(1);
                 if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                     newlines += 1;
                     last_newline = offset;

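For the byte-at-a-time paths in the tokenizer, the same difference is maintained without decoding a full char: a 4-byte sequence intro (byte & 0xF0 == 0xF0) subtracts one, every continuation byte (byte & 0xC0 == 0x80) adds one, and ASCII bytes plus 2-/3-byte intros contribute nothing, so an n-byte character nets n - 1 when it is one UTF-16 unit and -1 + 3 = 2 when it is two. The crate uses wrapping u16 arithmetic because the subtraction on the intro can momentarily dip below the running total before the continuation bytes are consumed. A standalone check of that rule follows; byte_wise_difference is a hypothetical helper written for this note, not part of the crate.

// Hypothetical check: the byte-wise rule used by the tokenizer (subtract 1 on
// a 4-byte sequence intro, add 1 on every continuation byte) accumulates the
// same value as len_utf8() - len_utf16() summed over chars.
fn byte_wise_difference(s: &str) -> i32 {
    let mut diff = 0i32;
    for &b in s.as_bytes() {
        if b & 0xF0 == 0xF0 {
            // 4-byte sequence intro: this char needs two UTF-16 units.
            diff -= 1;
        } else if b & 0xC0 == 0x80 {
            // Continuation byte: one more UTF-8 byte, no extra UTF-16 unit.
            diff += 1;
        }
        // ASCII bytes and 2-/3-byte sequence intros contribute nothing.
    }
    diff
}

fn main() {
    for s in ["ascii", "héllo", "日本語", "a🦀b"] {
        let per_char: i32 = s.chars().map(|c| (c.len_utf8() - c.len_utf16()) as i32).sum();
        assert_eq!(byte_wise_difference(s), per_char);
    }
    println!("byte-wise and per-char differences agree");
}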