From cac6c278d6576d95df4c9b0a94de09431c0e5911 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 03:36:42 +0900 Subject: [PATCH 001/201] raw_lexer --- crates/swc_ecma_raw_lexer/Cargo.toml | 13 +++++++++++++ crates/swc_ecma_raw_lexer/src/lib.rs | 7 +++++++ 2 files changed, 20 insertions(+) create mode 100644 crates/swc_ecma_raw_lexer/Cargo.toml create mode 100644 crates/swc_ecma_raw_lexer/src/lib.rs diff --git a/crates/swc_ecma_raw_lexer/Cargo.toml b/crates/swc_ecma_raw_lexer/Cargo.toml new file mode 100644 index 000000000000..1867677e7632 --- /dev/null +++ b/crates/swc_ecma_raw_lexer/Cargo.toml @@ -0,0 +1,13 @@ +[package] +authors = ["강동윤 "] +description = "Raw lexer for swc" +documentation = "https://rustdoc.swc.rs/swc_ecma_raw_lexer/" +edition = { workspace = true } +include = ["Cargo.toml", "src/**/*.rs", "examples/**/*.rs"] +license = { workspace = true } +name = "swc_ecma_raw_lexer" +repository = { workspace = true } +version = "0.1.0" + +[dependencies] +logos = "0.15.0" diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs new file mode 100644 index 000000000000..059ea4e163c8 --- /dev/null +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -0,0 +1,7 @@ +use logos::Logos; + +#[derive(Logos, Debug, PartialEq)] +enum RawToken { + #[regex(r"\P{ID_Start}\P{ID_Continue}*")] + Ident, +} From c2835160982d0d37f930c56747b94a0a4a223c82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:15:53 +0900 Subject: [PATCH 002/201] edition --- Cargo.lock | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6762fcfa1e3e..10162ae199d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -303,6 +303,12 @@ dependencies = [ "simd-abstraction", ] +[[package]] +name = "beef" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + [[package]] name = "better_scoped_tls" version = "1.0.0" @@ -1893,7 +1899,7 @@ checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" dependencies = [ "atomic-polyfill", "hash32", - "rustc_version 0.4.0", + "rustc_version 0.4.1", "spin", "stable_deref_trait", ] @@ -2611,6 +2617,40 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "logos" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab6f536c1af4c7cc81edf73da1f8029896e7e1e16a219ef09b184e76a296f3db" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "189bbfd0b61330abea797e5e9276408f2edbe4f822d7ad08685d67419aafb34e" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax 0.8.4", + "rustc_version 0.4.1", + "syn 2.0.87", +] + +[[package]] +name = "logos-derive" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebfe8e1a19049ddbfccbd14ac834b215e11b85b90bab0c2dba7c7b92fb5d5cba" +dependencies = [ + "logos-codegen", +] + [[package]] name = "lru" version = "0.10.1" @@ -3786,9 +3826,9 @@ dependencies = [ [[package]] name = "rustc_version" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ "semver 1.0.23", ] @@ -5293,6 +5333,13 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "swc_ecma_raw_lexer" +version = "0.1.0" +dependencies = [ + "logos", 
+] + [[package]] name = "swc_ecma_testing" version = "5.0.0" From 8296f04ca4ce8c14a93759f19063ce0d9e170e77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:24:13 +0900 Subject: [PATCH 003/201] raw token --- crates/swc_ecma_raw_lexer/src/lib.rs | 205 ++++++++++++++++++++++++++- 1 file changed, 204 insertions(+), 1 deletion(-) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 059ea4e163c8..17b985afc069 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -1,7 +1,210 @@ use logos::Logos; #[derive(Logos, Debug, PartialEq)] -enum RawToken { +pub enum RawToken { + #[token("=>", priority = 3)] + Arrow, + + #[token("#", priority = 3)] + Hash, + + #[token("@", priority = 3)] + At, + + #[token(".", priority = 3)] + Dot, + + #[token("...", priority = 3)] + DotDotDot, + + #[token("!", priority = 3)] + Bang, + + #[token("(", priority = 3)] + LParen, + + #[token(")", priority = 3)] + RParen, + + #[token("[", priority = 3)] + LBracket, + + #[token("]", priority = 3)] + RBracket, + + #[token("{", priority = 3)] + LBrace, + + #[token("}", priority = 3)] + RBrace, + + #[token(";", priority = 3)] + Semi, + + #[token(",", priority = 3)] + Comma, + + #[token(":", priority = 3)] + Colon, + + #[token("`", priority = 3)] + BackQuote, + + #[token("${", priority = 3)] + DollarLBrace, + + #[token("?", priority = 3)] + QuestionMark, + + #[token("++", priority = 3)] + PlusPlus, + + #[token("--", priority = 3)] + MinusMinus, + + #[token("~", priority = 3)] + Tilde, + + Str, + Regex, + Num, + BigInt, + + #[token("#!", priority = 3)] + Shebang, + + #[token("null", priority = 3)] + Null, + + #[token("true", priority = 3)] + True, + + #[token("false", priority = 3)] + False, + + #[token("==", priority = 3)] + EqEqOp, + + #[token("!=", priority = 3)] + NotEqOp, + + #[token("===", priority = 3)] + EqEqEqOp, + + #[token("!==", priority = 3)] + NotEqEqOp, 
+ + #[token("<", priority = 3)] + LtOp, + + #[token("<=", priority = 3)] + LtEqOp, + + #[token(">", priority = 3)] + GtOp, + + #[token(">=", priority = 3)] + GtEqOp, + + #[token("<<", priority = 3)] + LShiftOp, + + #[token(">>", priority = 3)] + RShiftOp, + + #[token(">>>", priority = 3)] + ZeroFillRShiftOp, + + #[token("+", priority = 3)] + AddOp, + + #[token("-", priority = 3)] + SubOp, + + #[token("*", priority = 3)] + MulOp, + + #[token("/", priority = 3)] + DivOp, + + #[token("%", priority = 3)] + ModOp, + + #[token("|", priority = 3)] + BitOrOp, + + #[token("^", priority = 3)] + BitXorOp, + + #[token("&", priority = 3)] + BitAndOp, + + #[token("in", priority = 3)] + In, + + #[token("instanceof", priority = 3)] + InstanceOf, + + #[token("**", priority = 3)] + ExpOp, + + #[token("||", priority = 3)] + LogicalOrOp, + + #[token("&&", priority = 3)] + LogicalAndOp, + + #[token("??")] + NullishCoalescingOp, + + #[token("=", priority = 3)] + AssignOp, + + #[token("+=", priority = 3)] + AddAssignOp, + + #[token("-=", priority = 3)] + SubAssignOp, + + #[token("*=", priority = 3)] + MulAssignOp, + + #[token("/=", priority = 3)] + DivAssignOp, + + #[token("%=", priority = 3)] + ModAssignOp, + + #[token("<<=", priority = 3)] + LShiftAssignOp, + + #[token(">>=", priority = 3)] + RShiftAssignOp, + + #[token(">>>=", priority = 3)] + ZeroFillRShiftAssignOp, + + #[token("|=", priority = 3)] + BitOrAssignOp, + + #[token("^=", priority = 3)] + BitXorAssignOp, + + #[token("&=", priority = 3)] + BitAndAssignOp, + + #[token("**=", priority = 3)] + ExpAssignOp, + + #[token("&&=", priority = 3)] + AndAssignOp, + + #[token("||=", priority = 3)] + OrAssignOp, + + #[token("??=", priority = 3)] + NullishAssignOp, + #[regex(r"\P{ID_Start}\P{ID_Continue}*")] Ident, } From 6605c44283af49513170a38a7baa3401b3c39d85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:26:42 +0900 Subject: [PATCH 004/201] Dep --- 
crates/swc_ecma_parser/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/swc_ecma_parser/Cargo.toml b/crates/swc_ecma_parser/Cargo.toml index f4405098f25a..e0ca039ee3af 100644 --- a/crates/swc_ecma_parser/Cargo.toml +++ b/crates/swc_ecma_parser/Cargo.toml @@ -40,6 +40,7 @@ swc_atoms = { version = "3.0.2", path = "../swc_atoms" } swc_common = { version = "5.0.0", path = "../swc_common" } swc_ecma_ast = { version = "5.0.1", path = "../swc_ecma_ast" } swc_ecma_visit = { version = "5.0.0", path = "../swc_ecma_visit", optional = true } +logos = "0.15.0" [target.'cfg(not(any(target_arch = "wasm32", target_arch = "arm")))'.dependencies] stacker = { version = "0.1.15", optional = true } From 1f530f0f6fd223813f6984f9238cf11b62f4b97f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:30:34 +0900 Subject: [PATCH 005/201] Dep --- crates/swc_ecma_parser/Cargo.toml | 29 +++++++++++++------------ crates/swc_ecma_parser/src/lexer/mod.rs | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/crates/swc_ecma_parser/Cargo.toml b/crates/swc_ecma_parser/Cargo.toml index e0ca039ee3af..80394ccc1338 100644 --- a/crates/swc_ecma_parser/Cargo.toml +++ b/crates/swc_ecma_parser/Cargo.toml @@ -25,22 +25,23 @@ typescript = [] verify = ["swc_ecma_visit"] [dependencies] -either = { workspace = true } -num-bigint = { workspace = true } -num-traits = { workspace = true } -serde = { workspace = true, features = ["derive"] } -smallvec = { workspace = true } -smartstring = { workspace = true } -tracing = { workspace = true } -typed-arena = { workspace = true } - +either = { workspace = true } +logos = "0.15.0" new_debug_unreachable = { workspace = true } +num-bigint = { workspace = true } +num-traits = { workspace = true } phf = { workspace = true, features = ["macros"] } -swc_atoms = { version = "3.0.2", path = "../swc_atoms" } -swc_common = { version = "5.0.0", path = "../swc_common" } -swc_ecma_ast = { 
version = "5.0.1", path = "../swc_ecma_ast" } -swc_ecma_visit = { version = "5.0.0", path = "../swc_ecma_visit", optional = true } -logos = "0.15.0" +serde = { workspace = true, features = ["derive"] } +smallvec = { workspace = true } +smartstring = { workspace = true } +tracing = { workspace = true } +typed-arena = { workspace = true } + +swc_atoms = { version = "3.0.2", path = "../swc_atoms" } +swc_common = { version = "5.0.0", path = "../swc_common" } +swc_ecma_ast = { version = "5.0.1", path = "../swc_ecma_ast" } +swc_ecma_raw_lexer = { version = "0.1.0", path = "../swc_ecma_raw_lexer" } +swc_ecma_visit = { version = "5.0.0", path = "../swc_ecma_visit", optional = true } [target.'cfg(not(any(target_arch = "wasm32", target_arch = "arm")))'.dependencies] stacker = { version = "0.1.15", optional = true } diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 31bb91f29d3a..796aefadb649 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -121,7 +121,7 @@ pub struct Lexer<'a> { comments_buffer: Option, pub(crate) ctx: Context, - input: StringInput<'a>, + input: logos::Lexer<'a, RawToken>, start_pos: BytePos, state: State, From 6c9e5b8ba5374856ee9d7c486538a71a342acbf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:35:47 +0900 Subject: [PATCH 006/201] raw token --- crates/swc_ecma_raw_lexer/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 17b985afc069..fcaf1d3d740d 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -1,6 +1,6 @@ use logos::Logos; -#[derive(Logos, Debug, PartialEq)] +#[derive(Logos, Debug, Clone, Copy, PartialEq)] pub enum RawToken { #[token("=>", priority = 3)] Arrow, From 87a88dc9f5c09c1a185e6bbfd7d3953133ea14c8 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:38:47 +0900 Subject: [PATCH 007/201] Fix string input --- crates/swc_common/src/input.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/swc_common/src/input.rs b/crates/swc_common/src/input.rs index 52c325c6b705..c7565721dff3 100644 --- a/crates/swc_common/src/input.rs +++ b/crates/swc_common/src/input.rs @@ -40,7 +40,7 @@ impl<'a> StringInput<'a> { } #[inline(always)] - pub fn as_str(&self) -> &str { + pub fn as_str(&self) -> &'a str { self.iter.as_str() } From 13291dbb18deb6b08c2952c854d9ba320ce37c6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:40:53 +0900 Subject: [PATCH 008/201] raw token --- crates/swc_ecma_raw_lexer/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index fcaf1d3d740d..731c8ffe0f3a 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -1,6 +1,6 @@ use logos::Logos; -#[derive(Logos, Debug, Clone, Copy, PartialEq)] +#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] pub enum RawToken { #[token("=>", priority = 3)] Arrow, From 91cbf62885253672bf15b9d112b7ea10d6dcae32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:42:30 +0900 Subject: [PATCH 009/201] bump --- crates/swc_ecma_parser/src/lexer/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 796aefadb649..772aa39db2dd 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -241,15 +241,14 @@ impl<'a> Lexer<'a> { unsafe { // Safety: cur() is Some // 1st `.` - self.input.bump(); + self.input.bump(1); } if next == '.' 
&& self.input.peek() == Some('.') { unsafe { // Safety: peek() was Some - self.input.bump(); // 2nd `.` - self.input.bump(); // 3rd `.` + self.input.bump(2); `..` } return Ok(tok!("...")); From 0d66e6500de06d99eb575d410089696a4f8a7d51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:44:08 +0900 Subject: [PATCH 010/201] mod --- crates/swc_ecma_parser/src/lexer/mod.rs | 8 ++++---- crates/swc_ecma_parser/src/lexer/state.rs | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 772aa39db2dd..2e0dd2f9a68e 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -400,7 +400,7 @@ impl<'a> Lexer<'a> { let is_mul = c == b'*'; unsafe { // Safety: cur() is Some(c) - self.input.bump(); + self.input.bump(1 ); } let mut token = if is_mul { Token::BinOp(BinOpToken::Mul) @@ -546,7 +546,7 @@ impl<'a> Lexer<'a> { unsafe { // Safety: cur() is Some(c) if this method is called. - self.input.bump(); + self.input.bump(1 ); } Ok(Some(vec![c.into()])) @@ -557,14 +557,14 @@ impl<'a> Lexer<'a> { unsafe { // Safety: cur() is Some(c), if this method is called. 
- self.input.bump(); + self.input.bump(1); } // '++', '--' Ok(Some(if self.input.cur() == Some(c as char) { unsafe { // Safety: cur() is Some(c) - self.input.bump(); + self.input.bump(1); } // Handle --> diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index c881a0bc8d07..0167a2640e3e 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -6,7 +6,7 @@ use tracing::trace; use super::{ comments_buffer::{BufferedComment, BufferedCommentKind}, - Context, Input, Lexer, + Context, Lexer, }; use crate::{ error::{Error, SyntaxError}, @@ -302,7 +302,7 @@ impl Lexer<'_> { if c == '>' { unsafe { // Safety: cur() is Some('>') - self.input.bump(); + self.input.bump(1); } return Ok(Some(Token::JSXTagEnd)); } @@ -320,7 +320,7 @@ impl Lexer<'_> { unsafe { // Safety: cur() is Some('<') - self.input.bump(); + self.input.bump(1); } if had_line_break_before_last && self.is_str("<<<<<< ") { From 5a0254ae296e57e8b18153a49f77cf118f26cf8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:45:34 +0900 Subject: [PATCH 011/201] bump(1) --- crates/swc_ecma_parser/src/lexer/jsx.rs | 4 +- crates/swc_ecma_parser/src/lexer/mod.rs | 29 +- crates/swc_ecma_parser/src/lexer/table.rs | 399 ---------------------- 3 files changed, 13 insertions(+), 419 deletions(-) delete mode 100644 crates/swc_ecma_parser/src/lexer/table.rs diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index b3c8852773c1..5aaf136ffafe 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -89,13 +89,13 @@ impl Lexer<'_> { ); unsafe { // Safety: cur() was Some('}') - self.input.bump() + self.input.bump(1) } } '&' => { value.push_str(unsafe { // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) + self.input.slice()[chunk_start..cur_pos] }); let 
jsx_entity = self.read_jsx_entity()?; diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 2e0dd2f9a68e..0483cc9f32df 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -8,12 +8,7 @@ use swc_atoms::{Atom, AtomStoreCell}; use swc_common::{comments::Comments, input::StringInput, BytePos, Span}; use swc_ecma_ast::{op, AssignOp, EsVersion, Ident}; -use self::{ - comments_buffer::CommentsBuffer, - state::State, - table::{ByteHandler, BYTE_HANDLERS}, - util::*, -}; +use self::{comments_buffer::CommentsBuffer, state::State, util::*}; pub use self::{ input::Input, state::{TokenContext, TokenContexts}, @@ -30,7 +25,6 @@ pub mod input; mod jsx; mod number; mod state; -mod table; #[cfg(test)] mod tests; pub mod util; @@ -248,7 +242,7 @@ impl<'a> Lexer<'a> { unsafe { // Safety: peek() was Some - self.input.bump(2); `..` + self.input.bump(2); // `..` } return Ok(tok!("...")); @@ -266,13 +260,12 @@ impl<'a> Lexer<'a> { Some('?') => { unsafe { // Safety: peek() was some - self.input.bump(); - self.input.bump(); + self.input.bump(2); } if self.input.cur() == Some('=') { unsafe { // Safety: cur() was some - self.input.bump(); + self.input.bump(1); } return Ok(tok!("??=")); @@ -282,7 +275,7 @@ impl<'a> Lexer<'a> { _ => { unsafe { // Safety: peek() is callable only if cur() is Some - self.input.bump(); + self.input.bump(1); } Ok(tok!('?')) } @@ -296,7 +289,7 @@ impl<'a> Lexer<'a> { fn read_token_colon(&mut self) -> LexResult { unsafe { // Safety: cur() is Some(':') - self.input.bump(); + self.input.bump(1); } Ok(tok!(':')) } @@ -336,7 +329,7 @@ impl<'a> Lexer<'a> { unsafe { // Safety: cur() is Some(c as char) - self.input.bump(); + self.input.bump(1); } let token = if c == b'&' { BinOpToken::BitAnd @@ -357,13 +350,13 @@ impl<'a> Lexer<'a> { if self.input.cur() == Some(c as char) { unsafe { // Safety: cur() is Some(c) - self.input.bump(); + self.input.bump(1); } if self.input.cur() == 
Some('=') { unsafe { // Safety: cur() is Some('=') - self.input.bump(); + self.input.bump(1); } return Ok(Token::AssignOp(match token { BinOpToken::BitAnd => op!("&&="), @@ -400,7 +393,7 @@ impl<'a> Lexer<'a> { let is_mul = c == b'*'; unsafe { // Safety: cur() is Some(c) - self.input.bump(1 ); + self.input.bump(1); } let mut token = if is_mul { Token::BinOp(BinOpToken::Mul) @@ -546,7 +539,7 @@ impl<'a> Lexer<'a> { unsafe { // Safety: cur() is Some(c) if this method is called. - self.input.bump(1 ); + self.input.bump(1); } Ok(Some(vec![c.into()])) diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs deleted file mode 100644 index b190245b3f80..000000000000 --- a/crates/swc_ecma_parser/src/lexer/table.rs +++ /dev/null @@ -1,399 +0,0 @@ -//! Lookup table for byte handlers. -//! -//! Idea is taken from ratel. -//! -//! https://github.com/ratel-rust/ratel-core/blob/e55a1310ba69a3f5ce2a9a6eef643feced02ac08/ratel/src/lexer/mod.rs#L665 - -use either::Either; -use swc_common::input::Input; -use swc_ecma_ast::AssignOp; - -use super::{pos_span, util::CharExt, LexResult, Lexer}; -use crate::{ - error::SyntaxError, - token::{BinOpToken, IdentLike, Keyword, KnownIdent, Token, Word}, -}; - -pub(super) type ByteHandler = Option fn(&mut Lexer<'aa>) -> LexResult>>; - -/// Lookup table mapping any incoming byte to a handler function defined below. 
-pub(super) static BYTE_HANDLERS: [ByteHandler; 256] = [ - // 0 1 2 3 4 5 6 7 8 9 A B C D E F // - EOF, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 0 - ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1 - ___, EXL, QOT, HSH, IDN, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 - ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, MOR, QST, // 3 - AT_, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, // 4 - IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, BTO, IDN, BTC, CRT, IDN, // 5 - TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H, L_I, L_J, L_K, L_L, L_M, L_N, L_O, // 6 - L_P, L_Q, L_R, L_S, L_T, L_U, L_V, L_W, L_X, L_Y, L_Z, BEO, PIP, BEC, TLD, ERR, // 7 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F -]; - -const ___: ByteHandler = None; - -const EOF: ByteHandler = Some(|lexer| { - lexer.input.bump_bytes(1); - - Ok(None) -}); - -const ERR: ByteHandler = Some(|lexer| { - let c = unsafe { - // Safety: Byte handler is only called for non-last chracters - lexer.input.cur().unwrap_unchecked() - }; - - let start = lexer.cur_pos(); - unsafe { - // Safety: Byte handler is only called for non-last chracters - lexer.input.bump(); - } - lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? 
-}); - -/// Identifier and we know that this cannot be a keyword or known ident. -const IDN: ByteHandler = Some(|lexer| lexer.read_ident_unknown().map(Some)); - -const L_A: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "abstract" => Some(Word::Ident(IdentLike::Known(KnownIdent::Abstract))), - "as" => Some(Word::Ident(IdentLike::Known(KnownIdent::As))), - "await" => Some(Word::Keyword(Keyword::Await)), - "async" => Some(Word::Ident(IdentLike::Known(KnownIdent::Async))), - "assert" => Some(Word::Ident(IdentLike::Known(KnownIdent::Assert))), - "asserts" => Some(Word::Ident(IdentLike::Known(KnownIdent::Asserts))), - "any" => Some(Word::Ident(IdentLike::Known(KnownIdent::Any))), - "accessor" => Some(Word::Ident(IdentLike::Known(KnownIdent::Accessor))), - _ => None, - }) -}); - -const L_B: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "break" => Some(Word::Keyword(Keyword::Break)), - "boolean" => Some(Word::Ident(IdentLike::Known(KnownIdent::Boolean))), - "bigint" => Some(Word::Ident(IdentLike::Known(KnownIdent::Bigint))), - _ => None, - }) -}); - -const L_C: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "case" => Some(Word::Keyword(Keyword::Case)), - "catch" => Some(Word::Keyword(Keyword::Catch)), - "class" => Some(Word::Keyword(Keyword::Class)), - "const" => Some(Word::Keyword(Keyword::Const)), - "continue" => Some(Word::Keyword(Keyword::Continue)), - _ => None, - }) -}); - -const L_D: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "debugger" => Some(Word::Keyword(Keyword::Debugger)), - "default" => Some(Word::Keyword(Keyword::Default_)), - "delete" => Some(Word::Keyword(Keyword::Delete)), - "do" => Some(Word::Keyword(Keyword::Do)), - "declare" => Some(Word::Ident(IdentLike::Known(KnownIdent::Declare))), - _ => None, - }) -}); - -const L_E: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "else" => Some(Word::Keyword(Keyword::Else)), - "enum" => 
Some(Word::Ident(IdentLike::Known(KnownIdent::Enum))), - "export" => Some(Word::Keyword(Keyword::Export)), - "extends" => Some(Word::Keyword(Keyword::Extends)), - _ => None, - }) -}); - -const L_F: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "false" => Some(Word::False), - "finally" => Some(Word::Keyword(Keyword::Finally)), - "for" => Some(Word::Keyword(Keyword::For)), - "function" => Some(Word::Keyword(Keyword::Function)), - "from" => Some(Word::Ident(IdentLike::Known(KnownIdent::From))), - _ => None, - }) -}); - -const L_G: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "global" => Some(Word::Ident(IdentLike::Known(KnownIdent::Global))), - "get" => Some(Word::Ident(IdentLike::Known(KnownIdent::Get))), - _ => None, - }) -}); - -const L_H: ByteHandler = IDN; - -const L_I: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "if" => Some(Word::Keyword(Keyword::If)), - "import" => Some(Word::Keyword(Keyword::Import)), - "in" => Some(Word::Keyword(Keyword::In)), - "instanceof" => Some(Word::Keyword(Keyword::InstanceOf)), - "is" => Some(Word::Ident(IdentLike::Known(KnownIdent::Is))), - "infer" => Some(Word::Ident(IdentLike::Known(KnownIdent::Infer))), - "interface" => Some(Word::Ident(IdentLike::Known(KnownIdent::Interface))), - "implements" => Some(Word::Ident(IdentLike::Known(KnownIdent::Implements))), - "intrinsic" => Some(Word::Ident(IdentLike::Known(KnownIdent::Intrinsic))), - _ => None, - }) -}); - -const L_J: ByteHandler = IDN; - -const L_K: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "keyof" => Some(Word::Ident(IdentLike::Known(KnownIdent::Keyof))), - _ => None, - }) -}); - -const L_L: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "let" => Some(Word::Keyword(Keyword::Let)), - _ => None, - }) -}); - -const L_M: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "meta" => Some(Word::Ident(IdentLike::Known(KnownIdent::Meta))), - _ => 
None, - }) -}); - -const L_N: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "new" => Some(Word::Keyword(Keyword::New)), - "null" => Some(Word::Null), - "number" => Some(Word::Ident(IdentLike::Known(KnownIdent::Number))), - "never" => Some(Word::Ident(IdentLike::Known(KnownIdent::Never))), - "namespace" => Some(Word::Ident(IdentLike::Known(KnownIdent::Namespace))), - _ => None, - }) -}); - -const L_O: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "of" => Some(Word::Ident(IdentLike::Known(KnownIdent::Of))), - "object" => Some(Word::Ident(IdentLike::Known(KnownIdent::Object))), - _ => None, - }) -}); - -const L_P: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "public" => Some(Word::Ident(IdentLike::Known(KnownIdent::Public))), - "package" => Some(Word::Ident(IdentLike::Known(KnownIdent::Package))), - "protected" => Some(Word::Ident(IdentLike::Known(KnownIdent::Protected))), - "private" => Some(Word::Ident(IdentLike::Known(KnownIdent::Private))), - _ => None, - }) -}); - -const L_Q: ByteHandler = IDN; - -const L_R: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "return" => Some(Word::Keyword(Keyword::Return)), - "readonly" => Some(Word::Ident(IdentLike::Known(KnownIdent::Readonly))), - "require" => Some(Word::Ident(IdentLike::Known(KnownIdent::Require))), - _ => None, - }) -}); - -const L_S: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "super" => Some(Word::Keyword(Keyword::Super)), - "static" => Some(Word::Ident(IdentLike::Known(KnownIdent::Static))), - "switch" => Some(Word::Keyword(Keyword::Switch)), - "symbol" => Some(Word::Ident(IdentLike::Known(KnownIdent::Symbol))), - "set" => Some(Word::Ident(IdentLike::Known(KnownIdent::Set))), - "string" => Some(Word::Ident(IdentLike::Known(KnownIdent::String))), - "satisfies" => Some(Word::Ident(IdentLike::Known(KnownIdent::Satisfies))), - _ => None, - }) -}); - -const L_T: ByteHandler = Some(|lexer| { - 
lexer.read_word_with(&|s| match s { - "this" => Some(Word::Keyword(Keyword::This)), - "throw" => Some(Word::Keyword(Keyword::Throw)), - "true" => Some(Word::True), - "typeof" => Some(Word::Keyword(Keyword::TypeOf)), - "try" => Some(Word::Keyword(Keyword::Try)), - "type" => Some(Word::Ident(IdentLike::Known(KnownIdent::Type))), - "target" => Some(Word::Ident(IdentLike::Known(KnownIdent::Target))), - _ => None, - }) -}); - -const L_U: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "using" => Some(Word::Ident(IdentLike::Known(KnownIdent::Using))), - "unique" => Some(Word::Ident(IdentLike::Known(KnownIdent::Unique))), - "undefined" => Some(Word::Ident(IdentLike::Known(KnownIdent::Undefined))), - "unknown" => Some(Word::Ident(IdentLike::Known(KnownIdent::Unknown))), - _ => None, - }) -}); - -const L_V: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "var" => Some(Word::Keyword(Keyword::Var)), - "void" => Some(Word::Keyword(Keyword::Void)), - _ => None, - }) -}); - -const L_W: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "while" => Some(Word::Keyword(Keyword::While)), - "with" => Some(Word::Keyword(Keyword::With)), - _ => None, - }) -}); - -const L_X: ByteHandler = IDN; - -const L_Y: ByteHandler = Some(|lexer| { - lexer.read_word_with(&|s| match s { - "yield" => Some(Word::Keyword(Keyword::Yield)), - _ => None, - }) -}); - -const L_Z: ByteHandler = IDN; - -/// `0` -const ZER: ByteHandler = Some(|lexer| lexer.read_token_zero().map(Some)); - -/// Numbers -const DIG: ByteHandler = Some(|lexer| { - lexer - .read_number(false) - .map(|v| match v { - Either::Left((value, raw)) => Token::Num { value, raw }, - Either::Right((value, raw)) => Token::BigInt { value, raw }, - }) - .map(Some) -}); - -/// String literals with `'` or `"` -const QOT: ByteHandler = Some(|lexer| lexer.read_str_lit().map(Some)); - -/// Unicode -const UNI: ByteHandler = Some(|lexer| { - let c = unsafe { - // Safety: Byte handler is only 
called for non-last chracters - lexer.input.cur().unwrap_unchecked() - }; - - // Identifier or keyword. '\uXXXX' sequences are allowed in - // identifiers, so '\' also dispatches to that. - if c == '\\' || c.is_ident_start() { - return lexer.read_ident_unknown().map(Some); - } - - let start = lexer.cur_pos(); - unsafe { - // Safety: Byte handler is only called for non-last chracters - lexer.input.bump(); - } - lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })? -}); - -/// `:` -const COL: ByteHandler = Some(|lexer| lexer.read_token_colon().map(Some)); - -/// `%` -const PRC: ByteHandler = Some(|lexer| lexer.read_token_mul_mod(b'%').map(Some)); - -/// `*` -const ATR: ByteHandler = Some(|lexer| lexer.read_token_mul_mod(b'*').map(Some)); - -/// `?` -const QST: ByteHandler = Some(|lexer| lexer.read_token_question_mark().map(Some)); - -/// `&` -const AMP: ByteHandler = Some(|lexer| lexer.read_token_logical(b'&').map(Some)); - -/// `|` -const PIP: ByteHandler = Some(|lexer| lexer.read_token_logical(b'|').map(Some)); - -macro_rules! 
single_char { - ($name:ident, $c:literal, $token:ident) => { - const $name: ByteHandler = Some(|lexer| { - lexer.input.bump_bytes(1); - Ok(Some(Token::$token)) - }); - }; -} - -single_char!(SEM, b';', Semi); -single_char!(COM, b',', Comma); -single_char!(TPL, b'`', BackQuote); -single_char!(TLD, b'~', Tilde); -single_char!(AT_, b'@', At); - -single_char!(PNO, b'(', LParen); -single_char!(PNC, b')', RParen); - -single_char!(BTO, b'[', LBracket); -single_char!(BTC, b']', RBracket); - -single_char!(BEO, b'{', LBrace); -single_char!(BEC, b'}', RBrace); - -/// `^` -const CRT: ByteHandler = Some(|lexer| { - // Bitwise xor - lexer.input.bump_bytes(1); - Ok(Some(if lexer.input.cur_as_ascii() == Some(b'=') { - lexer.input.bump_bytes(1); - Token::AssignOp(AssignOp::BitXorAssign) - } else { - Token::BinOp(BinOpToken::BitXor) - })) -}); - -/// `+` -const PLS: ByteHandler = Some(|lexer| lexer.read_token_plus_minus(b'+')); - -/// `-` -const MIN: ByteHandler = Some(|lexer| lexer.read_token_plus_minus(b'-')); - -/// `!` -const EXL: ByteHandler = Some(|lexer| lexer.read_token_bang_or_eq(b'!')); - -/// `=` -const EQL: ByteHandler = Some(|lexer| lexer.read_token_bang_or_eq(b'=')); - -/// `.` -const PRD: ByteHandler = Some(|lexer| lexer.read_token_dot().map(Some)); - -/// `<` -const LSS: ByteHandler = Some(|lexer| lexer.read_token_lt_gt()); - -/// `>` -const MOR: ByteHandler = Some(|lexer| lexer.read_token_lt_gt()); - -/// `/` -const SLH: ByteHandler = Some(|lexer| lexer.read_slash()); - -/// `#` -const HSH: ByteHandler = Some(|lexer| lexer.read_token_number_sign()); From 13563e43d0cc5cd30a32c97a34facff137dda2cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:45:43 +0900 Subject: [PATCH 012/201] cargo lockfile --- Cargo.lock | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 10162ae199d5..27232ff4a3ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5257,6 +5257,7 @@ dependencies = [ 
"codspeed-criterion-compat", "criterion", "either", + "logos", "new_debug_unreachable", "num-bigint", "num-traits", @@ -5270,6 +5271,7 @@ dependencies = [ "swc_atoms", "swc_common", "swc_ecma_ast", + "swc_ecma_raw_lexer", "swc_ecma_visit", "swc_malloc", "testing", From 38b063b5f88dfbd185dba3379459479b3b98c2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 05:46:28 +0900 Subject: [PATCH 013/201] bump --- crates/swc_ecma_parser/src/lexer/mod.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 0483cc9f32df..3e7ed57a8f36 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -594,7 +594,7 @@ impl<'a> Lexer<'a> { unsafe { // Safety: cur() is Some(c) if this method is called. - self.input.bump(); + self.input.bump(1); } Ok(Some(if self.input.eat_byte(b'=') { @@ -821,7 +821,7 @@ impl Lexer<'_> { let s = unsafe { // Safety: start and end are valid position because we got them from // `self.input` - l.input.slice(slice_start, start) + l.input.slice()[slice_start..start] }; buf.push_str(s); unsafe { @@ -879,7 +879,7 @@ impl Lexer<'_> { let s = unsafe { // Safety: slice_start and end are valid position because we got them from // `self.input` - l.input.slice(slice_start, end) + l.input.slice()[slice_start..end] }; let s = unsafe { // Safety: We don't use 'static. We just bypass the lifetime check. @@ -891,7 +891,7 @@ impl Lexer<'_> { let s = unsafe { // Safety: slice_start and end are valid position because we got them from // `self.input` - l.input.slice(slice_start, end) + l.input.slice()[slice_start..end] }; buf.push_str(s); @@ -1208,10 +1208,8 @@ impl Lexer<'_> { return Ok(None); } unsafe { - // Safety: cur() is Some('#') - self.input.bump(); - // Safety: cur() is Some('!') - self.input.bump(); + // Safety: "#!" 
+ self.input.bump(2); } let s = self.input.uncons_while(|c| !c.is_line_terminator()); Ok(Some(self.atoms.atom(s))) @@ -1231,7 +1229,7 @@ impl Lexer<'_> { cooked.push_str(unsafe { // Safety: Both of start and last_pos are valid position because we got them // from `self.input` - self.input.slice(cooked_slice_start, last_pos) + self.input.slice()[cooked_slice_start..last_pos] }); } }}; @@ -1256,7 +1254,7 @@ impl Lexer<'_> { let s = unsafe { // Safety: Both of start and last_pos are valid position because we got them // from `self.input` - self.input.slice(cooked_slice_start, last_pos) + self.input.slice()[cooked_slice_start..last_pos] }; Ok(self.atoms.atom(s)) @@ -1271,7 +1269,7 @@ impl Lexer<'_> { let raw = unsafe { // Safety: Both of start and last_pos are valid position because we got them // from `self.input` - self.input.slice(raw_slice_start, end) + self.input.slice()[raw_slice_start..end] }; return Ok(Token::Template { cooked, From ad86d37b7097450376eddc9ac3b569f940830d2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:10:41 +0900 Subject: [PATCH 014/201] LexError --- crates/swc_ecma_raw_lexer/src/lib.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 731c8ffe0f3a..54c11d0d6fde 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -1,6 +1,7 @@ use logos::Logos; #[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] +#[logos(error = LexError)] pub enum RawToken { #[token("=>", priority = 3)] Arrow, @@ -208,3 +209,9 @@ pub enum RawToken { #[regex(r"\P{ID_Start}\P{ID_Continue}*")] Ident, } + +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +pub enum LexError { + #[default] + UnexpectedEof, +} From 3adc46d9ca163454beaf2a4884e91aaf21966f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:11:15 +0900 Subject: 
[PATCH 015/201] self.input.bump(1) --- crates/swc_ecma_parser/src/lexer/jsx.rs | 20 ++++++++++---------- crates/swc_ecma_parser/src/lexer/mod.rs | 4 ++-- crates/swc_ecma_parser/src/lexer/number.rs | 2 +- crates/swc_ecma_parser/src/lexer/util.rs | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index 5aaf136ffafe..6bbb87e3a095 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -36,7 +36,7 @@ impl Lexer<'_> { if cur == '<' && self.state.is_expr_allowed { unsafe { // Safety: cur() was Some('<') - self.input.bump(); + self.input.bump(1); } return Ok(Some(Token::JSXTagStart)); } @@ -77,7 +77,7 @@ impl Lexer<'_> { ); unsafe { // Safety: cur() was Some('>') - self.input.bump() + self.input.bump(1) } } '}' => { @@ -118,7 +118,7 @@ impl Lexer<'_> { } else { unsafe { // Safety: cur() was Some(c) - self.input.bump() + self.input.bump(1) } } } @@ -153,7 +153,7 @@ impl Lexer<'_> { debug_assert_eq!(c, Some('&')); unsafe { // Safety: cur() was Some('&') - self.input.bump(); + self.input.bump(1); } let start_pos = self.input.cur_pos(); @@ -165,7 +165,7 @@ impl Lexer<'_> { }; unsafe { // Safety: cur() was Some(c) - self.input.bump(); + self.input.bump(1); } if c == ';' { @@ -208,13 +208,13 @@ impl Lexer<'_> { let ch = self.input.cur().unwrap(); unsafe { // Safety: cur() was Some(ch) - self.input.bump(); + self.input.bump(1); } let out = if ch == '\r' && self.input.cur() == Some('\n') { unsafe { // Safety: cur() was Some('\n') - self.input.bump(); + self.input.bump(1); } Either::Left(if normalize_crlf { "\n" } else { "\r\n" }) } else { @@ -234,7 +234,7 @@ impl Lexer<'_> { unsafe { // Safety: cur() was Some(quote) - self.input.bump(); // `quote` + self.input.bump(1); // `quote` } let mut out = String::new(); @@ -306,7 +306,7 @@ impl Lexer<'_> { } else { unsafe { // Safety: cur() was Some(ch) - self.input.bump(); + 
self.input.bump(1); } } } @@ -338,7 +338,7 @@ impl Lexer<'_> { if self.input.peek_ahead().is_some() { unsafe { // Safety: We called peek_ahead() which means cur() was Some - self.input.bump(); + self.input.bump(1); } } diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 3e7ed57a8f36..875b36211adf 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -197,7 +197,7 @@ impl<'a> Lexer<'a> { unsafe { // Safety: cur() is Some('#') - self.input.bump(); // '#' + self.input.bump(1); // '#' } // `#` can also be a part of shebangs, however they should have been @@ -220,7 +220,7 @@ impl<'a> Lexer<'a> { None => { unsafe { // Safety: cur() is Some(',') - self.input.bump(); + self.input.bump(1); } return Ok(tok!('.')); } diff --git a/crates/swc_ecma_parser/src/lexer/number.rs b/crates/swc_ecma_parser/src/lexer/number.rs index 0a13ff969d46..5fc15f1e510b 100644 --- a/crates/swc_ecma_parser/src/lexer/number.rs +++ b/crates/swc_ecma_parser/src/lexer/number.rs @@ -477,7 +477,7 @@ impl Lexer<'_> { // Ignore this _ character unsafe { // Safety: cur() returns Some(c) where c is a valid char - self.input.bump(); + self.input.bump(1); } continue; diff --git a/crates/swc_ecma_parser/src/lexer/util.rs b/crates/swc_ecma_parser/src/lexer/util.rs index 0dd8c52a834d..5b8711981d4f 100644 --- a/crates/swc_ecma_parser/src/lexer/util.rs +++ b/crates/swc_ecma_parser/src/lexer/util.rs @@ -38,7 +38,7 @@ impl Lexer<'_> { pub(super) fn bump(&mut self) { unsafe { // Safety: Actually this is not safe but this is an internal method. 
- self.input.bump() + self.input.bump(1) } } From 92004631b2343d660b0c9d32e6d365437ac6b934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:13:24 +0900 Subject: [PATCH 016/201] RawBuffer --- crates/swc_ecma_raw_lexer/src/lib.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 54c11d0d6fde..40b7004acf25 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -1,5 +1,22 @@ use logos::Logos; +#[derive(Debug, Clone)] +pub struct RawBuffer<'a>(logos::Lexer<'a, RawToken>); + +impl std::ops::DerefMut for RawBuffer<'_> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl<'a> std::ops::Deref for RawBuffer<'a> { + type Target = logos::Lexer<'a, RawToken>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + #[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] #[logos(error = LexError)] pub enum RawToken { From 68321b12f28db1044ad9c1fd596b91e93f12bb8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:17:29 +0900 Subject: [PATCH 017/201] RawBuffer --- crates/swc_ecma_raw_lexer/Cargo.toml | 1 + crates/swc_ecma_raw_lexer/src/lib.rs | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/swc_ecma_raw_lexer/Cargo.toml b/crates/swc_ecma_raw_lexer/Cargo.toml index 1867677e7632..df8e0b127ee9 100644 --- a/crates/swc_ecma_raw_lexer/Cargo.toml +++ b/crates/swc_ecma_raw_lexer/Cargo.toml @@ -11,3 +11,4 @@ version = "0.1.0" [dependencies] logos = "0.15.0" +swc_common = { version = "5.0.0", path = "../swc_common" } diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 40b7004acf25..1e57827f9b79 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -1,19 +1,18 @@ use 
logos::Logos; +use swc_common::{input::StringInput, BytePos}; #[derive(Debug, Clone)] -pub struct RawBuffer<'a>(logos::Lexer<'a, RawToken>); - -impl std::ops::DerefMut for RawBuffer<'_> { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } +pub struct RawBuffer<'a> { + lexer: logos::Lexer<'a, RawToken>, + pos: BytePos, } -impl<'a> std::ops::Deref for RawBuffer<'a> { - type Target = logos::Lexer<'a, RawToken>; - - fn deref(&self) -> &Self::Target { - &self.0 +impl<'a> RawBuffer<'a> { + pub fn new(input: StringInput<'a>) -> Self { + Self { + lexer: logos::Lexer::new(input.as_str()), + pos: input.start_pos(), + } } } From b84a9e7d4ad5aee93135a6928362dc27f1931b82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:20:49 +0900 Subject: [PATCH 018/201] raw buffer work --- crates/swc_ecma_raw_lexer/src/lib.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 1e57827f9b79..34c20a3f45ec 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -14,6 +14,22 @@ impl<'a> RawBuffer<'a> { pos: input.start_pos(), } } + + pub fn cur_pos(&self) -> BytePos { + self.pos + } + + pub fn cur(&self) -> Result, LexError> { + self.lexer.clone().next().transpose() + } +} + +impl Iterator for RawBuffer<'_> { + type Item = Result; + + fn next(&mut self) -> Option { + self.lexer.next() + } } #[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] From 52265b497688cec4b06bb7b803234d12c565b600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:24:43 +0900 Subject: [PATCH 019/201] RawBuffer --- crates/swc_ecma_raw_lexer/src/lib.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 34c20a3f45ec..c60423659379 100644 --- 
a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -5,6 +5,8 @@ use swc_common::{input::StringInput, BytePos}; pub struct RawBuffer<'a> { lexer: logos::Lexer<'a, RawToken>, pos: BytePos, + orig_str: &'a str, + start_pos: BytePos, } impl<'a> RawBuffer<'a> { @@ -12,6 +14,8 @@ impl<'a> RawBuffer<'a> { Self { lexer: logos::Lexer::new(input.as_str()), pos: input.start_pos(), + orig_str: input.as_str(), + start_pos: input.start_pos(), } } @@ -22,6 +26,24 @@ impl<'a> RawBuffer<'a> { pub fn cur(&self) -> Result, LexError> { self.lexer.clone().next().transpose() } + + /// # Safety + /// + /// - `start` and `end` must be within the bounds of `self.orig_str` + pub unsafe fn slice(&self, start: BytePos, end: BytePos) -> &str { + let lo = start.0 - self.start_pos.0; + let hi = end.0 - self.start_pos.0; + + self.orig_str.get_unchecked(lo as usize..hi as usize) + } + + /// # Safety + /// + /// - `n` must be equal or smaller than lefting length of `self.orig_str` + pub unsafe fn bump(&mut self, n: usize) { + self.lexer.bump(n); + self.pos = self.pos + BytePos(n as u32); + } } impl Iterator for RawBuffer<'_> { From 28414ec84ba128ae5dbc72bd3c4c2b8bc42d2c9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:26:04 +0900 Subject: [PATCH 020/201] lexerror --- crates/swc_ecma_parser/src/error.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/crates/swc_ecma_parser/src/error.rs b/crates/swc_ecma_parser/src/error.rs index 37ebffe5b5ea..224a331c46e7 100644 --- a/crates/swc_ecma_parser/src/error.rs +++ b/crates/swc_ecma_parser/src/error.rs @@ -5,7 +5,7 @@ use std::{borrow::Cow, fmt::Debug}; use swc_atoms::JsWord; use swc_common::{ errors::{DiagnosticBuilder, Handler}, - Span, Spanned, + Span, Spanned, DUMMY_SP, }; use crate::token::Token; @@ -800,3 +800,14 @@ impl Error { fn size_of_error() { assert_eq!(std::mem::size_of::(), 8); } + +impl From for Error { + fn 
from(e: swc_ecma_raw_lexer::LexError) -> Self { + Self::new( + DUMMY_SP, + match e { + swc_ecma_raw_lexer::LexError::UnexpectedEof => SyntaxError::Eof, + }, + ) + } +} From 3861977772a1fff64ff8c2dd38eade706f97f963 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:28:54 +0900 Subject: [PATCH 021/201] more lexer work --- crates/swc_ecma_parser/src/lexer/jsx.rs | 12 ++++++------ crates/swc_ecma_parser/src/lexer/mod.rs | 9 +++++---- crates/swc_ecma_parser/src/lexer/number.rs | 2 +- crates/swc_ecma_parser/src/lexer/state.rs | 2 +- crates/swc_ecma_parser/src/lexer/util.rs | 5 +---- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index 6bbb87e3a095..fd037560d4ac 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -12,7 +12,7 @@ impl Lexer<'_> { let mut value = String::new(); loop { - let cur = match self.input.cur() { + let cur = match self.input.cur()? { Some(c) => c, None => { let start = self.state.start; @@ -95,7 +95,7 @@ impl Lexer<'_> { '&' => { value.push_str(unsafe { // Safety: We already checked for the range - self.input.slice()[chunk_start..cur_pos] + self.input.slice(chunk_start, cur_pos) }); let jsx_entity = self.read_jsx_entity()?; @@ -159,7 +159,7 @@ impl Lexer<'_> { let start_pos = self.input.cur_pos(); for _ in 0..10 { - let c = match self.input.cur() { + let c = match self.input.cur()? { Some(c) => c, None => break, }; @@ -241,7 +241,7 @@ impl Lexer<'_> { let mut chunk_start = self.input.cur_pos(); loop { - let ch = match self.input.cur() { + let ch = match self.input.cur()? { Some(c) => c, None => { let start = self.state.start; @@ -362,8 +362,8 @@ impl Lexer<'_> { /// by isIdentifierStart in readToken. 
pub(super) fn read_jsx_word(&mut self) -> LexResult { debug_assert!(self.syntax.jsx()); - debug_assert!(self.input.cur().is_some()); - debug_assert!(self.input.cur().unwrap().is_ident_start()); + debug_assert!(self.input.cur()?.is_some()); + debug_assert!(self.input.cur()?.unwrap().is_ident_start()); let mut first = true; let slice = self.input.uncons_while(|c| { diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 875b36211adf..d84b80f25a69 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -7,6 +7,7 @@ use smallvec::{smallvec, SmallVec}; use swc_atoms::{Atom, AtomStoreCell}; use swc_common::{comments::Comments, input::StringInput, BytePos, Span}; use swc_ecma_ast::{op, AssignOp, EsVersion, Ident}; +use swc_ecma_raw_lexer::RawBuffer; use self::{comments_buffer::CommentsBuffer, state::State, util::*}; pub use self::{ @@ -115,7 +116,7 @@ pub struct Lexer<'a> { comments_buffer: Option, pub(crate) ctx: Context, - input: logos::Lexer<'a, RawToken>, + input: RawBuffer<'a>, start_pos: BytePos, state: State, @@ -145,7 +146,7 @@ impl<'a> Lexer<'a> { comments, comments_buffer: comments.is_some().then(CommentsBuffer::new), ctx: Default::default(), - input, + input: RawBuffer::new(input), start_pos, state: State::new(syntax, start_pos), syntax, @@ -171,8 +172,8 @@ impl<'a> Lexer<'a> { /// babel: `getTokenFromCode` fn read_token(&mut self) -> LexResult> { - let byte = match self.input.as_str().as_bytes().first() { - Some(&v) => v, + let byte = match self.input.next() { + Some(v) => v, None => return Ok(None), }; diff --git a/crates/swc_ecma_parser/src/lexer/number.rs b/crates/swc_ecma_parser/src/lexer/number.rs index 5fc15f1e510b..1b012996086c 100644 --- a/crates/swc_ecma_parser/src/lexer/number.rs +++ b/crates/swc_ecma_parser/src/lexer/number.rs @@ -247,7 +247,7 @@ impl Lexer<'_> { self.bump(); - match self.input.cur() { + match self.input.cur()? { Some(..) 
=> { self.bump(); } diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 0167a2640e3e..6fce649adf61 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -266,7 +266,7 @@ impl Lexer<'_> { *start = self.input.cur_pos(); }; - match self.input.cur() { + match self.input.cur()? { Some(..) => {} // End of input. None => { diff --git a/crates/swc_ecma_parser/src/lexer/util.rs b/crates/swc_ecma_parser/src/lexer/util.rs index 5b8711981d4f..3a02fd6a6fde 100644 --- a/crates/swc_ecma_parser/src/lexer/util.rs +++ b/crates/swc_ecma_parser/src/lexer/util.rs @@ -11,10 +11,7 @@ use swc_common::{ use swc_ecma_ast::Ident; use tracing::warn; -use super::{ - comments_buffer::BufferedComment, input::Input, whitespace::SkipWhitespace, Char, LexResult, - Lexer, -}; +use super::{comments_buffer::BufferedComment, whitespace::SkipWhitespace, Char, LexResult, Lexer}; use crate::{ error::{Error, SyntaxError}, lexer::comments_buffer::BufferedCommentKind, From e9ea1cf1fc83a2dc44fd184e6030cac28c2bc0a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:30:21 +0900 Subject: [PATCH 022/201] more lexer work --- crates/swc_ecma_parser/src/lexer/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index d84b80f25a69..c135a631d403 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -822,7 +822,7 @@ impl Lexer<'_> { let s = unsafe { // Safety: start and end are valid position because we got them from // `self.input` - l.input.slice()[slice_start..start] + l.input.slice(slice_start, start) }; buf.push_str(s); unsafe { @@ -880,7 +880,7 @@ impl Lexer<'_> { let s = unsafe { // Safety: slice_start and end are valid position because we got them from // `self.input` - 
l.input.slice()[slice_start..end] + l.input.slice(slice_start, end) }; let s = unsafe { // Safety: We don't use 'static. We just bypass the lifetime check. @@ -892,7 +892,7 @@ impl Lexer<'_> { let s = unsafe { // Safety: slice_start and end are valid position because we got them from // `self.input` - l.input.slice()[slice_start..end] + l.input.slice(slice_start, end) }; buf.push_str(s); From a7ee32279b421da22041bc82c0fa8798c75051a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:31:22 +0900 Subject: [PATCH 023/201] more lexer work --- crates/swc_ecma_parser/src/lexer/jsx.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index fd037560d4ac..6774f3f83b51 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -1,5 +1,6 @@ use either::Either; use smartstring::{LazyCompact, SmartString}; +use swc_ecma_raw_lexer::RawToken; use super::*; @@ -22,7 +23,7 @@ impl Lexer<'_> { let cur_pos = self.input.cur_pos(); match cur { - '<' if self.had_line_break_before_last() && self.is_str("<<<<<< ") => { + RawToken::LtOp if self.had_line_break_before_last() && self.is_str("<<<<<< ") => { let span = Span::new(cur_pos, cur_pos + BytePos(7)); self.emit_error_span(span, SyntaxError::TS1185); @@ -30,10 +31,10 @@ impl Lexer<'_> { self.skip_space::(); return self.read_token(); } - '<' | '{' => { + RawToken::LtOp | RawToken::LBrace => { // if cur_pos == self.state.start { - if cur == '<' && self.state.is_expr_allowed { + if cur == RawToken::LtOp && self.state.is_expr_allowed { unsafe { // Safety: cur() was Some('<') self.input.bump(1); @@ -68,7 +69,7 @@ impl Lexer<'_> { return Ok(Some(Token::JSXText { raw, value })); } - '>' => { + RawToken::GtOp => { self.emit_error( cur_pos, SyntaxError::UnexpectedTokenWithSuggestions { @@ -80,7 +81,7 @@ impl Lexer<'_> { 
self.input.bump(1) } } - '}' => { + RawToken::RBrace => { self.emit_error( cur_pos, SyntaxError::UnexpectedTokenWithSuggestions { @@ -92,7 +93,7 @@ impl Lexer<'_> { self.input.bump(1) } } - '&' => { + RawToken::BitAndOp => { value.push_str(unsafe { // Safety: We already checked for the range self.input.slice(chunk_start, cur_pos) From 74de14a71811c5400d21f9d1124322fe741664be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:31:39 +0900 Subject: [PATCH 024/201] remove unused --- crates/swc_ecma_parser/src/lexer/util.rs | 40 ------------------------ 1 file changed, 40 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/util.rs b/crates/swc_ecma_parser/src/lexer/util.rs index 3a02fd6a6fde..f3db2f3f6c84 100644 --- a/crates/swc_ecma_parser/src/lexer/util.rs +++ b/crates/swc_ecma_parser/src/lexer/util.rs @@ -39,46 +39,6 @@ impl Lexer<'_> { } } - #[inline(always)] - pub(super) fn is(&mut self, c: u8) -> bool { - self.input.is_byte(c) - } - - #[inline(always)] - pub(super) fn is_str(&self, s: &str) -> bool { - self.input.is_str(s) - } - - #[inline(always)] - pub(super) fn eat(&mut self, c: u8) -> bool { - self.input.eat_byte(c) - } - - #[inline(always)] - pub(super) fn cur(&mut self) -> Option { - self.input.cur() - } - - #[inline(always)] - pub(super) fn peek(&mut self) -> Option { - self.input.peek() - } - - #[inline(always)] - pub(super) fn peek_ahead(&mut self) -> Option { - self.input.peek_ahead() - } - - #[inline(always)] - pub(super) fn cur_pos(&mut self) -> BytePos { - self.input.cur_pos() - } - - #[inline(always)] - pub(super) fn last_pos(&self) -> BytePos { - self.input.last_pos() - } - /// Shorthand for `let span = self.span(start); self.error_span(span)` #[cold] #[inline(never)] From 9cb5e4ff18211abec55daf6f62db0e057e10a26e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:33:26 +0900 Subject: [PATCH 025/201] RawBuffer --- 
crates/swc_ecma_raw_lexer/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index c60423659379..d57a4b281d10 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -27,6 +27,14 @@ impl<'a> RawBuffer<'a> { self.lexer.clone().next().transpose() } + pub fn peek(&self) -> Result, LexError> { + self.lexer.clone().nth(1).transpose() + } + + pub fn peek_ahead(&self) -> Result, LexError> { + self.lexer.clone().nth(2).transpose() + } + /// # Safety /// /// - `start` and `end` must be within the bounds of `self.orig_str` From a75fa5463ac3db7628093e39740e6b85369543ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:38:29 +0900 Subject: [PATCH 026/201] eat() --- crates/swc_ecma_raw_lexer/src/lib.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index d57a4b281d10..9d0e65910205 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -52,6 +52,20 @@ impl<'a> RawBuffer<'a> { self.lexer.bump(n); self.pos = self.pos + BytePos(n as u32); } + + pub fn eat(&mut self, token: RawToken) -> Result { + let cur = self.cur()?; + + if cur == Some(token) { + unsafe { + // Safety: cur() was Some(token) + self.bump(1); + } + Ok(true) + } else { + Ok(false) + } + } } impl Iterator for RawBuffer<'_> { @@ -270,6 +284,10 @@ pub enum RawToken { #[regex(r"\P{ID_Start}\P{ID_Continue}*")] Ident, + + #[token("\r", priority = 3)] + #[token("\n", priority = 3)] + NewLine, } #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] From ed5736a38b450c230a4484afa188e1db38963b7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:38:53 +0900 Subject: [PATCH 027/201] more lexer work --- crates/swc_ecma_parser/src/lexer/mod.rs 
| 49 ++----------------------- 1 file changed, 3 insertions(+), 46 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index c135a631d403..11a1d958e900 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -7,7 +7,7 @@ use smallvec::{smallvec, SmallVec}; use swc_atoms::{Atom, AtomStoreCell}; use swc_common::{comments::Comments, input::StringInput, BytePos, Span}; use swc_ecma_ast::{op, AssignOp, EsVersion, Ident}; -use swc_ecma_raw_lexer::RawBuffer; +use swc_ecma_raw_lexer::{RawBuffer, RawToken}; use self::{comments_buffer::CommentsBuffer, state::State, util::*}; pub use self::{ @@ -183,7 +183,7 @@ impl<'a> Lexer<'a> { Some(handler) => handler(self), None => { let start = self.cur_pos(); - self.input.bump_bytes(1); + self.input.bump(1); self.error_span( pos_span(start), SyntaxError::UnexpectedChar { c: byte as _ }, @@ -252,49 +252,6 @@ impl<'a> Lexer<'a> { Ok(tok!('.')) } - /// Read a token given `?`. - /// - /// This is extracted as a method to reduce size of `read_token`. - #[inline(never)] - fn read_token_question_mark(&mut self) -> LexResult { - match self.input.peek() { - Some('?') => { - unsafe { - // Safety: peek() was some - self.input.bump(2); - } - if self.input.cur() == Some('=') { - unsafe { - // Safety: cur() was some - self.input.bump(1); - } - - return Ok(tok!("??=")); - } - Ok(tok!("??")) - } - _ => { - unsafe { - // Safety: peek() is callable only if cur() is Some - self.input.bump(1); - } - Ok(tok!('?')) - } - } - } - - /// Read a token given `:`. - /// - /// This is extracted as a method to reduce size of `read_token`. - #[inline(never)] - fn read_token_colon(&mut self) -> LexResult { - unsafe { - // Safety: cur() is Some(':') - self.input.bump(1); - } - Ok(tok!(':')) - } - /// Read a token given `0`. /// /// This is extracted as a method to reduce size of `read_token`. 
@@ -451,7 +408,7 @@ impl<'a> Lexer<'a> { '\r' => { self.bump(); // remove '\r' - self.eat(b'\n'); + self.input.eat(RawToken::NewLine); return Ok(None); } From 7c89de7590136904da6d7e541f656f1771055241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:38:56 +0900 Subject: [PATCH 028/201] cargo lockfile --- Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.lock b/Cargo.lock index 27232ff4a3ae..aba469578fdb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5340,6 +5340,7 @@ name = "swc_ecma_raw_lexer" version = "0.1.0" dependencies = [ "logos", + "swc_common", ] [[package]] From fd690017b04e9a1e11416d39cfcb25f10a0148bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:39:48 +0900 Subject: [PATCH 029/201] more lexer work --- crates/swc_ecma_parser/src/lexer/mod.rs | 111 ++++++++------------- crates/swc_ecma_parser/src/lexer/number.rs | 60 +++++------ crates/swc_ecma_parser/src/lexer/state.rs | 10 +- crates/swc_ecma_parser/src/lexer/util.rs | 18 ++-- 4 files changed, 85 insertions(+), 114 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 11a1d958e900..77406d3994ba 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -182,7 +182,7 @@ impl<'a> Lexer<'a> { match handler { Some(handler) => handler(self), None => { - let start = self.cur_pos(); + let start = self.input.cur_pos(); self.input.bump(1); self.error_span( pos_span(start), @@ -194,7 +194,7 @@ impl<'a> Lexer<'a> { /// `#` fn read_token_number_sign(&mut self) -> LexResult> { - debug_assert!(self.cur().is_some()); + debug_assert!(self.input.cur().is_some()); unsafe { // Safety: cur() is Some('#') @@ -204,7 +204,7 @@ impl<'a> Lexer<'a> { // `#` can also be a part of shebangs, however they should have been // handled by `read_shebang()` debug_assert!( - !self.input.is_at_start() 
|| self.cur() != Some('!'), + !self.input.is_at_start() || self.input.cur() != Some('!'), "#! should have already been handled by read_shebang()" ); Ok(Some(Token::Hash)) @@ -283,7 +283,7 @@ impl<'a> Lexer<'a> { #[inline(never)] fn read_token_logical(&mut self, c: u8) -> LexResult { let had_line_break_before_last = self.had_line_break_before_last(); - let start = self.cur_pos(); + let start = self.input.cur_pos(); unsafe { // Safety: cur() is Some(c as char) @@ -343,50 +343,17 @@ impl<'a> Lexer<'a> { Ok(Token::BinOp(token)) } - /// Read a token given `*` or `%`. - /// - /// This is extracted as a method to reduce size of `read_token`. - #[inline(never)] - fn read_token_mul_mod(&mut self, c: u8) -> LexResult { - let is_mul = c == b'*'; - unsafe { - // Safety: cur() is Some(c) - self.input.bump(1); - } - let mut token = if is_mul { - Token::BinOp(BinOpToken::Mul) - } else { - Token::BinOp(BinOpToken::Mod) - }; - - // check for ** - if is_mul && self.input.eat_byte(b'*') { - token = Token::BinOp(BinOpToken::Exp) - } - - if self.input.eat_byte(b'=') { - token = match token { - Token::BinOp(BinOpToken::Mul) => Token::AssignOp(AssignOp::MulAssign), - Token::BinOp(BinOpToken::Mod) => Token::AssignOp(AssignOp::ModAssign), - Token::BinOp(BinOpToken::Exp) => Token::AssignOp(AssignOp::ExpAssign), - _ => unreachable!(), - } - } - - Ok(token) - } - /// Read an escaped character for string literal. /// /// In template literal, we should preserve raw string. fn read_escaped_char(&mut self, in_template: bool) -> LexResult>> { - debug_assert_eq!(self.cur(), Some('\\')); + debug_assert_eq!(self.input.cur(), Some('\\')); - let start = self.cur_pos(); + let start = self.input.cur_pos(); self.bump(); // '\' - let c = match self.cur() { + let c = match self.input.cur()? 
{ Some(c) => c, None => self.error_span(pos_span(start), SyntaxError::InvalidStrEscape)?, }; @@ -444,7 +411,7 @@ impl<'a> Lexer<'a> { self.bump(); let first_c = if c == '0' { - match self.cur() { + match self.input.cur() { Some(next) if next.is_digit(8) => c, // \0 is not an octal literal nor decimal literal. _ => return Ok(Some(vec!['\u{0000}'.into()])), @@ -464,7 +431,7 @@ impl<'a> Lexer<'a> { macro_rules! one { ($check:expr) => {{ - let cur = self.cur(); + let cur = self.input.cur(); match cur.and_then(|c| c.to_digit(8)) { Some(v) => { @@ -504,7 +471,7 @@ impl<'a> Lexer<'a> { } fn read_token_plus_minus(&mut self, c: u8) -> LexResult> { - let start = self.cur_pos(); + let start = self.input.cur_pos(); unsafe { // Safety: cur() is Some(c), if this method is called. @@ -547,7 +514,7 @@ impl<'a> Lexer<'a> { } fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { - let start = self.cur_pos(); + let start = self.input.cur_pos(); let had_line_break_before_last = self.had_line_break_before_last(); unsafe { @@ -593,7 +560,7 @@ impl<'a> Lexer<'a> { impl Lexer<'_> { #[inline(never)] fn read_slash(&mut self) -> LexResult> { - debug_assert_eq!(self.cur(), Some('/')); + debug_assert_eq!(self.input.cur(), Some('/')); // Divide operator self.bump(); @@ -607,11 +574,11 @@ impl Lexer<'_> { #[inline(never)] fn read_token_lt_gt(&mut self) -> LexResult> { - debug_assert!(self.cur() == Some('<') || self.cur() == Some('>')); + debug_assert!(self.input.cur() == Some('<') || self.input.cur() == Some('>')); let had_line_break_before_last = self.had_line_break_before_last(); - let start = self.cur_pos(); - let c = self.cur().unwrap(); + let start = self.input.cur_pos(); + let c = self.input.cur().unwrap(); self.bump(); if self.syntax.typescript() && self.ctx.in_type && !self.ctx.should_not_lex_lt_or_gt_as_type @@ -639,7 +606,7 @@ impl Lexer<'_> { }; // '<<', '>>' - if self.cur() == Some(c) { + if self.input.cur() == Some(c) { self.bump(); op = if c == '<' { BinOpToken::LShift @@ 
-648,7 +615,7 @@ impl Lexer<'_> { }; //'>>>' - if c == '>' && self.cur() == Some(c) { + if c == '>' && self.input.cur() == Some(c) { self.bump(); op = BinOpToken::ZeroFillRShift; } @@ -692,7 +659,7 @@ impl Lexer<'_> { /// This can be used if there's no keyword starting with the first /// character. fn read_ident_unknown(&mut self) -> LexResult { - debug_assert!(self.cur().is_some()); + debug_assert!(self.input.cur().is_some()); let (word, _) = self .read_word_as_str_with(|l, s, _, _| Word::Ident(IdentLike::Other(l.atoms.atom(s))))?; @@ -706,9 +673,9 @@ impl Lexer<'_> { &mut self, convert: &dyn Fn(&str) -> Option, ) -> LexResult> { - debug_assert!(self.cur().is_some()); + debug_assert!(self.input.cur().is_some()); - let start = self.cur_pos(); + let start = self.input.cur_pos(); let (word, has_escape) = self.read_word_as_str_with(|l, s, _, can_be_known| { if can_be_known { if let Some(word) = convert(s) { @@ -740,10 +707,10 @@ impl Lexer<'_> { where F: for<'any> FnOnce(&'any mut Lexer<'_>, &str, bool, bool) -> Ret, { - debug_assert!(self.cur().is_some()); + debug_assert!(self.input.cur().is_some()); let mut first = true; let mut can_be_keyword = true; - let mut slice_start = self.cur_pos(); + let mut slice_start = self.input.cur_pos(); let mut has_escape = false; self.with_buf(|l, buf| { @@ -861,7 +828,7 @@ impl Lexer<'_> { } fn read_unicode_escape(&mut self) -> LexResult> { - debug_assert_eq!(self.cur(), Some('u')); + debug_assert_eq!(self.input.cur(), Some('u')); let mut chars = Vec::new(); let mut is_curly = false; @@ -878,7 +845,7 @@ impl Lexer<'_> { if 0x0010_ffff >= val { char::from_u32(val) } else { - let start = self.cur_pos(); + let start = self.input.cur_pos(); self.error( start, @@ -893,7 +860,7 @@ impl Lexer<'_> { } } _ => { - let start = self.cur_pos(); + let start = self.input.cur_pos(); self.error( start, @@ -960,9 +927,9 @@ impl Lexer<'_> { /// See https://tc39.github.io/ecma262/#sec-literals-string-literals fn read_str_lit(&mut self) -> LexResult { 
- debug_assert!(self.cur() == Some('\'') || self.cur() == Some('"')); - let start = self.cur_pos(); - let quote = self.cur().unwrap() as u8; + debug_assert!(self.input.cur() == Some('\'') || self.input.cur() == Some('"')); + let start = self.input.cur_pos(); + let quote = self.input.cur().unwrap() as u8; self.bump(); // '"' @@ -1092,9 +1059,9 @@ impl Lexer<'_> { self.input.reset_to(start); } - debug_assert_eq!(self.cur(), Some('/')); + debug_assert_eq!(self.input.cur(), Some('/')); - let start = self.cur_pos(); + let start = self.input.cur_pos(); self.bump(); @@ -1145,9 +1112,9 @@ impl Lexer<'_> { // Need to use `read_word` because '\uXXXX' sequences are allowed // here (don't ask). - // let flags_start = self.cur_pos(); + // let flags_start = self.input.cur_pos(); let flags = { - match self.cur() { + match self.input.cur() { Some(c) if c.is_ident_start() => self .read_word_as_str_with(|l, s, _, _| l.atoms.atom(s)) .map(Some), @@ -1174,7 +1141,7 @@ impl Lexer<'_> { } fn read_tmpl_token(&mut self, start_of_tpl: BytePos) -> LexResult { - let start = self.cur_pos(); + let start = self.input.cur_pos(); let mut cooked = Ok(String::new()); let mut cooked_slice_start = start; @@ -1183,7 +1150,7 @@ impl Lexer<'_> { macro_rules! 
consume_cooked { () => {{ if let Ok(cooked) = &mut cooked { - let last_pos = self.cur_pos(); + let last_pos = self.input.cur_pos(); cooked.push_str(unsafe { // Safety: Both of start and last_pos are valid position because we got them // from `self.input` @@ -1193,9 +1160,9 @@ impl Lexer<'_> { }}; } - while let Some(c) = self.cur() { + while let Some(c) = self.input.cur() { if c == '`' || (c == '$' && self.peek() == Some('{')) { - if start == self.cur_pos() && self.state.last_was_tpl_element() { + if start == self.input.cur_pos() && self.state.last_was_tpl_element() { if c == '$' { self.bump(); self.bump(); @@ -1208,7 +1175,7 @@ impl Lexer<'_> { // If we don't have any escape let cooked = if cooked_slice_start == raw_slice_start { - let last_pos = self.cur_pos(); + let last_pos = self.input.cur_pos(); let s = unsafe { // Safety: Both of start and last_pos are valid position because we got them // from `self.input` @@ -1252,7 +1219,7 @@ impl Lexer<'_> { } } - cooked_slice_start = self.cur_pos(); + cooked_slice_start = self.input.cur_pos(); } else if c.is_line_terminator() { self.state.had_line_break = true; @@ -1276,7 +1243,7 @@ impl Lexer<'_> { if let Ok(ref mut cooked) = cooked { cooked.push(c); } - cooked_slice_start = self.cur_pos(); + cooked_slice_start = self.input.cur_pos(); } else { self.bump(); } diff --git a/crates/swc_ecma_parser/src/lexer/number.rs b/crates/swc_ecma_parser/src/lexer/number.rs index 1b012996086c..9bf786083f81 100644 --- a/crates/swc_ecma_parser/src/lexer/number.rs +++ b/crates/swc_ecma_parser/src/lexer/number.rs @@ -33,29 +33,29 @@ impl Lexer<'_> { &mut self, starts_with_dot: bool, ) -> LexResult, Atom)>> { - debug_assert!(self.cur().is_some()); + debug_assert!(self.input.cur().is_some()); if starts_with_dot { debug_assert_eq!( - self.cur(), + self.input.cur(), Some('.'), "read_number(starts_with_dot = true) expects current char to be '.'" ); } - let start = self.cur_pos(); + let start = self.input.cur_pos(); let val = if starts_with_dot { 
// first char is '.' 0f64 } else { - let starts_with_zero = self.cur().unwrap() == '0'; + let starts_with_zero = self.input.cur().unwrap() == '0'; // Use read_number_no_dot to support long numbers. let (val, s, not_octal) = self.read_number_no_dot_as_str::<10>()?; if self.eat(b'n') { - let end = self.cur_pos(); + let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` self.input.slice(start, end) @@ -80,7 +80,7 @@ impl Lexer<'_> { if start.0 != self.last_pos().0 - 1 { // `-1` is utf 8 length of `0` - let end = self.cur_pos(); + let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` self.input.slice(start, end) @@ -115,7 +115,7 @@ impl Lexer<'_> { panic!("failed to parse {} into float using BigInt", val_str) }); - let end = self.cur_pos(); + let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` self.input.slice(start, end) @@ -140,19 +140,19 @@ impl Lexer<'_> { // `0.a`, `08.a`, `102.a` are invalid. 
// // `.1.a`, `.1e-4.a` are valid, - if self.cur() == Some('.') { + if self.input.cur() == Some('.') { self.bump(); if starts_with_dot { - debug_assert!(self.cur().is_some()); - debug_assert!(self.cur().unwrap().is_ascii_digit()); + debug_assert!(self.input.cur().is_some()); + debug_assert!(self.input.cur().unwrap().is_ascii_digit()); } // Read numbers after dot self.read_int::<10>(0)?; val = { - let end = self.cur_pos(); + let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` self.input.slice(start, end) @@ -175,14 +175,14 @@ impl Lexer<'_> { // 1e2 = 100 // 1e+2 = 100 // 1e-2 = 0.01 - match self.cur() { + match self.input.cur() { Some('e') | Some('E') => { self.bump(); - let next = match self.cur() { + let next = match self.input.cur() { Some(next) => next, None => { - let pos = self.cur_pos(); + let pos = self.input.cur_pos(); self.error(pos, SyntaxError::NumLitTerminatedWithExp)? } }; @@ -204,7 +204,7 @@ impl Lexer<'_> { 0.0 } } else { - let end = self.cur_pos(); + let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` self.input.slice(start, end) @@ -224,7 +224,7 @@ impl Lexer<'_> { self.ensure_not_ident()?; - let end = self.cur_pos(); + let end = self.input.cur_pos(); let raw_str = unsafe { // Safety: We got both start and end position from `self.input` self.input.slice(start, end) @@ -241,9 +241,9 @@ impl Lexer<'_> { "radix should be one of 2, 8, 16, but got {}", RADIX ); - debug_assert_eq!(self.cur(), Some('0')); + debug_assert_eq!(self.input.cur(), Some('0')); - let start = self.cur_pos(); + let start = self.input.cur_pos(); self.bump(); @@ -259,7 +259,7 @@ impl Lexer<'_> { let (val, s, _) = self.read_number_no_dot_as_str::()?; if self.eat(b'n') { - let end = self.cur_pos(); + let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` self.input.slice(start, end) @@ -273,7 +273,7 @@ 
impl Lexer<'_> { self.ensure_not_ident()?; - let end = self.cur_pos(); + let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` self.input.slice(start, end) @@ -290,7 +290,7 @@ impl Lexer<'_> { "radix for read_number_no_dot should be one of 2, 8, 10, 16, but got {}", RADIX ); - let start = self.cur_pos(); + let start = self.input.cur_pos(); let mut read_any = false; @@ -321,7 +321,7 @@ impl Lexer<'_> { "radix for read_number_no_dot should be one of 2, 8, 10, 16, but got {}", RADIX ); - let start = self.cur_pos(); + let start = self.input.cur_pos(); let mut non_octal = false; let mut read_any = false; @@ -343,7 +343,7 @@ impl Lexer<'_> { self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?; } - let end = self.cur_pos(); + let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` self.input.slice(start, end) @@ -359,9 +359,9 @@ impl Lexer<'_> { /// Ensure that ident cannot directly follow numbers. fn ensure_not_ident(&mut self) -> LexResult<()> { - match self.cur() { + match self.input.cur() { Some(c) if c.is_ident_start() => { - let span = pos_span(self.cur_pos()); + let span = pos_span(self.input.cur_pos()); self.error_span(span, SyntaxError::IdentAfterNum)? 
} _ => Ok(()), @@ -435,14 +435,18 @@ impl Lexer<'_> { ); if cfg!(feature = "debug") { - trace!("read_digits(radix = {}), cur = {:?}", RADIX, self.cur()); + trace!( + "read_digits(radix = {}), cur = {:?}", + RADIX, + self.input.cur() + ); } - let start = self.cur_pos(); + let start = self.input.cur_pos(); let mut total: Ret = Default::default(); let mut prev = None; - while let Some(c) = self.cur() { + while let Some(c) = self.input.cur() { if allow_num_separator && c == '_' { let is_allowed = |c: Option| { if c.is_none() { diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 6fce649adf61..4453a65e0568 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -290,7 +290,7 @@ impl Lexer<'_> { return self.read_jsx_token(); } - let c = self.cur(); + let c = self.input.cur(); if let Some(c) = c { if self.state.context.current() == Some(TokenContext::JSXOpeningTag) || self.state.context.current() == Some(TokenContext::JSXClosingTag) @@ -350,7 +350,7 @@ impl Iterator for Lexer<'_> { type Item = TokenAndSpan; fn next(&mut self) -> Option { - let mut start = self.cur_pos(); + let mut start = self.input.cur_pos(); let res = self.next_token(&mut start); @@ -676,7 +676,7 @@ impl TokenContexts { is_expr_allowed: bool, ) -> bool { if let Some(TokenType::Colon) = prev { - match self.current() { + match self.input.current() { Some(TokenContext::BraceStmt) => return true, // `{ a: {} }` // ^ ^ @@ -711,14 +711,14 @@ impl TokenContexts { Some(TokenType::LBrace) => { // https://github.com/swc-project/swc/issues/3241#issuecomment-1029584460 // - if self.current() == Some(TokenContext::BraceExpr) { + if self.input.current() == Some(TokenContext::BraceExpr) { let len = self.len(); if let Some(TokenContext::JSXOpeningTag) = self.0.get(len - 2) { return true; } } - return self.current() == Some(TokenContext::BraceStmt); + return self.input.current() == Some(TokenContext::BraceStmt); } // 
`class C { ... }` diff --git a/crates/swc_ecma_parser/src/lexer/util.rs b/crates/swc_ecma_parser/src/lexer/util.rs index f3db2f3f6c84..3b34abbec9a1 100644 --- a/crates/swc_ecma_parser/src/lexer/util.rs +++ b/crates/swc_ecma_parser/src/lexer/util.rs @@ -148,9 +148,9 @@ impl Lexer<'_> { #[inline(never)] pub(super) fn skip_line_comment(&mut self, start_skip: usize) { - let start = self.cur_pos(); + let start = self.input.cur_pos(); self.input.bump_bytes(start_skip); - let slice_start = self.cur_pos(); + let slice_start = self.input.cur_pos(); // foo // comment for foo // bar @@ -171,7 +171,7 @@ impl Lexer<'_> { }); self.input.bump_bytes(idx); - let end = self.cur_pos(); + let end = self.input.cur_pos(); if let Some(comments) = self.comments_buffer.as_mut() { let s = unsafe { @@ -204,15 +204,15 @@ impl Lexer<'_> { /// Expects current char to be '/' and next char to be '*'. #[inline(never)] pub(super) fn skip_block_comment(&mut self) { - let start = self.cur_pos(); + let start = self.input.cur_pos(); - debug_assert_eq!(self.cur(), Some('/')); + debug_assert_eq!(self.input.cur(), Some('/')); debug_assert_eq!(self.peek(), Some('*')); self.input.bump_bytes(2); // jsdoc - let slice_start = self.cur_pos(); + let slice_start = self.input.cur_pos(); let mut was_star = if self.input.is_byte(b'*') { self.bump(); true @@ -222,12 +222,12 @@ impl Lexer<'_> { let mut is_for_next = self.state.had_line_break || !self.state.can_have_trailing_comment(); - while let Some(c) = self.cur() { + while let Some(c) = self.input.cur() { if was_star && c == '/' { - debug_assert_eq!(self.cur(), Some('/')); + debug_assert_eq!(self.input.cur(), Some('/')); self.bump(); // '/' - let end = self.cur_pos(); + let end = self.input.cur_pos(); self.skip_space::(); From 95b5b10cd4c259e4901cd9b811a76dd8a2ccb128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:41:21 +0900 Subject: [PATCH 030/201] more lexer work --- 
crates/swc_ecma_parser/src/lexer/jsx.rs | 2 +- crates/swc_ecma_parser/src/lexer/mod.rs | 24 +++------------------- crates/swc_ecma_parser/src/lexer/number.rs | 4 ++-- 3 files changed, 6 insertions(+), 24 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index 6774f3f83b51..57647a814592 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -150,7 +150,7 @@ impl Lexer<'_> { let mut s = SmartString::::default(); - let c = self.input.cur(); + let c = self.input.cur()?; debug_assert_eq!(c, Some('&')); unsafe { // Safety: cur() was Some('&') diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 77406d3994ba..49a96a049d60 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -192,24 +192,6 @@ impl<'a> Lexer<'a> { } } - /// `#` - fn read_token_number_sign(&mut self) -> LexResult> { - debug_assert!(self.input.cur().is_some()); - - unsafe { - // Safety: cur() is Some('#') - self.input.bump(1); // '#' - } - - // `#` can also be a part of shebangs, however they should have been - // handled by `read_shebang()` - debug_assert!( - !self.input.is_at_start() || self.input.cur() != Some('!'), - "#! should have already been handled by read_shebang()" - ); - Ok(Some(Token::Hash)) - } - /// Read a token given `.`. /// /// This is extracted as a method to reduce size of `read_token`. @@ -659,7 +641,7 @@ impl Lexer<'_> { /// This can be used if there's no keyword starting with the first /// character. 
fn read_ident_unknown(&mut self) -> LexResult { - debug_assert!(self.input.cur().is_some()); + debug_assert!(self.input.cur()?.is_some()); let (word, _) = self .read_word_as_str_with(|l, s, _, _| Word::Ident(IdentLike::Other(l.atoms.atom(s))))?; @@ -673,7 +655,7 @@ impl Lexer<'_> { &mut self, convert: &dyn Fn(&str) -> Option, ) -> LexResult> { - debug_assert!(self.input.cur().is_some()); + debug_assert!(self.input.cur()?.is_some()); let start = self.input.cur_pos(); let (word, has_escape) = self.read_word_as_str_with(|l, s, _, can_be_known| { @@ -707,7 +689,7 @@ impl Lexer<'_> { where F: for<'any> FnOnce(&'any mut Lexer<'_>, &str, bool, bool) -> Ret, { - debug_assert!(self.input.cur().is_some()); + debug_assert!(self.input.cur()?.is_some()); let mut first = true; let mut can_be_keyword = true; let mut slice_start = self.input.cur_pos(); diff --git a/crates/swc_ecma_parser/src/lexer/number.rs b/crates/swc_ecma_parser/src/lexer/number.rs index 9bf786083f81..ce75887fd318 100644 --- a/crates/swc_ecma_parser/src/lexer/number.rs +++ b/crates/swc_ecma_parser/src/lexer/number.rs @@ -33,7 +33,7 @@ impl Lexer<'_> { &mut self, starts_with_dot: bool, ) -> LexResult, Atom)>> { - debug_assert!(self.input.cur().is_some()); + debug_assert!(self.input.cur()?.is_some()); if starts_with_dot { debug_assert_eq!( @@ -144,7 +144,7 @@ impl Lexer<'_> { self.bump(); if starts_with_dot { - debug_assert!(self.input.cur().is_some()); + debug_assert!(self.input.cur()?.is_some()); debug_assert!(self.input.cur().unwrap().is_ascii_digit()); } From 650f1e08e05bb1a874cb4847a73d6703b4653270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:41:41 +0900 Subject: [PATCH 031/201] input --- crates/swc_ecma_parser/src/lexer/mod.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 49a96a049d60..8de07699f441 100644 --- 
a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -716,7 +716,7 @@ impl Lexer<'_> { if c == b'\\' { first = false; has_escape = true; - let start = l.cur_pos(); + let start = l.input.cur_pos(); l.bump(); if !l.is(b'u') { @@ -755,7 +755,7 @@ impl Lexer<'_> { buf.extend(c); } - slice_start = l.cur_pos(); + slice_start = l.input.cur_pos(); continue; } @@ -778,7 +778,7 @@ impl Lexer<'_> { break; } - let end = l.cur_pos(); + let end = l.input.cur_pos(); let value = if !has_escape { // Fast path: raw slice is enough if there's no escape. @@ -922,7 +922,7 @@ impl Lexer<'_> { loop { if let Some(c) = l.input.cur_as_ascii() { if c == quote { - let value_end = l.cur_pos(); + let value_end = l.input.cur_pos(); let value = if !has_escape { let s = unsafe { @@ -948,7 +948,7 @@ impl Lexer<'_> { l.input.bump(); } - let end = l.cur_pos(); + let end = l.input.cur_pos(); let raw = unsafe { // Safety: start and end are valid position because we got them from @@ -964,7 +964,7 @@ impl Lexer<'_> { has_escape = true; { - let end = l.cur_pos(); + let end = l.input.cur_pos(); let s = unsafe { // Safety: start and end are valid position because we got them from // `self.input` @@ -979,7 +979,7 @@ impl Lexer<'_> { } } - slice_start = l.cur_pos(); + slice_start = l.input.cur_pos(); continue; } @@ -1009,7 +1009,7 @@ impl Lexer<'_> { } { - let end = l.cur_pos(); + let end = l.input.cur_pos(); let s = unsafe { // Safety: start and end are valid position because we got them from // `self.input` @@ -1020,7 +1020,7 @@ impl Lexer<'_> { l.emit_error(start, SyntaxError::UnterminatedStrLit); - let end = l.cur_pos(); + let end = l.input.cur_pos(); let raw = unsafe { // Safety: start and end are valid position because we got them from @@ -1050,7 +1050,7 @@ impl Lexer<'_> { let (mut escaped, mut in_class) = (false, false); let content = self.with_buf(|l, buf| { - while let Some(c) = l.cur() { + while let Some(c) = l.input.cur() { // This is ported from babel. 
// Seems like regexp literal cannot contain linebreak. if c.is_line_terminator() { From 7c6dd0685877e87608135b58b0c81c8571f5567e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:43:20 +0900 Subject: [PATCH 032/201] input --- crates/swc_ecma_parser/src/lexer/mod.rs | 10 +++++----- crates/swc_ecma_parser/src/lexer/number.rs | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 8de07699f441..91bac285ffed 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -468,7 +468,7 @@ impl<'a> Lexer<'a> { } // Handle --> - if self.state.had_line_break && c == b'-' && self.eat(b'>') { + if self.state.had_line_break && c == b'-' && self.input.eat(b'>') { self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); self.skip_line_comment(0); self.skip_space::(); @@ -547,7 +547,7 @@ impl Lexer<'_> { // Divide operator self.bump(); - Ok(Some(if self.eat(b'=') { + Ok(Some(if self.input.eat(b'=') { tok!("/=") } else { tok!('/') @@ -603,7 +603,7 @@ impl Lexer<'_> { } } - let token = if self.eat(b'=') { + let token = if self.input.eat(b'=') { match op { BinOpToken::Lt => Token::BinOp(BinOpToken::LtEq), BinOpToken::Gt => Token::BinOp(BinOpToken::GtEq), @@ -817,7 +817,7 @@ impl Lexer<'_> { self.bump(); // 'u' - if self.eat(b'{') { + if self.input.eat(b'{') { is_curly = true; } @@ -900,7 +900,7 @@ impl Lexer<'_> { } } - if is_curly && !self.eat(b'}') { + if is_curly && !self.input.eat(b'}') { self.error(state, SyntaxError::InvalidUnicodeEscape)? } diff --git a/crates/swc_ecma_parser/src/lexer/number.rs b/crates/swc_ecma_parser/src/lexer/number.rs index ce75887fd318..1745eb75b5ea 100644 --- a/crates/swc_ecma_parser/src/lexer/number.rs +++ b/crates/swc_ecma_parser/src/lexer/number.rs @@ -54,7 +54,7 @@ impl Lexer<'_> { // Use read_number_no_dot to support long numbers. 
let (val, s, not_octal) = self.read_number_no_dot_as_str::<10>()?; - if self.eat(b'n') { + if self.input.eat(b'n') { let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` @@ -258,7 +258,7 @@ impl Lexer<'_> { let (val, s, _) = self.read_number_no_dot_as_str::()?; - if self.eat(b'n') { + if self.input.eat(b'n') { let end = self.input.cur_pos(); let raw = unsafe { // Safety: We got both start and end position from `self.input` From 9758b9b77f4b807771118b7cfd94c0b9645d908b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:44:15 +0900 Subject: [PATCH 033/201] bump(1) --- crates/swc_ecma_parser/src/lexer/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 91bac285ffed..3c7e43cf6286 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -945,7 +945,7 @@ impl Lexer<'_> { unsafe { // Safety: cur is quote - l.input.bump(); + l.input.bump(1); } let end = l.input.cur_pos(); @@ -989,7 +989,7 @@ impl Lexer<'_> { unsafe { // Safety: cur is a ascii character - l.input.bump(); + l.input.bump(1); } continue; } @@ -1001,7 +1001,7 @@ impl Lexer<'_> { } unsafe { // Safety: cur is Some(c) - l.input.bump(); + l.input.bump(1); } } None => break, From f9cb822e61863a2c264a7cc5b56011e07ea30879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:45:27 +0900 Subject: [PATCH 034/201] more lexer work --- crates/swc_ecma_parser/src/lexer/mod.rs | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 3c7e43cf6286..b57eaf5cf24a 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -393,7 +393,7 @@ 
impl<'a> Lexer<'a> { self.bump(); let first_c = if c == '0' { - match self.input.cur() { + match self.input.cur()? { Some(next) if next.is_digit(8) => c, // \0 is not an octal literal nor decimal literal. _ => return Ok(Some(vec!['\u{0000}'.into()])), @@ -540,20 +540,6 @@ impl<'a> Lexer<'a> { } impl Lexer<'_> { - #[inline(never)] - fn read_slash(&mut self) -> LexResult> { - debug_assert_eq!(self.input.cur(), Some('/')); - - // Divide operator - self.bump(); - - Ok(Some(if self.input.eat(b'=') { - tok!("/=") - } else { - tok!('/') - })) - } - #[inline(never)] fn read_token_lt_gt(&mut self) -> LexResult> { debug_assert!(self.input.cur() == Some('<') || self.input.cur() == Some('>')); From 1c9e44f5de21497d91a8c263a6bc3912563e013a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 06:46:14 +0900 Subject: [PATCH 035/201] more lexer work --- crates/swc_ecma_parser/src/lexer/mod.rs | 10 +++++++--- crates/swc_ecma_parser/src/lexer/util.rs | 12 +++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index b57eaf5cf24a..b487673c480f 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -559,7 +559,11 @@ impl Lexer<'_> { } // XML style comment. 
`", priority = 3)] + LegacyCommentClose, } #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] From 6bd5942b4bbfc7c37eb8f41347e04db478b3a1d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 07:15:34 +0900 Subject: [PATCH 042/201] more lexer work --- crates/swc_ecma_parser/src/lexer/mod.rs | 14 +------------- crates/swc_ecma_raw_lexer/src/lib.rs | 6 ++++++ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 39ecb1dc022e..ddbf4fb45a84 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -191,18 +191,6 @@ impl<'a> Lexer<'a> { _ => {} } - - match handler { - Some(handler) => handler(self), - None => { - let start = self.input.cur_pos(); - self.input.bump(1); - self.error_span( - pos_span(start), - SyntaxError::UnexpectedChar { c: byte as _ }, - ) - } - } } /// Read a token given `.`. @@ -1126,7 +1114,7 @@ impl Lexer<'_> { cooked.push_str(unsafe { // Safety: Both of start and last_pos are valid position because we got them // from `self.input` - self.input.slice()[cooked_slice_start..last_pos] + self.input.slice(cooked_slice_start, last_pos) }); } }}; diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 82cdc51bbaaa..d7db876a7c2f 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -295,6 +295,12 @@ pub enum RawToken { #[token("-->", priority = 3)] LegacyCommentClose, + + #[token("<<<<<", priority = 3)] + LConflictMarker, + + #[token(">>>>>", priority = 3)] + RConflictMarker, } #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] From b4728b6728813fa0a3de7377e55ccf95ed59e26a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 07:15:45 +0900 Subject: [PATCH 043/201] more lexer work --- crates/swc_ecma_parser/src/lexer/mod.rs | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index ddbf4fb45a84..65120e014714 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -1139,7 +1139,7 @@ impl Lexer<'_> { let s = unsafe { // Safety: Both of start and last_pos are valid position because we got them // from `self.input` - self.input.slice()[cooked_slice_start..last_pos] + self.input.slice(cooked_slice_start, last_pos) }; Ok(self.atoms.atom(s)) @@ -1154,7 +1154,7 @@ impl Lexer<'_> { let raw = unsafe { // Safety: Both of start and last_pos are valid position because we got them // from `self.input` - self.input.slice()[raw_slice_start..end] + self.input.slice(raw_slice_start, end) }; return Ok(Token::Template { cooked, From 5926c0acab53f4de82d445f7a0c5ccf63e2817b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 07:17:48 +0900 Subject: [PATCH 044/201] more lexer work --- crates/swc_ecma_parser/src/lexer/mod.rs | 94 +++++-------------------- 1 file changed, 17 insertions(+), 77 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 65120e014714..5de0f990afe6 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -172,6 +172,7 @@ impl<'a> Lexer<'a> { /// babel: `getTokenFromCode` fn read_token(&mut self) -> LexResult> { + let start = self.input.cur_pos(); let cur = match self.input.cur()? { Some(cur) => cur, None => return Ok(None), @@ -179,8 +180,6 @@ impl<'a> Lexer<'a> { match cur { RawToken::LegacyCommentOpen => { - let start = self.input.cur_pos(); - // XML style comment. 
`", priority = 3)] + #[token("-->")] LegacyCommentClose, - #[token("<<<<<", priority = 3)] + #[token("<<<<<")] LConflictMarker, - #[token(">>>>>", priority = 3)] + #[token(">>>>>")] RConflictMarker, } From 5afb9bfaff0b3e1659890479e30d6344f74e065b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 09:37:27 +0900 Subject: [PATCH 058/201] more keywords --- crates/swc_ecma_raw_lexer/src/lib.rs | 239 ++++++++++++++++++++++++++- 1 file changed, 233 insertions(+), 6 deletions(-) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 2b1ec03f8388..5127a8016414 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -261,12 +261,6 @@ pub enum RawToken { #[token("&")] BitAndOp, - #[token("in")] - In, - - #[token("instanceof")] - InstanceOf, - #[token("**")] ExpOp, @@ -354,6 +348,239 @@ pub enum RawToken { #[token(">>>>>")] RConflictMarker, + + #[token("await")] + Await, + + #[token("break")] + Break, + + #[token("case")] + Case, + + #[token("catch")] + Catch, + + #[token("continue")] + Continue, + + #[token("debugger")] + Debugger, + + #[token("default")] + Default_, + + #[token("do")] + Do, + + #[token("else")] + Else, + + #[token("finally")] + Finally, + + #[token("for")] + For, + + #[token("function")] + Function, + + #[token("if")] + If, + + #[token("return")] + Return, + + #[token("switch")] + Switch, + + #[token("throw")] + Throw, + + #[token("try")] + Try, + + #[token("var")] + Var, + + #[token("let")] + Let, + + #[token("const")] + Const, + + #[token("while")] + While, + + #[token("with")] + With, + + #[token("new")] + New, + + #[token("this")] + This, + + #[token("super")] + Super, + + #[token("class")] + Class, + + #[token("extends")] + Extends, + + #[token("export")] + Export, + + #[token("import")] + Import, + + #[token("yield")] + Yield, + + #[token("in")] + In, + + #[token("instanceof")] + InstanceOf, + + #[token("typeof")] + 
TypeOf, + + #[token("void")] + Void, + + #[token("delete")] + Delete, + + #[token("abstract")] + Abstract, + + #[token("as")] + As, + + #[token("async")] + Async, + + #[token("from")] + From, + + #[token("of")] + Of, + + #[token("type")] + Type, + + #[token("global")] + Global, + + #[token("static")] + Static, + + #[token("using")] + Using, + + #[token("readonly")] + Readonly, + + #[token("unique")] + Unique, + + #[token("keyof")] + Keyof, + + #[token("declare")] + Declare, + + #[token("enum")] + Enum, + + #[token("is")] + Is, + + #[token("infer")] + Infer, + + Symbol, + + #[token("undefined")] + Undefined, + + #[token("interface")] + Interface, + + #[token("implements")] + Implements, + + #[token("asserts")] + Asserts, + + #[token("require")] + Require, + + #[token("get")] + Get, + + #[token("set")] + Set, + + #[token("any")] + Any, + + #[token("intrinsic")] + Intrinsic, + + #[token("unknown")] + Unknown, + + #[token("string")] + String, + + #[token("object")] + Object, + + #[token("number")] + Number, + + #[token("bigint")] + Bigint, + + #[token("boolean")] + Boolean, + + #[token("never")] + Never, + + #[token("assert")] + Assert, + + #[token("namespace")] + Namespace, + + #[token("accessor")] + Accessor, + + #[token("meta")] + Meta, + + #[token("target")] + Target, + + #[token("satisfies")] + Satisfies, + + #[token("package")] + Package, + + #[token("protected")] + Protected, + + #[token("private")] + Private, + + #[token("public")] + Public, } fn newline_callback(l: &mut Lexer) -> Skip { From b3ba90931c8f6a7d0209edc8d2bc355af9dc456f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 09:45:30 +0900 Subject: [PATCH 059/201] WIP --- crates/swc_ecma_parser/src/lexer/jsx.rs | 761 ++++---- crates/swc_ecma_parser/src/lexer/mod.rs | 1953 +++++++++++--------- crates/swc_ecma_parser/src/lexer/number.rs | 1660 +++++++++-------- crates/swc_ecma_parser/src/lexer/state.rs | 191 +- 
crates/swc_ecma_parser/src/lexer/util.rs | 358 ++-- crates/swc_ecma_parser/src/lib.rs | 4 +- 6 files changed, 2598 insertions(+), 2329 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index 69e9b486992f..0a07013bad0e 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -5,384 +5,389 @@ use swc_ecma_raw_lexer::RawToken; use super::*; impl Lexer<'_> { - pub(super) fn read_jsx_token(&mut self) -> LexResult> { - debug_assert!(self.syntax.jsx()); - - let start = self.input.cur_pos(); - let mut chunk_start = self.input.cur_pos(); - let mut value = String::new(); - - loop { - let cur = match self.input.cur()? { - Some(c) => c, - None => { - let start = self.state.start; - self.error(start, SyntaxError::UnterminatedJSXContents)? - } - }; - let cur_pos = self.input.cur_pos(); - - match cur { - RawToken::LtOp - if self.had_line_break_before_last() - && self.input.peek()? == Some(RawToken::LConflictMarker) => - { - let span = Span::new(cur_pos, cur_pos + BytePos(7)); - - self.emit_error_span(span, SyntaxError::TS1185); - self.skip_line_comment(6); - self.skip_space::(); - return self.read_token(); - } - RawToken::LtOp | RawToken::LBrace => { - // - if cur_pos == self.state.start { - if cur == RawToken::LtOp && self.state.is_expr_allowed { - unsafe { - // Safety: cur() was Some('<') - self.input.bump(1); - } - return Ok(Some(Token::JSXTagStart)); - } - return self.read_token(); - } - - let value = if value.is_empty() { - // Fast path: We don't need to allocate extra buffer for value - let s = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - self.atoms.atom(s) - } else { - value.push_str(unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }); - self.atoms.atom(value) - }; - - let raw = { - let s = unsafe { - // Safety: We already checked for the range - 
self.input.slice(start, cur_pos) - }; - self.atoms.atom(s) - }; - - return Ok(Some(Token::JSXText { raw, value })); - } - RawToken::GtOp => { - self.emit_error( - cur_pos, - SyntaxError::UnexpectedTokenWithSuggestions { - candidate_list: vec!["`{'>'}`", "`>`"], - }, - ); - unsafe { - // Safety: cur() was Some('>') - self.input.bump(1) - } - } - RawToken::RBrace => { - self.emit_error( - cur_pos, - SyntaxError::UnexpectedTokenWithSuggestions { - candidate_list: vec!["`{'}'}`", "`}`"], - }, - ); - unsafe { - // Safety: cur() was Some('}') - self.input.bump(1) - } - } - RawToken::BitAndOp => { - value.push_str(unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }); - - let jsx_entity = self.read_jsx_entity()?; - - value.push(jsx_entity.0); - chunk_start = self.input.cur_pos(); - } - - _ => { - if cur.is_line_terminator() { - value.push_str(unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }); - match self.read_jsx_new_line(true)? 
{ - Either::Left(s) => value.push_str(s), - Either::Right(c) => value.push(c), - } - chunk_start = self.input.cur_pos(); - } else { - unsafe { - // Safety: cur() was Some(c) - self.input.bump(1) - } - } - } - } - } - } - - pub(super) fn read_jsx_entity(&mut self) -> LexResult<(char, String)> { - debug_assert!(self.syntax.jsx()); - - fn from_code(s: &str, radix: u32) -> LexResult { - // TODO(kdy1): unwrap -> Err - let c = char::from_u32( - u32::from_str_radix(s, radix).expect("failed to parse string as number"), - ) - .expect("failed to parse number as char"); - - Ok(c) - } - - fn is_hex(s: &str) -> bool { - s.chars().all(|c| c.is_ascii_hexdigit()) - } - - fn is_dec(s: &str) -> bool { - s.chars().all(|c| c.is_ascii_digit()) - } - - let mut s = SmartString::::default(); - - let c = self.input.cur()?; - debug_assert_eq!(c, Some(RawToken::BitAndOp)); - unsafe { - // Safety: cur() was Some('&') - self.input.bump(1); - } - - let start_pos = self.input.cur_pos(); - - for _ in 0..10 { - let c = match self.input.cur()? 
{ - Some(c) => c, - None => break, - }; - unsafe { - // Safety: cur() was Some(c) - self.input.bump(1); - } - - if c == RawToken::Semi { - if let Some(stripped) = s.strip_prefix('#') { - if stripped.starts_with('x') { - if is_hex(&s[2..]) { - let value = from_code(&s[2..], 16)?; - - return Ok((value, format!("&{};", s))); - } - } else if is_dec(stripped) { - let value = from_code(stripped, 10)?; - - return Ok((value, format!("&{};", s))); - } - } else if let Some(entity) = xhtml(&s) { - return Ok((entity, format!("&{};", s))); - } - - break; - } - - s.push(c) - } - - unsafe { - // Safety: start_pos is a valid position because we got it from self.input - self.input.reset_to(start_pos); - } - - Ok(('&', "&".to_string())) - } - - pub(super) fn read_jsx_new_line( - &mut self, - normalize_crlf: bool, - ) -> LexResult> { - debug_assert!(self.syntax.jsx()); - - let ch = self.input.cur().unwrap(); - unsafe { - // Safety: cur() was Some(ch) - self.input.bump(1); - } - - let out = if ch == '\r' && self.input.cur() == Some('\n') { - unsafe { - // Safety: cur() was Some('\n') - self.input.bump(1); - } - Either::Left(if normalize_crlf { "\n" } else { "\r\n" }) - } else { - Either::Right(ch) - }; - let cur_pos = self.input.cur_pos(); - self.state.cur_line += 1; - self.state.line_start = cur_pos; - - Ok(out) - } - - pub(super) fn read_jsx_str(&mut self, quote: char) -> LexResult { - debug_assert!(self.syntax.jsx()); - - let start = self.input.cur_pos(); - - unsafe { - // Safety: cur() was Some(quote) - self.input.bump(1); // `quote` - } - - let mut out = String::new(); - let mut chunk_start = self.input.cur_pos(); - - loop { - let ch = match self.input.cur()? 
{ - Some(c) => c, - None => { - let start = self.state.start; - self.emit_error(start, SyntaxError::UnterminatedStrLit); - break; - } - }; - - let cur_pos = self.input.cur_pos(); - - if ch == '\\' { - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - - out.push_str(value); - out.push('\\'); - - self.bump(); - - chunk_start = self.input.cur_pos(); - - continue; - } - - if ch == quote { - break; - } - - if ch == '&' { - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - - out.push_str(value); - - let jsx_entity = self.read_jsx_entity()?; - - out.push(jsx_entity.0); - - chunk_start = self.input.cur_pos(); - } else if ch.is_line_terminator() { - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - - out.push_str(value); - - match self.read_jsx_new_line(false)? { - Either::Left(s) => { - out.push_str(s); - } - Either::Right(c) => { - out.push(c); - } - } - - chunk_start = cur_pos + BytePos(ch.len_utf8() as _); - } else { - unsafe { - // Safety: cur() was Some(ch) - self.input.bump(1); - } - } - } - - let value = if out.is_empty() { - // Fast path: We don't need to allocate - - let cur_pos = self.input.cur_pos(); - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - - self.atoms.atom(value) - } else { - let cur_pos = self.input.cur_pos(); - let value = unsafe { - // Safety: We already checked for the range - self.input.slice(chunk_start, cur_pos) - }; - - out.push_str(value); - - self.atoms.atom(out) - }; - - // it might be at the end of the file when - // the string literal is unterminated - if self.input.peek_ahead()?.is_some() { - unsafe { - // Safety: We called peek_ahead() which means cur() was Some - self.input.bump(1); - } - } - - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: Both of `start` and 
`end` are generated from `cur_pos()` - self.input.slice(start, end) - }; - - Ok(Token::Str { - value, - raw: self.atoms.atom(raw), - }) - } - - /// Read a JSX identifier (valid tag or attribute name). - /// - /// Optimized version since JSX identifiers can"t contain - /// escape characters and so can be read as single slice. - /// Also assumes that first character was already checked - /// by isIdentifierStart in readToken. - pub(super) fn read_jsx_word(&mut self) -> LexResult { - debug_assert!(self.syntax.jsx()); - debug_assert!(self.input.cur()?.is_some()); - debug_assert!(self.input.cur()?.unwrap().is_ident_start()); - - let mut first = true; - let slice = self.input.uncons_while(|c| { - if first { - first = false; - c.is_ident_start() - } else { - c.is_ident_part() || c == '-' - } - }); - - Ok(Token::JSXName { - name: self.atoms.atom(slice), - }) - } + // pub(super) fn read_jsx_token(&mut self) -> LexResult> { + // debug_assert!(self.syntax.jsx()); + + // let start = self.input.cur_pos(); + // let mut chunk_start = self.input.cur_pos(); + // let mut value = String::new(); + + // loop { + // let cur = match self.input.cur()? { + // Some(c) => c, + // None => { + // let start = self.state.start; + // self.error(start, SyntaxError::UnterminatedJSXContents)? + // } + // }; + // let cur_pos = self.input.cur_pos(); + + // match cur { + // RawToken::LtOp + // if self.had_line_break_before_last() + // && self.input.peek()? 
== Some(RawToken::LConflictMarker) + // => { + // let span = Span::new(cur_pos, cur_pos + BytePos(7)); + + // self.emit_error_span(span, SyntaxError::TS1185); + // self.skip_line_comment(6); + // self.skip_space::(); + // return self.read_token(); + // } + // RawToken::LtOp | RawToken::LBrace => { + // // + // if cur_pos == self.state.start { + // if cur == RawToken::LtOp && self.state.is_expr_allowed { + // unsafe { + // // Safety: cur() was Some('<') + // self.input.bump(1); + // } + // return Ok(Some(Token::JSXTagStart)); + // } + // return self.read_token(); + // } + + // let value = if value.is_empty() { + // // Fast path: We don't need to allocate extra buffer for + // value let s = unsafe { + // // Safety: We already checked for the range + // self.input.slice(chunk_start, cur_pos) + // }; + // self.atoms.atom(s) + // } else { + // value.push_str(unsafe { + // // Safety: We already checked for the range + // self.input.slice(chunk_start, cur_pos) + // }); + // self.atoms.atom(value) + // }; + + // let raw = { + // let s = unsafe { + // // Safety: We already checked for the range + // self.input.slice(start, cur_pos) + // }; + // self.atoms.atom(s) + // }; + + // return Ok(Some(Token::JSXText { raw, value })); + // } + // RawToken::GtOp => { + // self.emit_error( + // cur_pos, + // SyntaxError::UnexpectedTokenWithSuggestions { + // candidate_list: vec!["`{'>'}`", "`>`"], + // }, + // ); + // unsafe { + // // Safety: cur() was Some('>') + // self.input.bump(1) + // } + // } + // RawToken::RBrace => { + // self.emit_error( + // cur_pos, + // SyntaxError::UnexpectedTokenWithSuggestions { + // candidate_list: vec!["`{'}'}`", "`}`"], + // }, + // ); + // unsafe { + // // Safety: cur() was Some('}') + // self.input.bump(1) + // } + // } + // RawToken::BitAndOp => { + // value.push_str(unsafe { + // // Safety: We already checked for the range + // self.input.slice(chunk_start, cur_pos) + // }); + + // let jsx_entity = self.read_jsx_entity()?; + + // 
value.push(jsx_entity.0); + // chunk_start = self.input.cur_pos(); + // } + + // _ => { + // if cur.is_line_terminator() { + // value.push_str(unsafe { + // // Safety: We already checked for the range + // self.input.slice(chunk_start, cur_pos) + // }); + // match self.read_jsx_new_line(true)? { + // Either::Left(s) => value.push_str(s), + // Either::Right(c) => value.push(c), + // } + // chunk_start = self.input.cur_pos(); + // } else { + // unsafe { + // // Safety: cur() was Some(c) + // self.input.bump(1) + // } + // } + // } + // } + // } + // } + + // pub(super) fn read_jsx_entity(&mut self) -> LexResult<(char, String)> { + // debug_assert!(self.syntax.jsx()); + + // fn from_code(s: &str, radix: u32) -> LexResult { + // // TODO(kdy1): unwrap -> Err + // let c = char::from_u32( + // u32::from_str_radix(s, radix).expect("failed to parse string as + // number"), ) + // .expect("failed to parse number as char"); + + // Ok(c) + // } + + // fn is_hex(s: &str) -> bool { + // s.chars().all(|c| c.is_ascii_hexdigit()) + // } + + // fn is_dec(s: &str) -> bool { + // s.chars().all(|c| c.is_ascii_digit()) + // } + + // let mut s = SmartString::::default(); + + // let c = self.input.cur()?; + // debug_assert_eq!(c, Some(RawToken::BitAndOp)); + // unsafe { + // // Safety: cur() was Some('&') + // self.input.bump(1); + // } + + // let start_pos = self.input.cur_pos(); + + // for _ in 0..10 { + // let c = match self.input.cur()? 
{ + // Some(c) => c, + // None => break, + // }; + // unsafe { + // // Safety: cur() was Some(c) + // self.input.bump(1); + // } + + // if c == RawToken::Semi { + // if let Some(stripped) = s.strip_prefix('#') { + // if stripped.starts_with('x') { + // if is_hex(&s[2..]) { + // let value = from_code(&s[2..], 16)?; + + // return Ok((value, format!("&{};", s))); + // } + // } else if is_dec(stripped) { + // let value = from_code(stripped, 10)?; + + // return Ok((value, format!("&{};", s))); + // } + // } else if let Some(entity) = xhtml(&s) { + // return Ok((entity, format!("&{};", s))); + // } + + // break; + // } + + // s.push(c) + // } + + // unsafe { + // // Safety: start_pos is a valid position because we got it from + // self.input self.input.reset_to(start_pos); + // } + + // Ok(('&', "&".to_string())) + // } + + // pub(super) fn read_jsx_new_line( + // &mut self, + // normalize_crlf: bool, + // ) -> LexResult> { + // todo!() + // // debug_assert!(self.syntax.jsx()); + + // // let ch = self.input.cur().unwrap(); + // // unsafe { + // // // Safety: cur() was Some(ch) + // // self.input.bump(1); + // // } + + // // let out = if ch == '\r' && self.input.cur() == Some('\n') { + // // unsafe { + // // // Safety: cur() was Some('\n') + // // self.input.bump(1); + // // } + // // Either::Left(if normalize_crlf { "\n" } else { "\r\n" }) + // // } else { + // // Either::Right(ch) + // // }; + // // let cur_pos = self.input.cur_pos(); + // // self.state.cur_line += 1; + // // self.state.line_start = cur_pos; + + // // Ok(out) + // } + + // pub(super) fn read_jsx_str(&mut self, quote: char) -> LexResult { + // todo!(); + + // // debug_assert!(self.syntax.jsx()); + + // // let start = self.input.cur_pos(); + + // // unsafe { + // // // Safety: cur() was Some(quote) + // // self.input.bump(1); // `quote` + // // } + + // // let mut out = String::new(); + // // let mut chunk_start = self.input.cur_pos(); + + // // loop { + // // let ch = match self.input.cur()? 
{ + // // Some(c) => c, + // // None => { + // // let start = self.state.start; + // // self.emit_error(start, SyntaxError::UnterminatedStrLit); + // // break; + // // } + // // }; + + // // let cur_pos = self.input.cur_pos(); + + // // if ch == '\\' { + // // let value = unsafe { + // // // Safety: We already checked for the range + // // self.input.slice(chunk_start, cur_pos) + // // }; + + // // out.push_str(value); + // // out.push('\\'); + + // // self.bump(); + + // // chunk_start = self.input.cur_pos(); + + // // continue; + // // } + + // // if ch == quote { + // // break; + // // } + + // // if ch == '&' { + // // let value = unsafe { + // // // Safety: We already checked for the range + // // self.input.slice(chunk_start, cur_pos) + // // }; + + // // out.push_str(value); + + // // let jsx_entity = self.read_jsx_entity()?; + + // // out.push(jsx_entity.0); + + // // chunk_start = self.input.cur_pos(); + // // } else if ch.is_line_terminator() { + // // let value = unsafe { + // // // Safety: We already checked for the range + // // self.input.slice(chunk_start, cur_pos) + // // }; + + // // out.push_str(value); + + // // match self.read_jsx_new_line(false)? 
{ + // // Either::Left(s) => { + // // out.push_str(s); + // // } + // // Either::Right(c) => { + // // out.push(c); + // // } + // // } + + // // chunk_start = cur_pos + BytePos(ch.len_utf8() as _); + // // } else { + // // unsafe { + // // // Safety: cur() was Some(ch) + // // self.input.bump(1); + // // } + // // } + // // } + + // // let value = if out.is_empty() { + // // // Fast path: We don't need to allocate + + // // let cur_pos = self.input.cur_pos(); + // // let value = unsafe { + // // // Safety: We already checked for the range + // // self.input.slice(chunk_start, cur_pos) + // // }; + + // // self.atoms.atom(value) + // // } else { + // // let cur_pos = self.input.cur_pos(); + // // let value = unsafe { + // // // Safety: We already checked for the range + // // self.input.slice(chunk_start, cur_pos) + // // }; + + // // out.push_str(value); + + // // self.atoms.atom(out) + // // }; + + // // // it might be at the end of the file when + // // // the string literal is unterminated + // // if self.input.peek_ahead()?.is_some() { + // // unsafe { + // // // Safety: We called peek_ahead() which means cur() was Some + // // self.input.bump(1); + // // } + // // } + + // // let end = self.input.cur_pos(); + // // let raw = unsafe { + // // // Safety: Both of `start` and `end` are generated from + // // `cur_pos()` self.input.slice(start, end) + // // }; + + // // Ok(Token::Str { + // // value, + // // raw: self.atoms.atom(raw), + // // }) + // } + + // /// Read a JSX identifier (valid tag or attribute name). + // /// + // /// Optimized version since JSX identifiers can"t contain + // /// escape characters and so can be read as single slice. + // /// Also assumes that first character was already checked + // /// by isIdentifierStart in readToken. 
+ // pub(super) fn read_jsx_word(&mut self) -> LexResult { + // todo!() + + // // debug_assert!(self.syntax.jsx()); + // // debug_assert!(self.input.cur()?.is_some()); + // // debug_assert!(self.input.cur()?.unwrap().is_ident_start()); + + // // let mut first = true; + // // let slice = self.input.uncons_while(|c| { + // // if first { + // // first = false; + // // c.is_ident_start() + // // } else { + // // c.is_ident_part() || c == '-' + // // } + // // }); + + // // Ok(Token::JSXName { + // // name: self.atoms.atom(slice), + // // }) + // } } macro_rules! xhtml { diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 6a78756b08c2..bade182c9b18 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -16,7 +16,7 @@ pub use self::{ }; use crate::{ error::{Error, SyntaxError}, - token::{BinOpToken, IdentLike, Token, Word}, + token::{BinOpToken, IdentLike, Keyword, KnownIdent, Token, Word}, Context, Syntax, }; @@ -173,21 +173,29 @@ impl<'a> Lexer<'a> { /// babel: `getTokenFromCode` fn read_token(&mut self) -> LexResult> { let start = self.input.cur_pos(); - let cur = match self.input.cur()? { - Some(cur) => cur, + let cur = match self.input.next() { + Some(cur) => cur?, None => return Ok(None), }; - match cur { + Ok(Some(match cur { RawToken::LegacyCommentOpen => { // XML style comment. ` + self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); + // self.skip_line_comment(0); + // self.skip_space::(); + return self.read_token(); + } + RawToken::LConflictMarker | RawToken::RConflictMarker if self.had_line_break_before_last() => { @@ -201,959 +209,1168 @@ impl<'a> Lexer<'a> { // ^ self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185); - self.skip_line_comment(5); - self.skip_space::(); + // self.skip_line_comment(5); + // self.skip_space::(); return self.read_token(); } - - _ => {} - } - } - - /// Read a token given `.`. 
- /// - /// This is extracted as a method to reduce size of `read_token`. - #[inline(never)] - fn read_token_dot(&mut self) -> LexResult { - // Check for eof - let next = match self.input.peek()? { - Some(next) => next, - None => { - unsafe { - // Safety: cur() is Some(',') - self.input.bump(1); - } - return Ok(tok!('.')); + RawToken::Arrow => Token::Arrow, + RawToken::Hash => Token::Hash, + RawToken::At => Token::At, + RawToken::Dot => Token::Dot, + RawToken::DotDotDot => Token::DotDotDot, + RawToken::Bang => Token::Bang, + RawToken::LParen => Token::LParen, + RawToken::RParen => Token::RParen, + RawToken::LBracket => Token::LBracket, + RawToken::RBracket => Token::RBracket, + RawToken::LBrace => Token::LBrace, + RawToken::RBrace => Token::RBrace, + RawToken::Semi => Token::Semi, + RawToken::Comma => Token::Comma, + RawToken::Colon => Token::Colon, + RawToken::BackQuote => Token::BackQuote, + RawToken::DollarLBrace => Token::DollarLBrace, + RawToken::QuestionMark => Token::QuestionMark, + RawToken::PlusPlus => Token::PlusPlus, + RawToken::MinusMinus => Token::MinusMinus, + RawToken::Tilde => Token::Tilde, + RawToken::Str => Token::Str { + value: self.atoms.atom(self.input.cur_slice()), + raw: self.atoms.atom(self.input.cur_slice()), + }, + RawToken::Num => Token::Num { + value: self.input.cur_slice().parse().unwrap(), + raw: self.atoms.atom(self.input.cur_slice()), + }, + RawToken::BigInt => Token::BigInt { + value: self.input.cur_slice().parse().map(Box::new).unwrap(), + raw: self.atoms.atom(self.input.cur_slice()), + }, + RawToken::Shebang => Token::Shebang(self.atoms.atom(self.input.cur_slice())), + RawToken::Null => Token::Word(Word::Null), + RawToken::True => Token::Word(Word::True), + RawToken::False => Token::Word(Word::False), + RawToken::EqEqOp => Token::BinOp(BinOpToken::EqEq), + RawToken::NotEqOp => Token::BinOp(BinOpToken::NotEq), + RawToken::EqEqEqOp => Token::BinOp(BinOpToken::EqEqEq), + RawToken::NotEqEqOp => Token::BinOp(BinOpToken::NotEqEq), + 
RawToken::LtOp => Token::BinOp(BinOpToken::Lt), + RawToken::LtEqOp => Token::BinOp(BinOpToken::LtEq), + RawToken::GtOp => Token::BinOp(BinOpToken::Gt), + RawToken::GtEqOp => Token::BinOp(BinOpToken::GtEq), + RawToken::LShiftOp => Token::BinOp(BinOpToken::LShift), + RawToken::RShiftOp => Token::BinOp(BinOpToken::RShift), + RawToken::ZeroFillRShiftOp => Token::BinOp(BinOpToken::ZeroFillRShift), + RawToken::AddOp => Token::BinOp(BinOpToken::Add), + RawToken::SubOp => Token::BinOp(BinOpToken::Sub), + RawToken::MulOp => Token::BinOp(BinOpToken::Mul), + RawToken::DivOp => Token::BinOp(BinOpToken::Div), + RawToken::ModOp => Token::BinOp(BinOpToken::Mod), + RawToken::BitOrOp => Token::BinOp(BinOpToken::BitOr), + RawToken::BitXorOp => Token::BinOp(BinOpToken::BitXor), + RawToken::BitAndOp => Token::BinOp(BinOpToken::BitAnd), + RawToken::ExpOp => Token::BinOp(BinOpToken::Exp), + RawToken::LogicalOrOp => Token::BinOp(BinOpToken::LogicalOr), + RawToken::LogicalAndOp => Token::BinOp(BinOpToken::LogicalAnd), + RawToken::NullishCoalescingOp => Token::BinOp(BinOpToken::NullishCoalescing), + RawToken::AssignOp => Token::AssignOp(AssignOp::Assign), + RawToken::AddAssignOp => Token::AssignOp(AssignOp::AddAssign), + RawToken::SubAssignOp => Token::AssignOp(AssignOp::SubAssign), + RawToken::MulAssignOp => Token::AssignOp(AssignOp::MulAssign), + RawToken::DivAssignOp => Token::AssignOp(AssignOp::DivAssign), + RawToken::ModAssignOp => Token::AssignOp(AssignOp::ModAssign), + RawToken::LShiftAssignOp => Token::AssignOp(AssignOp::LShiftAssign), + RawToken::RShiftAssignOp => Token::AssignOp(AssignOp::RShiftAssign), + RawToken::ZeroFillRShiftAssignOp => Token::AssignOp(AssignOp::ZeroFillRShiftAssign), + RawToken::BitOrAssignOp => Token::AssignOp(AssignOp::BitOrAssign), + RawToken::BitXorAssignOp => Token::AssignOp(AssignOp::BitXorAssign), + RawToken::BitAndAssignOp => Token::AssignOp(AssignOp::BitAndAssign), + RawToken::ExpAssignOp => Token::AssignOp(AssignOp::ExpAssign), + 
RawToken::AndAssignOp => Token::AssignOp(AssignOp::AndAssign), + RawToken::OrAssignOp => Token::AssignOp(AssignOp::OrAssign), + RawToken::NullishAssignOp => Token::AssignOp(AssignOp::NullishAssign), + RawToken::Ident => Token::Word(Word::Ident(IdentLike::Other({ + self.atoms.atom(self.input.cur_slice()) + }))), + RawToken::NewLine | RawToken::Whitespace => { + // self.skip_space::(); + return self.read_token(); } - }; - if next.is_ascii_digit() { - return self.read_number(true).map(|v| match v { - Left((value, raw)) => Token::Num { value, raw }, - Right((value, raw)) => Token::BigInt { value, raw }, - }); - } - - unsafe { - // Safety: cur() is Some - // 1st `.` - self.input.bump(1); - } - - if next == '.' && self.input.peek() == Some('.') { - unsafe { - // Safety: peek() was Some - - self.input.bump(2); // `..` + RawToken::LineComment + | RawToken::BlockComment + | RawToken::LegacyCommentOpen + | RawToken::LegacyCommentClose + | RawToken::LConflictMarker + | RawToken::RConflictMarker => { + // self.skip_line_comment(0); + // self.skip_space::(); + return self.read_token(); } - return Ok(tok!("...")); - } + RawToken::Await => Token::Word(Word::Keyword(Keyword::Await)), + RawToken::Break => Token::Word(Word::Keyword(Keyword::Break)), - Ok(tok!('.')) - } + RawToken::Case => Token::Word(Word::Keyword(Keyword::Case)), - /// Read a token given `0`. - /// - /// This is extracted as a method to reduce size of `read_token`. 
- #[inline(never)] - fn read_token_zero(&mut self) -> LexResult { - let next = self.input.peek()?; - - let bigint = match next { - Some('x') | Some('X') => self.read_radix_number::<16>(), - Some('o') | Some('O') => self.read_radix_number::<8>(), - Some('b') | Some('B') => self.read_radix_number::<2>(), - _ => { - return self.read_number(false).map(|v| match v { - Left((value, raw)) => Token::Num { value, raw }, - Right((value, raw)) => Token::BigInt { value, raw }, - }); - } - }; + RawToken::Catch => Token::Word(Word::Keyword(Keyword::Catch)), - bigint.map(|v| match v { - Left((value, raw)) => Token::Num { value, raw }, - Right((value, raw)) => Token::BigInt { value, raw }, - }) - } + RawToken::Continue => Token::Word(Word::Keyword(Keyword::Continue)), - /// Read a token given `|` or `&`. - /// - /// This is extracted as a method to reduce size of `read_token`. - #[inline(never)] - fn read_token_logical(&mut self, c: u8) -> LexResult { - let had_line_break_before_last = self.had_line_break_before_last(); - let start = self.input.cur_pos(); + RawToken::Debugger => Token::Word(Word::Keyword(Keyword::Debugger)), - unsafe { - // Safety: cur() is Some(c as char) - self.input.bump(1); - } - let token = if c == b'&' { - BinOpToken::BitAnd - } else { - BinOpToken::BitOr - }; + RawToken::Default_ => Token::Word(Word::Keyword(Keyword::Default_)), - // '|=', '&=' - if self.input.eat_byte(b'=') { - return Ok(Token::AssignOp(match token { - BinOpToken::BitAnd => AssignOp::BitAndAssign, - BinOpToken::BitOr => AssignOp::BitOrAssign, - _ => unreachable!(), - })); - } + RawToken::Do => Token::Word(Word::Keyword(Keyword::Do)), - // '||', '&&' - if self.input.cur() == Some(c as char) { - unsafe { - // Safety: cur() is Some(c) - self.input.bump(1); - } + RawToken::Else => Token::Word(Word::Keyword(Keyword::Else)), - if self.input.cur() == Some('=') { - unsafe { - // Safety: cur() is Some('=') - self.input.bump(1); - } - return Ok(Token::AssignOp(match token { - BinOpToken::BitAnd => 
op!("&&="), - BinOpToken::BitOr => op!("||="), - _ => unreachable!(), - })); - } + RawToken::Finally => Token::Word(Word::Keyword(Keyword::Finally)), - // ||||||| - // ^ - if had_line_break_before_last && token == BinOpToken::BitOr && self.is_str("||||| ") { - let span = fixed_len_span(start, 7); - self.emit_error_span(span, SyntaxError::TS1185); - self.skip_line_comment(5); - self.skip_space::(); - return self.error_span(span, SyntaxError::TS1185); - } + RawToken::For => Token::Word(Word::Keyword(Keyword::For)), - return Ok(Token::BinOp(match token { - BinOpToken::BitAnd => BinOpToken::LogicalAnd, - BinOpToken::BitOr => BinOpToken::LogicalOr, - _ => unreachable!(), - })); - } + RawToken::Function => Token::Word(Word::Keyword(Keyword::Function)), - Ok(Token::BinOp(token)) - } + RawToken::If => Token::Word(Word::Keyword(Keyword::If)), - /// Read an escaped character for string literal. - /// - /// In template literal, we should preserve raw string. - fn read_escaped_char(&mut self, in_template: bool) -> LexResult>> { - debug_assert_eq!(self.input.cur(), Some('\\')); + RawToken::Return => Token::Word(Word::Keyword(Keyword::Return)), - let start = self.input.cur_pos(); + RawToken::Switch => Token::Word(Word::Keyword(Keyword::Switch)), - self.bump(); // '\' + RawToken::Throw => Token::Word(Word::Keyword(Keyword::Throw)), - let c = match self.input.cur()? { - Some(c) => c, - None => self.error_span(pos_span(start), SyntaxError::InvalidStrEscape)?, - }; + RawToken::Try => Token::Word(Word::Keyword(Keyword::Try)), - macro_rules! 
push_c_and_ret { - ($c:expr) => {{ - $c - }}; - } + RawToken::Var => Token::Word(Word::Keyword(Keyword::Var)), - let c = match c { - '\\' => push_c_and_ret!('\\'), - 'n' => push_c_and_ret!('\n'), - 'r' => push_c_and_ret!('\r'), - 't' => push_c_and_ret!('\t'), - 'b' => push_c_and_ret!('\u{0008}'), - 'v' => push_c_and_ret!('\u{000b}'), - 'f' => push_c_and_ret!('\u{000c}'), - '\r' => { - self.bump(); // remove '\r' + RawToken::Let => Token::Word(Word::Keyword(Keyword::Let)), - self.input.eat(RawToken::NewLine); + RawToken::Const => Token::Word(Word::Keyword(Keyword::Const)), - return Ok(None); - } - '\n' | '\u{2028}' | '\u{2029}' => { - self.bump(); + RawToken::While => Token::Word(Word::Keyword(Keyword::While)), - return Ok(None); - } + RawToken::With => Token::Word(Word::Keyword(Keyword::With)), - // read hexadecimal escape sequences - 'x' => { - self.bump(); // 'x' - - match self.read_int_u32::<16>(2)? { - Some(val) => return Ok(Some(vec![Char::from(val)])), - None => self.error( - start, - SyntaxError::BadCharacterEscapeSequence { - expected: "2 hex characters", - }, - )?, - } - } + RawToken::New => Token::Word(Word::Keyword(Keyword::New)), - // read unicode escape sequences - 'u' => match self.read_unicode_escape() { - Ok(chars) => return Ok(Some(chars)), - Err(err) => self.error(start, err.into_kind())?, - }, + RawToken::This => Token::Word(Word::Keyword(Keyword::This)), - // octal escape sequences - '0'..='7' => { - self.bump(); + RawToken::Super => Token::Word(Word::Keyword(Keyword::Super)), - let first_c = if c == '0' { - match self.input.cur()? { - Some(next) if next.is_digit(8) => c, - // \0 is not an octal literal nor decimal literal. - _ => return Ok(Some(vec!['\u{0000}'.into()])), - } - } else { - c - }; + RawToken::Class => Token::Word(Word::Keyword(Keyword::Class)), - // TODO: Show template instead of strict mode - if in_template { - self.error(start, SyntaxError::LegacyOctal)? 
- } + RawToken::Extends => Token::Word(Word::Keyword(Keyword::Extends)), - self.emit_strict_mode_error(start, SyntaxError::LegacyOctal); - - let mut value: u8 = first_c.to_digit(8).unwrap() as u8; - - macro_rules! one { - ($check:expr) => {{ - let cur = self.input.cur(); - - match cur.and_then(|c| c.to_digit(8)) { - Some(v) => { - value = if $check { - let new_val = value - .checked_mul(8) - .and_then(|value| value.checked_add(v as u8)); - match new_val { - Some(val) => val, - None => return Ok(Some(vec![Char::from(value as char)])), - } - } else { - value * 8 + v as u8 - }; - - self.bump(); - } - _ => return Ok(Some(vec![Char::from(value as u32)])), - } - }}; - } + RawToken::Export => Token::Word(Word::Keyword(Keyword::Export)), - one!(false); - one!(true); + RawToken::Import => Token::Word(Word::Keyword(Keyword::Import)), - return Ok(Some(vec![Char::from(value as char)])); - } - _ => c, - }; + RawToken::Yield => Token::Word(Word::Keyword(Keyword::Yield)), - unsafe { - // Safety: cur() is Some(c) if this method is called. - self.input.bump(1); - } + RawToken::In => Token::Word(Word::Keyword(Keyword::In)), - Ok(Some(vec![c.into()])) - } + RawToken::InstanceOf => Token::Word(Word::Keyword(Keyword::InstanceOf)), - fn read_token_plus_minus(&mut self, c: u8) -> LexResult> { - let start = self.input.cur_pos(); + RawToken::TypeOf => Token::Word(Word::Keyword(Keyword::TypeOf)), - unsafe { - // Safety: cur() is Some(c), if this method is called. 
- self.input.bump(1); - } + RawToken::Void => Token::Word(Word::Keyword(Keyword::Void)), - // '++', '--' - Ok(Some(if self.input.cur() == Some(c as char) { - unsafe { - // Safety: cur() is Some(c) - self.input.bump(1); - } + RawToken::Delete => Token::Word(Word::Keyword(Keyword::Delete)), - // Handle --> - if self.state.had_line_break && c == b'-' && self.input.eat(b'>') { - self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); - self.skip_line_comment(0); - self.skip_space::(); - return self.read_token(); - } + RawToken::Abstract => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Abstract))), - if c == b'+' { - Token::PlusPlus - } else { - Token::MinusMinus - } - } else if self.input.eat_byte(b'=') { - Token::AssignOp(if c == b'+' { - AssignOp::AddAssign - } else { - AssignOp::SubAssign - }) - } else { - Token::BinOp(if c == b'+' { - BinOpToken::Add - } else { - BinOpToken::Sub - }) - })) - } + RawToken::As => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::As))), - fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { - let start = self.input.cur_pos(); - let had_line_break_before_last = self.had_line_break_before_last(); + RawToken::Async => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Async))), - unsafe { - // Safety: cur() is Some(c) if this method is called. - self.input.bump(1); - } + RawToken::From => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::From))), - Ok(Some(if self.input.eat(RawToken::AssignOp)? { - // "==" + RawToken::Of => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Of))), - if self.input.eat(RawToken::AssignOp)? { - if c == b'!' { - Token::BinOp(BinOpToken::NotEqEq) - } else { - // ======= - // ^ - if had_line_break_before_last && self.is_str("====") { - self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185); - self.skip_line_comment(4); - self.skip_space::(); - return self.read_token(); - } - - Token::BinOp(BinOpToken::EqEqEq) - } - } else if c == b'!' 
{ - Token::BinOp(BinOpToken::NotEq) - } else { - Token::BinOp(BinOpToken::EqEq) - } - } else if c == b'=' && self.input.eat_byte(b'>') { - // "=>" + RawToken::Type => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Type))), - Token::Arrow - } else if c == b'!' { - Token::Bang - } else { - Token::AssignOp(AssignOp::Assign) - })) - } -} + RawToken::Global => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Global))), -impl Lexer<'_> { - /// This can be used if there's no keyword starting with the first - /// character. - fn read_ident_unknown(&mut self) -> LexResult { - debug_assert!(self.input.cur()?.is_some()); + RawToken::Static => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Static))), - let (word, _) = self - .read_word_as_str_with(|l, s, _, _| Word::Ident(IdentLike::Other(l.atoms.atom(s))))?; + RawToken::Using => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Using))), - Ok(Word(word)) - } + RawToken::Readonly => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Readonly))), - /// This can be used if there's no keyword starting with the first - /// character. - fn read_word_with( - &mut self, - convert: &dyn Fn(&str) -> Option, - ) -> LexResult> { - debug_assert!(self.input.cur()?.is_some()); + RawToken::Unique => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Unique))), - let start = self.input.cur_pos(); - let (word, has_escape) = self.read_word_as_str_with(|l, s, _, can_be_known| { - if can_be_known { - if let Some(word) = convert(s) { - return word; - } - } + RawToken::Keyof => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Keyof))), - Word::Ident(IdentLike::Other(l.atoms.atom(s))) - })?; - - // Note: ctx is store in lexer because of this error. - // 'await' and 'yield' may have semantic of reserved word, which means lexer - // should know context or parser should handle this error. Our approach to this - // problem is former one. 
- if has_escape && self.ctx.is_reserved(&word) { - self.error( - start, - SyntaxError::EscapeInReservedWord { word: word.into() }, - )? - } else { - Ok(Some(Token::Word(word))) - } - } + RawToken::Declare => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Declare))), - /// This method is optimized for texts without escape sequences. - /// - /// `convert(text, has_escape, can_be_keyword)` - fn read_word_as_str_with(&mut self, convert: F) -> LexResult<(Ret, bool)> - where - F: for<'any> FnOnce(&'any mut Lexer<'_>, &str, bool, bool) -> Ret, - { - debug_assert!(self.input.cur()?.is_some()); - let mut first = true; - let mut can_be_keyword = true; - let mut slice_start = self.input.cur_pos(); - let mut has_escape = false; - - self.with_buf(|l, buf| { - loop { - if let Some(c) = l.input.cur_as_ascii() { - // Performance optimization - if can_be_keyword && (c.is_ascii_uppercase() || c.is_ascii_digit()) { - can_be_keyword = false; - } - - if Ident::is_valid_continue(c as _) { - l.bump(); - continue; - } else if first && Ident::is_valid_start(c as _) { - l.bump(); - first = false; - continue; - } - - // unicode escape - if c == b'\\' { - first = false; - has_escape = true; - let start = l.input.cur_pos(); - l.bump(); - - if !l.is(b'u') { - l.error_span(pos_span(start), SyntaxError::ExpectedUnicodeEscape)? 
- } - - { - let end = l.input.cur_pos(); - let s = unsafe { - // Safety: start and end are valid position because we got them from - // `self.input` - l.input.slice(slice_start, start) - }; - buf.push_str(s); - unsafe { - // Safety: We got end from `self.input` - l.input.reset_to(end); - } - } - - let chars = l.read_unicode_escape()?; - - if let Some(c) = chars.first() { - let valid = if first { - c.is_ident_start() - } else { - c.is_ident_part() - }; - - if !valid { - l.emit_error(start, SyntaxError::InvalidIdentChar); - } - } - - for c in chars { - buf.extend(c); - } - - slice_start = l.input.cur_pos(); - continue; - } - - // ASCII but not a valid identifier - - break; - } + RawToken::Enum => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Enum))), - if let Some(c) = l.input.cur() { - if Ident::is_valid_continue(c) { - l.bump(); - continue; - } else if first && Ident::is_valid_start(c) { - l.bump(); - first = false; - continue; - } - } + RawToken::Is => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Is))), - break; - } + RawToken::Infer => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Infer))), - let end = l.input.cur_pos(); - - let value = if !has_escape { - // Fast path: raw slice is enough if there's no escape. - - let s = unsafe { - // Safety: slice_start and end are valid position because we got them from - // `self.input` - l.input.slice(slice_start, end) - }; - let s = unsafe { - // Safety: We don't use 'static. We just bypass the lifetime check. 
- transmute::<&str, &'static str>(s) - }; - - convert(l, s, has_escape, can_be_keyword) - } else { - let s = unsafe { - // Safety: slice_start and end are valid position because we got them from - // `self.input` - l.input.slice(slice_start, end) - }; - buf.push_str(s); - - convert(l, buf, has_escape, can_be_keyword) - }; - - Ok((value, has_escape)) - }) - } - - fn read_unicode_escape(&mut self) -> LexResult> { - debug_assert_eq!(self.input.cur(), Some('u')); - - let mut chars = Vec::new(); - let mut is_curly = false; + RawToken::Symbol => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Symbol))), - self.bump(); // 'u' - - if self.input.eat(b'{') { - is_curly = true; - } - - let state = self.input.cur_pos(); - let c = match self.read_int_u32::<16>(if is_curly { 0 } else { 4 }) { - Ok(Some(val)) => { - if 0x0010_ffff >= val { - char::from_u32(val) - } else { - let start = self.input.cur_pos(); - - self.error( - start, - SyntaxError::BadCharacterEscapeSequence { - expected: if is_curly { - "1-6 hex characters in the range 0 to 10FFFF." - } else { - "4 hex characters" - }, - }, - )? - } + RawToken::Undefined => { + Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Undefined))) } - _ => { - let start = self.input.cur_pos(); - self.error( - start, - SyntaxError::BadCharacterEscapeSequence { - expected: if is_curly { - "1-6 hex characters" - } else { - "4 hex characters" - }, - }, - )? 
+ RawToken::Interface => { + Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Interface))) } - }; - match c { - Some(c) => { - chars.push(c.into()); + RawToken::Implements => { + Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Implements))) } - _ => { - unsafe { - // Safety: state is valid position because we got it from cur_pos() - self.input.reset_to(state); - } - - chars.push(Char::from('\\')); - chars.push(Char::from('u')); - - if is_curly { - chars.push(Char::from('{')); - - for _ in 0..6 { - if let Some(c) = self.input.cur() { - if c == '}' { - break; - } - self.bump(); + RawToken::Asserts => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Asserts))), - chars.push(Char::from(c)); - } else { - break; - } - } - - chars.push(Char::from('}')); - } else { - for _ in 0..4 { - if let Some(c) = self.input.cur()? { - self.bump(); - - chars.push(Char::from(c)); - } - } - } - } - } + RawToken::Require => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Require))), - if is_curly && !self.input.eat(b'}') { - self.error(state, SyntaxError::InvalidUnicodeEscape)? - } + RawToken::Asserts => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Asserts))), - Ok(chars) - } + RawToken::Get => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Get))), - /// See https://tc39.github.io/ecma262/#sec-literals-string-literals - fn read_str_lit(&mut self) -> LexResult { - debug_assert!(self.input.cur()? == Some('\'') || self.input.cur()? 
== Some('"')); - let start = self.input.cur_pos(); - let quote = self.input.cur()?.unwrap() as u8; - - self.bump(); // '"' - - let mut has_escape = false; - let mut slice_start = self.input.cur_pos(); - - self.with_buf(|l, buf| { - loop { - if let Some(c) = l.input.cur_as_ascii() { - if c == quote { - let value_end = l.input.cur_pos(); - - let value = if !has_escape { - let s = unsafe { - // Safety: slice_start and value_end are valid position because we - // got them from `self.input` - l.input.slice(slice_start, value_end) - }; - - l.atoms.atom(s) - } else { - let s = unsafe { - // Safety: slice_start and value_end are valid position because we - // got them from `self.input` - l.input.slice(slice_start, value_end) - }; - buf.push_str(s); - - l.atoms.atom(&**buf) - }; - - unsafe { - // Safety: cur is quote - l.input.bump(1); - } - - let end = l.input.cur_pos(); - - let raw = unsafe { - // Safety: start and end are valid position because we got them from - // `self.input` - l.input.slice(start, end) - }; - let raw = l.atoms.atom(raw); - - return Ok(Token::Str { value, raw }); - } - - if c == b'\\' { - has_escape = true; - - { - let end = l.input.cur_pos(); - let s = unsafe { - // Safety: start and end are valid position because we got them from - // `self.input` - l.input.slice(slice_start, end) - }; - buf.push_str(s); - } - - if let Some(chars) = l.read_escaped_char(false)? { - for c in chars { - buf.extend(c); - } - } - - slice_start = l.input.cur_pos(); - continue; - } - - if (c as char).is_line_break() { - break; - } - - unsafe { - // Safety: cur is a ascii character - l.input.bump(1); - } - continue; - } + RawToken::Set => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Set))), - match l.input.cur()? 
{ - Some(c) => { - if c.is_line_break() { - break; - } - unsafe { - // Safety: cur is Some(c) - l.input.bump(1); - } - } - None => break, - } - } + RawToken::Any => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Any))), - { - let end = l.input.cur_pos(); - let s = unsafe { - // Safety: start and end are valid position because we got them from - // `self.input` - l.input.slice(slice_start, end) - }; - buf.push_str(s); + RawToken::Intrinsic => { + Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Intrinsic))) } - l.emit_error(start, SyntaxError::UnterminatedStrLit); + RawToken::Unknown => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Unknown))), - let end = l.input.cur_pos(); + RawToken::String => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::String))), - let raw = unsafe { - // Safety: start and end are valid position because we got them from - // `self.input` - l.input.slice(start, end) - }; - Ok(Token::Str { - value: l.atoms.atom(&*buf), - raw: l.atoms.atom(raw), - }) - }) - } - - /// Expects current char to be '/' - fn read_regexp(&mut self, start: BytePos) -> LexResult { - unsafe { - // Safety: start is valid position, and cur() is Some('/') - self.input.reset_to(start); - } - - debug_assert_eq!(self.input.cur(), Some('/')); + RawToken::Object => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Object))), - let start = self.input.cur_pos(); - - self.bump(); + RawToken::Number => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Number))), - let (mut escaped, mut in_class) = (false, false); + RawToken::Bigint => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Bigint))), - let content = self.with_buf(|l, buf| { - while let Some(c) = l.input.cur() { - // This is ported from babel. - // Seems like regexp literal cannot contain linebreak. 
- if c.is_line_terminator() { - let span = l.span(start); + RawToken::Boolean => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Boolean))), - return Err(Error::new(span, SyntaxError::UnterminatedRegExp)); - } + RawToken::Never => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Never))), - if escaped { - escaped = false; - } else { - match c { - '[' => in_class = true, - ']' if in_class => in_class = false, - // Terminates content part of regex literal - '/' if !in_class => break, - _ => {} - } - - escaped = c == '\\'; - } + RawToken::Assert => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Assert))), - l.bump(); - buf.push(c); + RawToken::Namespace => { + Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Namespace))) } - Ok(l.atoms.atom(&**buf)) - })?; + RawToken::Accessor => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Accessor))), - // input is terminated without following `/` - if !self.is(b'/') { - let span = self.span(start); + RawToken::Meta => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Meta))), - return Err(Error::new(span, SyntaxError::UnterminatedRegExp)); - } - - self.bump(); // '/' - - // Spec says "It is a Syntax Error if IdentifierPart contains a Unicode escape - // sequence." TODO: check for escape + RawToken::Target => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Target))), - // Need to use `read_word` because '\uXXXX' sequences are allowed - // here (don't ask). - // let flags_start = self.input.cur_pos(); - let flags = { - match self.input.cur() { - Some(c) if c.is_ident_start() => self - .read_word_as_str_with(|l, s, _, _| l.atoms.atom(s)) - .map(Some), - _ => Ok(None), + RawToken::Satisfies => { + Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Satisfies))) } - }? 
- .map(|(value, _)| value) - .unwrap_or_default(); - Ok(Token::Regex(content, flags)) - } + RawToken::Package => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Package))), - #[cold] - fn read_shebang(&mut self) -> LexResult> { - if self.input.cur() != Some('#') || self.input.peek() != Some('!') { - return Ok(None); - } - unsafe { - // Safety: "#!" - self.input.bump(2); - } - let s = self.input.uncons_while(|c| !c.is_line_terminator()); - Ok(Some(self.atoms.atom(s))) - } - - fn read_tmpl_token(&mut self, start_of_tpl: BytePos) -> LexResult { - let start = self.input.cur_pos(); - - let mut cooked = Ok(String::new()); - let mut cooked_slice_start = start; - let raw_slice_start = start; - - macro_rules! consume_cooked { - () => {{ - if let Ok(cooked) = &mut cooked { - let last_pos = self.input.cur_pos(); - cooked.push_str(unsafe { - // Safety: Both of start and last_pos are valid position because we got them - // from `self.input` - self.input.slice(cooked_slice_start, last_pos) - }); - } - }}; - } - - while let Some(c) = self.input.cur() { - if c == '`' || (c == '$' && self.input.peek() == Some('{')) { - if start == self.input.cur_pos() && self.state.last_was_tpl_element() { - if c == '$' { - self.bump(); - self.bump(); - return Ok(tok!("${")); - } else { - self.bump(); - return Ok(tok!('`')); - } - } - - // If we don't have any escape - let cooked = if cooked_slice_start == raw_slice_start { - let last_pos = self.input.cur_pos(); - let s = unsafe { - // Safety: Both of start and last_pos are valid position because we got them - // from `self.input` - self.input.slice(cooked_slice_start, last_pos) - }; - - Ok(self.atoms.atom(s)) - } else { - consume_cooked!(); - - cooked.map(|s| self.atoms.atom(s)) - }; - - // TODO: Handle error - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: Both of start and last_pos are valid position because we got them - // from `self.input` - self.input.slice(raw_slice_start, end) - }; - return Ok(Token::Template { - 
cooked, - raw: self.atoms.atom(raw), - }); + RawToken::Protected => { + Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Protected))) } - if c == '\\' { - consume_cooked!(); - - match self.read_escaped_char(true) { - Ok(Some(chars)) => { - if let Ok(ref mut cooked) = cooked { - for c in chars { - cooked.extend(c); - } - } - } - Ok(None) => {} - Err(error) => { - cooked = Err(error); - } - } - - cooked_slice_start = self.input.cur_pos(); - } else if c.is_line_terminator() { - self.state.had_line_break = true; - - consume_cooked!(); + RawToken::Private => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Private))), - let c = if c == '\r' && self.input.peek() == Some('\n') { - self.bump(); // '\r' - '\n' - } else { - match c { - '\n' => '\n', - '\r' => '\n', - '\u{2028}' => '\u{2028}', - '\u{2029}' => '\u{2029}', - _ => unreachable!(), - } - }; - - self.bump(); - - if let Ok(ref mut cooked) = cooked { - cooked.push(c); - } - cooked_slice_start = self.input.cur_pos(); - } else { - self.bump(); - } - } - - self.error(start_of_tpl, SyntaxError::UnterminatedTpl)? + RawToken::Public => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Public))), + })) } + // /// Read a token given `.`. + // /// + // /// This is extracted as a method to reduce size of `read_token`. + // #[inline(never)] + // fn read_token_dot(&mut self) -> LexResult { + // // Check for eof + // let next = match self.input.peek()? { + // Some(next) => next, + // None => { + // unsafe { + // // Safety: cur() is Some(',') + // self.input.bump(1); + // } + // return Ok(tok!('.')); + // } + // }; + // if next.is_ascii_digit() { + // return self.read_number(true).map(|v| match v { + // Left((value, raw)) => Token::Num { value, raw }, + // Right((value, raw)) => Token::BigInt { value, raw }, + // }); + // } + + // unsafe { + // // Safety: cur() is Some + // // 1st `.` + // self.input.bump(1); + // } + + // if next == '.' 
&& self.input.peek() == Some('.') { + // unsafe { + // // Safety: peek() was Some + + // self.input.bump(2); // `..` + // } + + // return Ok(tok!("...")); + // } + + // Ok(tok!('.')) + // } + + // /// Read a token given `0`. + // /// + // /// This is extracted as a method to reduce size of `read_token`. + // #[inline(never)] + // fn read_token_zero(&mut self) -> LexResult { + // let next = self.input.peek()?; + + // let bigint = match next { + // Some('x') | Some('X') => self.read_radix_number::<16>(), + // Some('o') | Some('O') => self.read_radix_number::<8>(), + // Some('b') | Some('B') => self.read_radix_number::<2>(), + // _ => { + // return self.read_number(false).map(|v| match v { + // Left((value, raw)) => Token::Num { value, raw }, + // Right((value, raw)) => Token::BigInt { value, raw }, + // }); + // } + // }; + + // bigint.map(|v| match v { + // Left((value, raw)) => Token::Num { value, raw }, + // Right((value, raw)) => Token::BigInt { value, raw }, + // }) + // } + + // /// Read a token given `|` or `&`. + // /// + // /// This is extracted as a method to reduce size of `read_token`. 
+ // #[inline(never)] + // fn read_token_logical(&mut self, c: u8) -> LexResult { + // let had_line_break_before_last = self.had_line_break_before_last(); + // let start = self.input.cur_pos(); + + // unsafe { + // // Safety: cur() is Some(c as char) + // self.input.bump(1); + // } + // let token = if c == b'&' { + // BinOpToken::BitAnd + // } else { + // BinOpToken::BitOr + // }; + + // // '|=', '&=' + // if self.input.eat_byte(b'=') { + // return Ok(Token::AssignOp(match token { + // BinOpToken::BitAnd => AssignOp::BitAndAssign, + // BinOpToken::BitOr => AssignOp::BitOrAssign, + // _ => unreachable!(), + // })); + // } + + // // '||', '&&' + // if self.input.cur() == Some(c as char) { + // unsafe { + // // Safety: cur() is Some(c) + // self.input.bump(1); + // } + + // if self.input.cur() == Some('=') { + // unsafe { + // // Safety: cur() is Some('=') + // self.input.bump(1); + // } + // return Ok(Token::AssignOp(match token { + // BinOpToken::BitAnd => op!("&&="), + // BinOpToken::BitOr => op!("||="), + // _ => unreachable!(), + // })); + // } + + // // ||||||| + // // ^ + // if had_line_break_before_last && token == BinOpToken::BitOr && + // self.is_str("||||| ") { let span = fixed_len_span(start, + // 7); self.emit_error_span(span, SyntaxError::TS1185); + // self.skip_line_comment(5); + // self.skip_space::(); + // return self.error_span(span, SyntaxError::TS1185); + // } + + // return Ok(Token::BinOp(match token { + // BinOpToken::BitAnd => BinOpToken::LogicalAnd, + // BinOpToken::BitOr => BinOpToken::LogicalOr, + // _ => unreachable!(), + // })); + // } + + // Ok(Token::BinOp(token)) + // } + + // /// Read an escaped character for string literal. + // /// + // /// In template literal, we should preserve raw string. 
+ // fn read_escaped_char(&mut self, in_template: bool) -> + // LexResult>> { todo!(); + // // debug_assert_eq!(self.input.cur(), Some('\\')); + + // // let start = self.input.cur_pos(); + + // // self.bump(); // '\' + + // // let c = match self.input.cur()? { + // // Some(c) => c, + // // None => self.error_span(pos_span(start), + // // SyntaxError::InvalidStrEscape)?, }; + + // // macro_rules! push_c_and_ret { + // // ($c:expr) => {{ + // // $c + // // }}; + // // } + + // // let c = match c { + // // '\\' => push_c_and_ret!('\\'), + // // 'n' => push_c_and_ret!('\n'), + // // 'r' => push_c_and_ret!('\r'), + // // 't' => push_c_and_ret!('\t'), + // // 'b' => push_c_and_ret!('\u{0008}'), + // // 'v' => push_c_and_ret!('\u{000b}'), + // // 'f' => push_c_and_ret!('\u{000c}'), + // // '\r' => { + // // self.bump(); // remove '\r' + + // // self.input.eat(RawToken::NewLine); + + // // return Ok(None); + // // } + // // '\n' | '\u{2028}' | '\u{2029}' => { + // // self.bump(); + + // // return Ok(None); + // // } + + // // // read hexadecimal escape sequences + // // 'x' => { + // // self.bump(); // 'x' + + // // match self.read_int_u32::<16>(2)? { + // // Some(val) => return Ok(Some(vec![Char::from(val)])), + // // None => self.error( + // // start, + // // SyntaxError::BadCharacterEscapeSequence { + // // expected: "2 hex characters", + // // }, + // // )?, + // // } + // // } + + // // // read unicode escape sequences + // // 'u' => match self.read_unicode_escape() { + // // Ok(chars) => return Ok(Some(chars)), + // // Err(err) => self.error(start, err.into_kind())?, + // // }, + + // // // octal escape sequences + // // '0'..='7' => { + // // self.bump(); + + // // let first_c = if c == '0' { + // // match self.input.cur()? { + // // Some(next) if next.is_digit(8) => c, + // // // \0 is not an octal literal nor decimal literal. 
+ // // _ => return Ok(Some(vec!['\u{0000}'.into()])), + // // } + // // } else { + // // c + // // }; + + // // // TODO: Show template instead of strict mode + // // if in_template { + // // self.error(start, SyntaxError::LegacyOctal)? + // // } + + // // self.emit_strict_mode_error(start, + // SyntaxError::LegacyOctal); + + // // let mut value: u8 = first_c.to_digit(8).unwrap() as u8; + + // // macro_rules! one { + // // ($check:expr) => {{ + // // let cur = self.input.cur(); + + // // match cur.and_then(|c| c.to_digit(8)) { + // // Some(v) => { + // // value = if $check { + // // let new_val = value + // // .checked_mul(8) + // // .and_then(|value| + // value.checked_add(v // as u8)); match + // new_val { // Some(val) => val, + // // None => return + // // Ok(Some(vec![Char::from(value as char)])), + // // } } else { + // // value * 8 + v as u8 + // // }; + + // // self.bump(); + // // } + // // _ => return Ok(Some(vec![Char::from(value as + // // u32)])), } + // // }}; + // // } + + // // one!(false); + // // one!(true); + + // // return Ok(Some(vec![Char::from(value as char)])); + // // } + // // _ => c, + // // }; + + // // unsafe { + // // // Safety: cur() is Some(c) if this method is called. + // // self.input.bump(1); + // // } + + // // Ok(Some(vec![c.into()])) + // } + + // fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { + // let start = self.input.cur_pos(); + // let had_line_break_before_last = self.had_line_break_before_last(); + + // unsafe { + // // Safety: cur() is Some(c) if this method is called. + // self.input.bump(1); + // } + + // Ok(Some(if self.input.eat(RawToken::AssignOp)? { + // // "==" + + // if self.input.eat(RawToken::AssignOp)? { + // if c == b'!' 
{ + // Token::BinOp(BinOpToken::NotEqEq) + // } else { + // // ======= + // // ^ + // if had_line_break_before_last && self.is_str("====") { + // self.emit_error_span(fixed_len_span(start, 7), + // SyntaxError::TS1185); self.skip_line_comment(4); + // self.skip_space::(); + // return self.read_token(); + // } + + // Token::BinOp(BinOpToken::EqEqEq) + // } + // } else if c == b'!' { + // Token::BinOp(BinOpToken::NotEq) + // } else { + // Token::BinOp(BinOpToken::EqEq) + // } + // } else if c == b'=' && self.input.eat_byte(b'>') { + // // "=>" + + // Token::Arrow + // } else if c == b'!' { + // Token::Bang + // } else { + // Token::AssignOp(AssignOp::Assign) + // })) + // } + // } + + // impl Lexer<'_> { + // /// This can be used if there's no keyword starting with the first + // /// character. + // fn read_ident_unknown(&mut self) -> LexResult { + // debug_assert!(self.input.cur()?.is_some()); + + // let (word, _) = self + // .read_word_as_str_with(|l, s, _, _| + // Word::Ident(IdentLike::Other(l.atoms.atom(s))))?; + + // Ok(Word(word)) + // } + + // /// This can be used if there's no keyword starting with the first + // /// character. + // fn read_word_with( + // &mut self, + // convert: &dyn Fn(&str) -> Option, + // ) -> LexResult> { + // debug_assert!(self.input.cur()?.is_some()); + + // let start = self.input.cur_pos(); + // let (word, has_escape) = self.read_word_as_str_with(|l, s, _, + // can_be_known| { if can_be_known { + // if let Some(word) = convert(s) { + // return word; + // } + // } + + // Word::Ident(IdentLike::Other(l.atoms.atom(s))) + // })?; + + // // Note: ctx is store in lexer because of this error. + // // 'await' and 'yield' may have semantic of reserved word, which + // means lexer // should know context or parser should handle this + // error. Our approach to this // problem is former one. + // if has_escape && self.ctx.is_reserved(&word) { + // self.error( + // start, + // SyntaxError::EscapeInReservedWord { word: word.into() }, + // )? 
+ // } else { + // Ok(Some(Token::Word(word))) + // } + // } + + // /// This method is optimized for texts without escape sequences. + // /// + // /// `convert(text, has_escape, can_be_keyword)` + // fn read_word_as_str_with(&mut self, convert: F) -> + // LexResult<(Ret, bool)> where + // F: for<'any> FnOnce(&'any mut Lexer<'_>, &str, bool, bool) -> Ret, + // { + // debug_assert!(self.input.cur()?.is_some()); + // let mut first = true; + // let mut can_be_keyword = true; + // let mut slice_start = self.input.cur_pos(); + // let mut has_escape = false; + + // self.with_buf(|l, buf| { + // loop { + // if let Some(c) = l.input.cur_as_ascii() { + // // Performance optimization + // if can_be_keyword && (c.is_ascii_uppercase() || + // c.is_ascii_digit()) { can_be_keyword = false; + // } + + // if Ident::is_valid_continue(c as _) { + // l.bump(); + // continue; + // } else if first && Ident::is_valid_start(c as _) { + // l.bump(); + // first = false; + // continue; + // } + + // // unicode escape + // if c == b'\\' { + // first = false; + // has_escape = true; + // let start = l.input.cur_pos(); + // l.bump(); + + // if !l.is(b'u') { + // l.error_span(pos_span(start), + // SyntaxError::ExpectedUnicodeEscape)? 
} + + // { + // let end = l.input.cur_pos(); + // let s = unsafe { + // // Safety: start and end are valid position + // because we got them from // `self.input` + // l.input.slice(slice_start, start) + // }; + // buf.push_str(s); + // unsafe { + // // Safety: We got end from `self.input` + // l.input.reset_to(end); + // } + // } + + // let chars = l.read_unicode_escape()?; + + // if let Some(c) = chars.first() { + // let valid = if first { + // c.is_ident_start() + // } else { + // c.is_ident_part() + // }; + + // if !valid { + // l.emit_error(start, + // SyntaxError::InvalidIdentChar); } + // } + + // for c in chars { + // buf.extend(c); + // } + + // slice_start = l.input.cur_pos(); + // continue; + // } + + // // ASCII but not a valid identifier + + // break; + // } + + // if let Some(c) = l.input.cur() { + // if Ident::is_valid_continue(c) { + // l.bump(); + // continue; + // } else if first && Ident::is_valid_start(c) { + // l.bump(); + // first = false; + // continue; + // } + // } + + // break; + // } + + // let end = l.input.cur_pos(); + + // let value = if !has_escape { + // // Fast path: raw slice is enough if there's no escape. + + // let s = unsafe { + // // Safety: slice_start and end are valid position because + // we got them from // `self.input` + // l.input.slice(slice_start, end) + // }; + // let s = unsafe { + // // Safety: We don't use 'static. We just bypass the + // lifetime check. 
transmute::<&str, &'static str>(s) + // }; + + // convert(l, s, has_escape, can_be_keyword) + // } else { + // let s = unsafe { + // // Safety: slice_start and end are valid position because + // we got them from // `self.input` + // l.input.slice(slice_start, end) + // }; + // buf.push_str(s); + + // convert(l, buf, has_escape, can_be_keyword) + // }; + + // Ok((value, has_escape)) + // }) + // } + + // fn read_unicode_escape(&mut self) -> LexResult> { + // debug_assert_eq!(self.input.cur(), Some('u')); + + // let mut chars = Vec::new(); + // let mut is_curly = false; + + // self.bump(); // 'u' + + // if self.input.eat(b'{') { + // is_curly = true; + // } + + // let state = self.input.cur_pos(); + // let c = match self.read_int_u32::<16>(if is_curly { 0 } else { 4 }) { + // Ok(Some(val)) => { + // if 0x0010_ffff >= val { + // char::from_u32(val) + // } else { + // let start = self.input.cur_pos(); + + // self.error( + // start, + // SyntaxError::BadCharacterEscapeSequence { + // expected: if is_curly { + // "1-6 hex characters in the range 0 to + // 10FFFF." } else { + // "4 hex characters" + // }, + // }, + // )? + // } + // } + // _ => { + // let start = self.input.cur_pos(); + + // self.error( + // start, + // SyntaxError::BadCharacterEscapeSequence { + // expected: if is_curly { + // "1-6 hex characters" + // } else { + // "4 hex characters" + // }, + // }, + // )? 
+ // } + // }; + + // match c { + // Some(c) => { + // chars.push(c.into()); + // } + // _ => { + // unsafe { + // // Safety: state is valid position because we got it from + // cur_pos() self.input.reset_to(state); + // } + + // chars.push(Char::from('\\')); + // chars.push(Char::from('u')); + + // if is_curly { + // chars.push(Char::from('{')); + + // for _ in 0..6 { + // if let Some(c) = self.input.cur() { + // if c == '}' { + // break; + // } + + // self.bump(); + + // chars.push(Char::from(c)); + // } else { + // break; + // } + // } + + // chars.push(Char::from('}')); + // } else { + // for _ in 0..4 { + // if let Some(c) = self.input.cur()? { + // self.bump(); + + // chars.push(Char::from(c)); + // } + // } + // } + // } + // } + + // if is_curly && !self.input.eat(b'}') { + // self.error(state, SyntaxError::InvalidUnicodeEscape)? + // } + + // Ok(chars) + // } + + // /// See https://tc39.github.io/ecma262/#sec-literals-string-literals + // fn read_str_lit(&mut self) -> LexResult { + // debug_assert!(self.input.cur()? == Some('\'') || self.input.cur()? 
== + // Some('"')); let start = self.input.cur_pos(); + // let quote = self.input.cur()?.unwrap() as u8; + + // self.bump(); // '"' + + // let mut has_escape = false; + // let mut slice_start = self.input.cur_pos(); + + // self.with_buf(|l, buf| { + // loop { + // if let Some(c) = l.input.cur_as_ascii() { + // if c == quote { + // let value_end = l.input.cur_pos(); + + // let value = if !has_escape { + // let s = unsafe { + // // Safety: slice_start and value_end are + // valid position because we // got them + // from `self.input` + // l.input.slice(slice_start, value_end) }; + + // l.atoms.atom(s) + // } else { + // let s = unsafe { + // // Safety: slice_start and value_end are + // valid position because we // got them + // from `self.input` + // l.input.slice(slice_start, value_end) }; + // buf.push_str(s); + + // l.atoms.atom(&**buf) + // }; + + // unsafe { + // // Safety: cur is quote + // l.input.bump(1); + // } + + // let end = l.input.cur_pos(); + + // let raw = unsafe { + // // Safety: start and end are valid position + // because we got them from // `self.input` + // l.input.slice(start, end) + // }; + // let raw = l.atoms.atom(raw); + + // return Ok(Token::Str { value, raw }); + // } + + // if c == b'\\' { + // has_escape = true; + + // { + // let end = l.input.cur_pos(); + // let s = unsafe { + // // Safety: start and end are valid position + // because we got them from // `self.input` + // l.input.slice(slice_start, end) + // }; + // buf.push_str(s); + // } + + // if let Some(chars) = l.read_escaped_char(false)? { + // for c in chars { + // buf.extend(c); + // } + // } + + // slice_start = l.input.cur_pos(); + // continue; + // } + + // if (c as char).is_line_break() { + // break; + // } + + // unsafe { + // // Safety: cur is a ascii character + // l.input.bump(1); + // } + // continue; + // } + + // match l.input.cur()? 
{ + // Some(c) => { + // if c.is_line_break() { + // break; + // } + // unsafe { + // // Safety: cur is Some(c) + // l.input.bump(1); + // } + // } + // None => break, + // } + // } + + // { + // let end = l.input.cur_pos(); + // let s = unsafe { + // // Safety: start and end are valid position because we + // got them from // `self.input` + // l.input.slice(slice_start, end) + // }; + // buf.push_str(s); + // } + + // l.emit_error(start, SyntaxError::UnterminatedStrLit); + + // let end = l.input.cur_pos(); + + // let raw = unsafe { + // // Safety: start and end are valid position because we got + // them from // `self.input` + // l.input.slice(start, end) + // }; + // Ok(Token::Str { + // value: l.atoms.atom(&*buf), + // raw: l.atoms.atom(raw), + // }) + // }) + // } + + // /// Expects current char to be '/' + // fn read_regexp(&mut self, start: BytePos) -> LexResult { + // unsafe { + // // Safety: start is valid position, and cur() is Some('/') + // self.input.reset_to(start); + // } + + // debug_assert_eq!(self.input.cur()?, Some('/')); + + // let start = self.input.cur_pos(); + + // self.bump(); + + // let (mut escaped, mut in_class) = (false, false); + + // let content = self.with_buf(|l, buf| { + // while let Some(c) = l.input.cur() { + // // This is ported from babel. + // // Seems like regexp literal cannot contain linebreak. 
+ // if c.is_line_terminator() { + // let span = l.span(start); + + // return Err(Error::new(span, + // SyntaxError::UnterminatedRegExp)); } + + // if escaped { + // escaped = false; + // } else { + // match c { + // '[' => in_class = true, + // ']' if in_class => in_class = false, + // // Terminates content part of regex literal + // '/' if !in_class => break, + // _ => {} + // } + + // escaped = c == '\\'; + // } + + // l.bump(); + // buf.push(c); + // } + + // Ok(l.atoms.atom(&**buf)) + // })?; + + // // input is terminated without following `/` + // if !self.is(b'/') { + // let span = self.span(start); + + // return Err(Error::new(span, SyntaxError::UnterminatedRegExp)); + // } + + // self.bump(); // '/' + + // // Spec says "It is a Syntax Error if IdentifierPart contains a + // Unicode escape // sequence." TODO: check for escape + + // // Need to use `read_word` because '\uXXXX' sequences are allowed + // // here (don't ask). + // // let flags_start = self.input.cur_pos(); + // let flags = { + // match self.input.cur() { + // Some(c) if c.is_ident_start() => self + // .read_word_as_str_with(|l, s, _, _| l.atoms.atom(s)) + // .map(Some), + // _ => Ok(None), + // } + // }? + // .map(|(value, _)| value) + // .unwrap_or_default(); + + // Ok(Token::Regex(content, flags)) + // } + + // fn read_tmpl_token(&mut self, start_of_tpl: BytePos) -> LexResult + // { let start = self.input.cur_pos(); + + // let mut cooked = Ok(String::new()); + // let mut cooked_slice_start = start; + // let raw_slice_start = start; + + // macro_rules! consume_cooked { + // () => {{ + // if let Ok(cooked) = &mut cooked { + // let last_pos = self.input.cur_pos(); + // cooked.push_str(unsafe { + // // Safety: Both of start and last_pos are valid + // position because we got them // from `self.input` + // self.input.slice(cooked_slice_start, last_pos) + // }); + // } + // }}; + // } + + // while let Some(c) = self.input.cur()? { + // if c == '`' || (c == '$' && self.input.peek()? 
== Some('{')) { + // if start == self.input.cur_pos() && + // self.state.last_was_tpl_element() { if c == '$' { + // self.bump(); + // self.bump(); + // return Ok(tok!("${")); + // } else { + // self.bump(); + // return Ok(tok!('`')); + // } + // } + + // // If we don't have any escape + // let cooked = if cooked_slice_start == raw_slice_start { + // let last_pos = self.input.cur_pos(); + // let s = unsafe { + // // Safety: Both of start and last_pos are valid + // position because we got them // from `self.input` + // self.input.slice(cooked_slice_start, last_pos) + // }; + + // Ok(self.atoms.atom(s)) + // } else { + // consume_cooked!(); + + // cooked.map(|s| self.atoms.atom(s)) + // }; + + // // TODO: Handle error + // let end = self.input.cur_pos(); + // let raw = unsafe { + // // Safety: Both of start and last_pos are valid position + // because we got them // from `self.input` + // self.input.slice(raw_slice_start, end) + // }; + // return Ok(Token::Template { + // cooked, + // raw: self.atoms.atom(raw), + // }); + // } + + // if c == '\\' { + // consume_cooked!(); + + // match self.read_escaped_char(true) { + // Ok(Some(chars)) => { + // if let Ok(ref mut cooked) = cooked { + // for c in chars { + // cooked.extend(c); + // } + // } + // } + // Ok(None) => {} + // Err(error) => { + // cooked = Err(error); + // } + // } + + // cooked_slice_start = self.input.cur_pos(); + // } else if c.is_line_terminator() { + // self.state.had_line_break = true; + + // consume_cooked!(); + + // let c = if c == '\r' && self.input.peek() == Some('\n') { + // self.bump(); // '\r' + // '\n' + // } else { + // match c { + // '\n' => '\n', + // '\r' => '\n', + // '\u{2028}' => '\u{2028}', + // '\u{2029}' => '\u{2029}', + // _ => unreachable!(), + // } + // }; + + // self.bump(); + + // if let Ok(ref mut cooked) = cooked { + // cooked.push(c); + // } + // cooked_slice_start = self.input.cur_pos(); + // } else { + // self.bump(); + // } + // } + + // self.error(start_of_tpl, 
SyntaxError::UnterminatedTpl)? + // } + #[inline] #[allow(clippy::misnamed_getters)] pub fn had_line_break_before_last(&self) -> bool { diff --git a/crates/swc_ecma_parser/src/lexer/number.rs b/crates/swc_ecma_parser/src/lexer/number.rs index 1745eb75b5ea..b60397bbeed6 100644 --- a/crates/swc_ecma_parser/src/lexer/number.rs +++ b/crates/swc_ecma_parser/src/lexer/number.rs @@ -27,811 +27,855 @@ impl LazyBigInt { } } -impl Lexer<'_> { - /// Reads an integer, octal integer, or floating-point number - pub(super) fn read_number( - &mut self, - starts_with_dot: bool, - ) -> LexResult, Atom)>> { - debug_assert!(self.input.cur()?.is_some()); - - if starts_with_dot { - debug_assert_eq!( - self.input.cur(), - Some('.'), - "read_number(starts_with_dot = true) expects current char to be '.'" - ); - } - - let start = self.input.cur_pos(); - - let val = if starts_with_dot { - // first char is '.' - 0f64 - } else { - let starts_with_zero = self.input.cur().unwrap() == '0'; - - // Use read_number_no_dot to support long numbers. - let (val, s, not_octal) = self.read_number_no_dot_as_str::<10>()?; - - if self.input.eat(b'n') { - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: We got both start and end position from `self.input` - self.input.slice(start, end) - }; - - return Ok(Either::Right(( - Box::new(s.into_value()), - self.atoms.atom(raw), - ))); - } - - if starts_with_zero { - // TODO: I guess it would be okay if I don't use -ffast-math - // (or something like that), but needs review. - if val == 0.0f64 { - // If only one zero is used, it's decimal. - // And if multiple zero is used, it's octal. - // - // e.g. `0` is decimal (so it can be part of float) - // - // e.g. 
`000` is octal - if start.0 != self.last_pos().0 - 1 { - // `-1` is utf 8 length of `0` - - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: We got both start and end position from `self.input` - self.input.slice(start, end) - }; - let raw = self.atoms.atom(raw); - return self - .make_legacy_octal(start, 0f64) - .map(|value| Either::Left((value, raw))); - } - } else { - // strict mode hates non-zero decimals starting with zero. - // e.g. 08.1 is strict mode violation but 0.1 is valid float. - - if val.fract() == 0.0 { - let val_str = &s.value; - - // if it contains '8' or '9', it's decimal. - if not_octal { - // Continue parsing - self.emit_strict_mode_error(start, SyntaxError::LegacyDecimal); - } else { - // It's Legacy octal, and we should reinterpret value. - let val = BigIntValue::from_str_radix(val_str, 8) - .unwrap_or_else(|err| { - panic!( - "failed to parse {} using `from_str_radix`: {:?}", - val_str, err - ) - }) - .to_f64() - .unwrap_or_else(|| { - panic!("failed to parse {} into float using BigInt", val_str) - }); - - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: We got both start and end position from `self.input` - self.input.slice(start, end) - }; - let raw = self.atoms.atom(raw); - - return self - .make_legacy_octal(start, val) - .map(|value| Either::Left((value, raw))); - } - } - } - } - - val - }; - - // At this point, number cannot be an octal literal. - - let mut val: f64 = val; - - // `0.a`, `08.a`, `102.a` are invalid. 
- // - // `.1.a`, `.1e-4.a` are valid, - if self.input.cur() == Some('.') { - self.bump(); - - if starts_with_dot { - debug_assert!(self.input.cur()?.is_some()); - debug_assert!(self.input.cur().unwrap().is_ascii_digit()); - } - - // Read numbers after dot - self.read_int::<10>(0)?; - - val = { - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: We got both start and end position from `self.input` - self.input.slice(start, end) - }; - - // Remove number separator from number - if raw.contains('_') { - Cow::Owned(raw.replace('_', "")) - } else { - Cow::Borrowed(raw) - } - .parse() - .expect("failed to parse float using rust's impl") - }; - } - - // Handle 'e' and 'E' - // - // .5e1 = 5 - // 1e2 = 100 - // 1e+2 = 100 - // 1e-2 = 0.01 - match self.input.cur() { - Some('e') | Some('E') => { - self.bump(); - - let next = match self.input.cur() { - Some(next) => next, - None => { - let pos = self.input.cur_pos(); - self.error(pos, SyntaxError::NumLitTerminatedWithExp)? - } - }; - - let positive = if next == '+' || next == '-' { - self.bump(); // remove '+', '-' - - next == '+' - } else { - true - }; - - let exp = self.read_number_no_dot::<10>()?; - - val = if exp == f64::INFINITY { - if positive && val != 0.0 { - f64::INFINITY - } else { - 0.0 - } - } else { - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: We got both start and end position from `self.input` - self.input.slice(start, end) - }; - - if raw.contains('_') { - Cow::Owned(raw.replace('_', "")) - } else { - Cow::Borrowed(raw) - } - .parse() - .expect("failed to parse float literal") - } - } - _ => {} - } - - self.ensure_not_ident()?; - - let end = self.input.cur_pos(); - let raw_str = unsafe { - // Safety: We got both start and end position from `self.input` - self.input.slice(start, end) - }; - Ok(Either::Left((val, raw_str.into()))) - } - - /// Returns `Left(value)` or `Right(BigInt)` - pub(super) fn read_radix_number( - &mut self, - ) -> LexResult, Atom)>> { - 
debug_assert!( - RADIX == 2 || RADIX == 8 || RADIX == 16, - "radix should be one of 2, 8, 16, but got {}", - RADIX - ); - debug_assert_eq!(self.input.cur(), Some('0')); - - let start = self.input.cur_pos(); - - self.bump(); - - match self.input.cur()? { - Some(..) => { - self.bump(); - } - _ => { - unreachable!(); - } - } - - let (val, s, _) = self.read_number_no_dot_as_str::()?; - - if self.input.eat(b'n') { - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: We got both start and end position from `self.input` - self.input.slice(start, end) - }; - - return Ok(Either::Right(( - Box::new(s.into_value()), - self.atoms.atom(raw), - ))); - } - - self.ensure_not_ident()?; - - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: We got both start and end position from `self.input` - self.input.slice(start, end) - }; - - Ok(Either::Left((val, self.atoms.atom(raw)))) - } - - /// This can read long integers like - /// "13612536612375123612312312312312312312312". - fn read_number_no_dot(&mut self) -> LexResult { - debug_assert!( - RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, - "radix for read_number_no_dot should be one of 2, 8, 10, 16, but got {}", - RADIX - ); - let start = self.input.cur_pos(); - - let mut read_any = false; - - let res = self.read_digits::<_, f64, RADIX>( - |total, radix, v| { - read_any = true; - - Ok((f64::mul_add(total, radix as f64, v as f64), true)) - }, - true, - ); - - if !read_any { - self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?; - } - res - } - - /// This can read long integers like - /// "13612536612375123612312312312312312312312". - /// - /// - Returned `bool` is `true` is there was `8` or `9`. 
- fn read_number_no_dot_as_str( - &mut self, - ) -> LexResult<(f64, LazyBigInt, bool)> { - debug_assert!( - RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, - "radix for read_number_no_dot should be one of 2, 8, 10, 16, but got {}", - RADIX - ); - let start = self.input.cur_pos(); - - let mut non_octal = false; - let mut read_any = false; - - self.read_digits::<_, f64, RADIX>( - |total, radix, v| { - read_any = true; - - if v == 8 || v == 9 { - non_octal = true; - } - - Ok((f64::mul_add(total, radix as f64, v as f64), true)) - }, - true, - )?; - - if !read_any { - self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?; - } - - let end = self.input.cur_pos(); - let raw = unsafe { - // Safety: We got both start and end position from `self.input` - self.input.slice(start, end) - }; - // Remove number separator from number - let raw_number_str = raw.replace('_', ""); - let parsed_float = BigIntValue::from_str_radix(&raw_number_str, RADIX as u32) - .expect("failed to parse float using BigInt") - .to_f64() - .expect("failed to parse float using BigInt"); - Ok((parsed_float, LazyBigInt::new(raw_number_str), non_octal)) - } - - /// Ensure that ident cannot directly follow numbers. - fn ensure_not_ident(&mut self) -> LexResult<()> { - match self.input.cur() { - Some(c) if c.is_ident_start() => { - let span = pos_span(self.input.cur_pos()); - self.error_span(span, SyntaxError::IdentAfterNum)? - } - _ => Ok(()), - } - } - - /// Read an integer in the given radix. Return `None` if zero digits - /// were read, the integer value otherwise. - /// When `len` is not zero, this - /// will return `None` unless the integer has exactly `len` digits. 
- pub(super) fn read_int(&mut self, len: u8) -> LexResult> { - let mut count = 0u16; - let v = self.read_digits::<_, Option, RADIX>( - |opt: Option, radix, val| { - count += 1; - let total = opt.unwrap_or_default() * radix as f64 + val as f64; - - Ok((Some(total), count != len as u16)) - }, - true, - )?; - if len != 0 && count != len as u16 { - Ok(None) - } else { - Ok(v) - } - } - - pub(super) fn read_int_u32(&mut self, len: u8) -> LexResult> { - let start = self.state.start; - - let mut count = 0; - let v = self.read_digits::<_, Option, RADIX>( - |opt: Option, radix, val| { - count += 1; - - let total = opt - .unwrap_or_default() - .checked_mul(radix as u32) - .and_then(|v| v.checked_add(val)) - .ok_or_else(|| { - let span = Span::new(start, start); - Error::new(span, SyntaxError::InvalidUnicodeEscape) - })?; - - Ok((Some(total), count != len)) - }, - true, - )?; - if len != 0 && count != len { - Ok(None) - } else { - Ok(v) - } - } - - /// `op`- |total, radix, value| -> (total * radix + value, continue) - fn read_digits( - &mut self, - mut op: F, - allow_num_separator: bool, - ) -> LexResult - where - F: FnMut(Ret, u8, u32) -> LexResult<(Ret, bool)>, - Ret: Copy + Default, - { - debug_assert!( - RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, - "radix for read_int should be one of 2, 8, 10, 16, but got {}", - RADIX - ); - - if cfg!(feature = "debug") { - trace!( - "read_digits(radix = {}), cur = {:?}", - RADIX, - self.input.cur() - ); - } - - let start = self.input.cur_pos(); - let mut total: Ret = Default::default(); - let mut prev = None; - - while let Some(c) = self.input.cur() { - if allow_num_separator && c == '_' { - let is_allowed = |c: Option| { - if c.is_none() { - return false; - } - - let c = c.unwrap(); - - c.is_digit(RADIX as _) - }; - let is_forbidden = |c: Option| { - if c.is_none() { - return true; - } - - if RADIX == 16 { - matches!(c.unwrap(), '.' | 'X' | '_' | 'x') - } else { - matches!(c.unwrap(), '.' 
| 'B' | 'E' | 'O' | '_' | 'b' | 'e' | 'o') - } - }; - - let next = self.input.peek(); - - if !is_allowed(next) || is_forbidden(prev) || is_forbidden(next) { - self.emit_error( - start, - SyntaxError::NumericSeparatorIsAllowedOnlyBetweenTwoDigits, - ); - } - - // Ignore this _ character - unsafe { - // Safety: cur() returns Some(c) where c is a valid char - self.input.bump(1); - } - - continue; - } - - // e.g. (val for a) = 10 where radix = 16 - let val = if let Some(val) = c.to_digit(RADIX as _) { - val - } else { - return Ok(total); - }; - - self.bump(); - - let (t, cont) = op(total, RADIX, val)?; - - total = t; - - if !cont { - return Ok(total); - } - - prev = Some(c); - } - - Ok(total) - } - - fn make_legacy_octal(&mut self, start: BytePos, val: f64) -> LexResult { - self.ensure_not_ident()?; - - if self.syntax.typescript() && self.target >= EsVersion::Es5 { - self.emit_error(start, SyntaxError::TS1085); - } - - self.emit_strict_mode_error(start, SyntaxError::LegacyOctal); - - Ok(val) - } -} - -#[cfg(test)] -mod tests { - use std::panic; - - use super::*; - - fn lex(s: &'static str, f: F) -> Ret - where - F: FnOnce(&mut Lexer<'_>) -> Ret, - { - crate::with_test_sess(s, |_, input| { - let mut l = Lexer::new( - Syntax::Es(Default::default()), - Default::default(), - input, - None, - ); - let ret = f(&mut l); - assert_eq!(l.input.cur(), None); - Ok(ret) - }) - .unwrap() - } - - fn num(s: &'static str) -> (f64, Atom) { - lex(s, |l| { - l.read_number(s.starts_with('.')).unwrap().left().unwrap() - }) - } - - fn int(s: &'static str) -> u32 { - lex(s, |l| { - l.read_int_u32::(0) - .unwrap() - .expect("read_int returned None") - }) - } - - const LONG: &str = "1e10000000000000000000000000000000000000000\ - 0000000000000000000000000000000000000000000000000000"; - #[test] - fn num_inf() { - assert_eq!(num(LONG), (f64::INFINITY, LONG.into())); - } - - /// Number >= 2^53 - #[test] - fn num_big_exp() { - assert_eq!((1e30, "1e30".into()), num("1e30")); - } - - #[test] - fn 
num_very_big_exp() { - const LARGE_POSITIVE_EXP: &str = - "1e100000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000"; - const LARGE_NEGATIVE_EXP: &str = - "1e-10000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 000000000000000000000000000000000000000000000000000000"; - const ZERO_WITH_LARGE_POSITIVE_EXP: &str = - "0e100000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000"; - const ZERO_WITH_LARGE_NEGATIVE_EXP: &str = - "0e-10000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 000000000000000000000000000000000000000000000000000000"; - const LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP: &str = - "10000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 000000000000000000000000000000000000000000000000000000\ - e-100000000000000000000000000000000000000000000000000000000000000\ - 
00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 00000000000000000000000000000000000000000000000000000000000000000\ - 000000000000000000000000000000000000000000000000000000"; - - assert_eq!( - num(LARGE_POSITIVE_EXP), - (f64::INFINITY, LARGE_POSITIVE_EXP.into()) - ); - assert_eq!(num(LARGE_NEGATIVE_EXP), (0.0, LARGE_NEGATIVE_EXP.into())); - assert_eq!( - num(ZERO_WITH_LARGE_POSITIVE_EXP), - (0.0, ZERO_WITH_LARGE_POSITIVE_EXP.into()) - ); - assert_eq!( - num(ZERO_WITH_LARGE_NEGATIVE_EXP), - (0.0, ZERO_WITH_LARGE_NEGATIVE_EXP.into()) - ); - assert_eq!( - num(LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP), - (0.0, LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP.into()) - ); - } - - #[test] - fn num_big_many_zero() { - assert_eq!( - ( - 1_000_000_000_000_000_000_000_000_000_000f64, - "1000000000000000000000000000000".into() - ), - num("1000000000000000000000000000000") - ); - assert_eq!( - (3.402_823_466_385_288_6e38, "34028234663852886e22".into()), - num("34028234663852886e22"), - ); - } - - #[test] - fn big_number_with_fract() { - assert_eq!( - (77777777777777777.1f64, "77777777777777777.1".into()), - num("77777777777777777.1") - ) - } - - #[test] - fn issue_480() { - assert_eq!((9.09, "9.09".into()), num("9.09")) - } - - #[test] - fn num_legacy_octal() { - assert_eq!((0o12 as f64, "0012".into()), num("0012")); - assert_eq!((10f64, "012".into()), num("012")); - } - - #[test] - fn read_int_1() { - assert_eq!(60, int::<10>("60")); - assert_eq!(0o73, int::<8>("73")); - } - - #[test] - fn read_int_short() { - assert_eq!(7, int::<10>("7")); - assert_eq!(10, int::<10>("10")); - } - - #[test] - fn read_radix_number() { - assert_eq!( - (0o73 as f64, "0o73".into()), - lex("0o73", |l| l - .read_radix_number::<8>() - .unwrap() - .left() - .unwrap()) - ); - } - - #[test] - fn read_num_sep() { - assert_eq!(1_000, int::<10>("1_000")); - assert_eq!(0xaebece, int::<16>("AE_BE_CE")); - 
assert_eq!(0b1010000110000101, int::<2>("1010_0001_1000_0101")); - assert_eq!(0o0666, int::<8>("0_6_6_6")); - } - - #[test] - fn read_bigint() { - assert_eq!( - lex( - "10000000000000000000000000000000000000000000000000000n", - |l| l.read_number(false).unwrap().right().unwrap() - ), - ( - Box::new( - "10000000000000000000000000000000000000000000000000000" - .parse::() - .unwrap() - ), - Atom::from("10000000000000000000000000000000000000000000000000000n") - ), - ); - } - - #[test] - fn large_bin_number() { - const LONG: &str = - "0B11111111111111111111111111111111111111111111111101001010100000010111110001111111111"; - const VERY_LARGE_BINARY_NUMBER: &str = - "0B1111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 111111111111111111111111111111111111111111111111111111111111111111\ - 0010111110001111111111"; - assert_eq!( - lex(LONG, |l| l - .read_radix_number::<2>() - .unwrap() - .left() - .unwrap()), - (9.671_406_556_917_009e24, LONG.into()) - ); - assert_eq!( - lex(VERY_LARGE_BINARY_NUMBER, |l| l - .read_radix_number::<2>() - .unwrap() - .left() - 
.unwrap()), - (1.0972248137587377e304, VERY_LARGE_BINARY_NUMBER.into()) - ); - } - - #[test] - fn large_float_number() { - const LONG: &str = "9.671406556917009e+24"; - - assert_eq!(num(LONG), (9.671_406_556_917_009e24, LONG.into())); - } - - /// Valid even on strict mode. - const VALID_CASES: &[&str] = &[".0", "0.e-1", "0e8", ".8e1", "0.8e1", "1.18e1"]; - const INVALID_CASES_ON_STRICT: &[&str] = &["08e1", "08.1", "08.8e1", "08", "01"]; - const INVALID_CASES: &[&str] = &["01.8e1", "012e1", "00e1", "00.0"]; - - fn test_floats(strict: bool, success: bool, cases: &'static [&'static str]) { - for case in cases { - println!( - "Testing {} (when strict = {}); Expects success = {}", - case, strict, success - ); - // lazy way to get expected values - let expected: f64 = (i64::from_str_radix(case, 8).map(|v| v as f64)) - .or_else(|_| case.parse::().map(|v| v as f64)) - .or_else(|_| case.parse::()) - .unwrap_or_else(|err| { - panic!( - "failed to parse '{}' as float using str.parse(): {}", - case, err - ) - }); - - let vec = panic::catch_unwind(|| { - crate::with_test_sess(case, |_, input| { - let mut l = Lexer::new(Syntax::default(), Default::default(), input, None); - l.ctx.strict = strict; - Ok(l.map(|ts| ts.token).collect::>()) - }) - .unwrap() - }); - - if success { - let vec = match vec { - Ok(vec) => vec, - Err(err) => panic::resume_unwind(err), - }; - - assert_eq!(vec.len(), 1); - - let token = vec.into_iter().next().unwrap(); - let value = match token { - Token::Num { value, .. 
} => value, - _ => { - panic!("expected num token in test") - } - }; - - assert_eq!(expected, value); - } else if let Ok(vec) = vec { - assert_ne!( - vec![Token::Num { - value: expected, - raw: expected.to_string().into() - }], - vec - ) - } - } - } - - // #[test] - // fn strict_mode() { - // test_floats(true, true, VALID_CASES); - // test_floats(true, false, INVALID_CASES_ON_STRICT); - // test_floats(true, false, INVALID_CASES); - // } - - #[test] - fn non_strict() { - test_floats(false, true, VALID_CASES); - test_floats(false, true, INVALID_CASES_ON_STRICT); - test_floats(false, false, INVALID_CASES); - } -} +// impl Lexer<'_> { +// /// Reads an integer, octal integer, or floating-point number +// pub(super) fn read_number( +// &mut self, +// starts_with_dot: bool, +// ) -> LexResult, Atom)>> { +// debug_assert!(self.input.cur()?.is_some()); + +// if starts_with_dot { +// debug_assert_eq!( +// self.input.cur(), +// Some('.'), +// "read_number(starts_with_dot = true) expects current char to +// be '.'" ); +// } + +// let start = self.input.cur_pos(); + +// let val = if starts_with_dot { +// // first char is '.' +// 0f64 +// } else { +// let starts_with_zero = self.input.cur().unwrap() == '0'; + +// // Use read_number_no_dot to support long numbers. +// let (val, s, not_octal) = +// self.read_number_no_dot_as_str::<10>()?; + +// if self.input.eat(b'n') { +// let end = self.input.cur_pos(); +// let raw = unsafe { +// // Safety: We got both start and end position from +// `self.input` self.input.slice(start, end) +// }; + +// return Ok(Either::Right(( +// Box::new(s.into_value()), +// self.atoms.atom(raw), +// ))); +// } + +// if starts_with_zero { +// // TODO: I guess it would be okay if I don't use -ffast-math +// // (or something like that), but needs review. +// if val == 0.0f64 { +// // If only one zero is used, it's decimal. +// // And if multiple zero is used, it's octal. +// // +// // e.g. `0` is decimal (so it can be part of float) +// // +// // e.g. 
`000` is octal +// if start.0 != self.last_pos().0 - 1 { +// // `-1` is utf 8 length of `0` + +// let end = self.input.cur_pos(); +// let raw = unsafe { +// // Safety: We got both start and end position +// from `self.input` self.input.slice(start, end) +// }; +// let raw = self.atoms.atom(raw); +// return self +// .make_legacy_octal(start, 0f64) +// .map(|value| Either::Left((value, raw))); +// } +// } else { +// // strict mode hates non-zero decimals starting with +// zero. // e.g. 08.1 is strict mode violation but 0.1 is +// valid float. + +// if val.fract() == 0.0 { +// let val_str = &s.value; + +// // if it contains '8' or '9', it's decimal. +// if not_octal { +// // Continue parsing +// self.emit_strict_mode_error(start, +// SyntaxError::LegacyDecimal); } else { +// // It's Legacy octal, and we should reinterpret +// value. let val = +// BigIntValue::from_str_radix(val_str, 8) +// .unwrap_or_else(|err| { panic!( +// "failed to parse {} using +// `from_str_radix`: {:?}", val_str, err +// ) +// }) +// .to_f64() +// .unwrap_or_else(|| { +// panic!("failed to parse {} into float +// using BigInt", val_str) }); + +// let end = self.input.cur_pos(); +// let raw = unsafe { +// // Safety: We got both start and end position +// from `self.input` self.input.slice(start, +// end) }; +// let raw = self.atoms.atom(raw); + +// return self +// .make_legacy_octal(start, val) +// .map(|value| Either::Left((value, raw))); +// } +// } +// } +// } + +// val +// }; + +// // At this point, number cannot be an octal literal. + +// let mut val: f64 = val; + +// // `0.a`, `08.a`, `102.a` are invalid. 
+// // +// // `.1.a`, `.1e-4.a` are valid, +// if self.input.cur() == Some('.') { +// self.bump(); + +// if starts_with_dot { +// debug_assert!(self.input.cur()?.is_some()); +// debug_assert!(self.input.cur().unwrap().is_ascii_digit()); +// } + +// // Read numbers after dot +// self.read_int::<10>(0)?; + +// val = { +// let end = self.input.cur_pos(); +// let raw = unsafe { +// // Safety: We got both start and end position from +// `self.input` self.input.slice(start, end) +// }; + +// // Remove number separator from number +// if raw.contains('_') { +// Cow::Owned(raw.replace('_', "")) +// } else { +// Cow::Borrowed(raw) +// } +// .parse() +// .expect("failed to parse float using rust's impl") +// }; +// } + +// // Handle 'e' and 'E' +// // +// // .5e1 = 5 +// // 1e2 = 100 +// // 1e+2 = 100 +// // 1e-2 = 0.01 +// match self.input.cur() { +// Some('e') | Some('E') => { +// self.bump(); + +// let next = match self.input.cur() { +// Some(next) => next, +// None => { +// let pos = self.input.cur_pos(); +// self.error(pos, +// SyntaxError::NumLitTerminatedWithExp)? 
} +// }; + +// let positive = if next == '+' || next == '-' { +// self.bump(); // remove '+', '-' + +// next == '+' +// } else { +// true +// }; + +// let exp = self.read_number_no_dot::<10>()?; + +// val = if exp == f64::INFINITY { +// if positive && val != 0.0 { +// f64::INFINITY +// } else { +// 0.0 +// } +// } else { +// let end = self.input.cur_pos(); +// let raw = unsafe { +// // Safety: We got both start and end position from +// `self.input` self.input.slice(start, end) +// }; + +// if raw.contains('_') { +// Cow::Owned(raw.replace('_', "")) +// } else { +// Cow::Borrowed(raw) +// } +// .parse() +// .expect("failed to parse float literal") +// } +// } +// _ => {} +// } + +// self.ensure_not_ident()?; + +// let end = self.input.cur_pos(); +// let raw_str = unsafe { +// // Safety: We got both start and end position from `self.input` +// self.input.slice(start, end) +// }; +// Ok(Either::Left((val, raw_str.into()))) +// } + +// /// Returns `Left(value)` or `Right(BigInt)` +// pub(super) fn read_radix_number( +// &mut self, +// ) -> LexResult, Atom)>> { +// debug_assert!( +// RADIX == 2 || RADIX == 8 || RADIX == 16, +// "radix should be one of 2, 8, 16, but got {}", +// RADIX +// ); +// debug_assert_eq!(self.input.cur(), Some('0')); + +// let start = self.input.cur_pos(); + +// self.bump(); + +// match self.input.cur()? { +// Some(..) 
=> { +// self.bump(); +// } +// _ => { +// unreachable!(); +// } +// } + +// let (val, s, _) = self.read_number_no_dot_as_str::()?; + +// if self.input.eat(b'n') { +// let end = self.input.cur_pos(); +// let raw = unsafe { +// // Safety: We got both start and end position from +// `self.input` self.input.slice(start, end) +// }; + +// return Ok(Either::Right(( +// Box::new(s.into_value()), +// self.atoms.atom(raw), +// ))); +// } + +// self.ensure_not_ident()?; + +// let end = self.input.cur_pos(); +// let raw = unsafe { +// // Safety: We got both start and end position from `self.input` +// self.input.slice(start, end) +// }; + +// Ok(Either::Left((val, self.atoms.atom(raw)))) +// } + +// /// This can read long integers like +// /// "13612536612375123612312312312312312312312". +// fn read_number_no_dot(&mut self) -> LexResult { +// debug_assert!( +// RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, +// "radix for read_number_no_dot should be one of 2, 8, 10, 16, but +// got {}", RADIX +// ); +// let start = self.input.cur_pos(); + +// let mut read_any = false; + +// let res = self.read_digits::<_, f64, RADIX>( +// |total, radix, v| { +// read_any = true; + +// Ok((f64::mul_add(total, radix as f64, v as f64), true)) +// }, +// true, +// ); + +// if !read_any { +// self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?; +// } +// res +// } + +// /// This can read long integers like +// /// "13612536612375123612312312312312312312312". +// /// +// /// - Returned `bool` is `true` is there was `8` or `9`. 
+// fn read_number_no_dot_as_str( +// &mut self, +// ) -> LexResult<(f64, LazyBigInt, bool)> { +// debug_assert!( +// RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, +// "radix for read_number_no_dot should be one of 2, 8, 10, 16, but +// got {}", RADIX +// ); +// let start = self.input.cur_pos(); + +// let mut non_octal = false; +// let mut read_any = false; + +// self.read_digits::<_, f64, RADIX>( +// |total, radix, v| { +// read_any = true; + +// if v == 8 || v == 9 { +// non_octal = true; +// } + +// Ok((f64::mul_add(total, radix as f64, v as f64), true)) +// }, +// true, +// )?; + +// if !read_any { +// self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?; +// } + +// let end = self.input.cur_pos(); +// let raw = unsafe { +// // Safety: We got both start and end position from `self.input` +// self.input.slice(start, end) +// }; +// // Remove number separator from number +// let raw_number_str = raw.replace('_', ""); +// let parsed_float = BigIntValue::from_str_radix(&raw_number_str, RADIX +// as u32) .expect("failed to parse float using BigInt") +// .to_f64() +// .expect("failed to parse float using BigInt"); +// Ok((parsed_float, LazyBigInt::new(raw_number_str), non_octal)) +// } + +// /// Ensure that ident cannot directly follow numbers. +// fn ensure_not_ident(&mut self) -> LexResult<()> { +// match self.input.cur() { +// Some(c) if c.is_ident_start() => { +// let span = pos_span(self.input.cur_pos()); +// self.error_span(span, SyntaxError::IdentAfterNum)? +// } +// _ => Ok(()), +// } +// } + +// /// Read an integer in the given radix. Return `None` if zero digits +// /// were read, the integer value otherwise. +// /// When `len` is not zero, this +// /// will return `None` unless the integer has exactly `len` digits. 
+// pub(super) fn read_int(&mut self, len: u8) -> +// LexResult> { let mut count = 0u16; +// let v = self.read_digits::<_, Option, RADIX>( +// |opt: Option, radix, val| { +// count += 1; +// let total = opt.unwrap_or_default() * radix as f64 + val as +// f64; + +// Ok((Some(total), count != len as u16)) +// }, +// true, +// )?; +// if len != 0 && count != len as u16 { +// Ok(None) +// } else { +// Ok(v) +// } +// } + +// pub(super) fn read_int_u32(&mut self, len: u8) -> +// LexResult> { let start = self.state.start; + +// let mut count = 0; +// let v = self.read_digits::<_, Option, RADIX>( +// |opt: Option, radix, val| { +// count += 1; + +// let total = opt +// .unwrap_or_default() +// .checked_mul(radix as u32) +// .and_then(|v| v.checked_add(val)) +// .ok_or_else(|| { +// let span = Span::new(start, start); +// Error::new(span, SyntaxError::InvalidUnicodeEscape) +// })?; + +// Ok((Some(total), count != len)) +// }, +// true, +// )?; +// if len != 0 && count != len { +// Ok(None) +// } else { +// Ok(v) +// } +// } + +// /// `op`- |total, radix, value| -> (total * radix + value, continue) +// fn read_digits( +// &mut self, +// mut op: F, +// allow_num_separator: bool, +// ) -> LexResult +// where +// F: FnMut(Ret, u8, u32) -> LexResult<(Ret, bool)>, +// Ret: Copy + Default, +// { +// debug_assert!( +// RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, +// "radix for read_int should be one of 2, 8, 10, 16, but got {}", +// RADIX +// ); + +// if cfg!(feature = "debug") { +// trace!( +// "read_digits(radix = {}), cur = {:?}", +// RADIX, +// self.input.cur() +// ); +// } + +// let start = self.input.cur_pos(); +// let mut total: Ret = Default::default(); +// let mut prev = None; + +// while let Some(c) = self.input.cur() { +// if allow_num_separator && c == '_' { +// let is_allowed = |c: Option| { +// if c.is_none() { +// return false; +// } + +// let c = c.unwrap(); + +// c.is_digit(RADIX as _) +// }; +// let is_forbidden = |c: Option| { +// if c.is_none() { 
+// return true; +// } + +// if RADIX == 16 { +// matches!(c.unwrap(), '.' | 'X' | '_' | 'x') +// } else { +// matches!(c.unwrap(), '.' | 'B' | 'E' | 'O' | '_' | +// 'b' | 'e' | 'o') } +// }; + +// let next = self.input.peek(); + +// if !is_allowed(next) || is_forbidden(prev) || +// is_forbidden(next) { self.emit_error( +// start, +// +// SyntaxError::NumericSeparatorIsAllowedOnlyBetweenTwoDigits, +// ); } + +// // Ignore this _ character +// unsafe { +// // Safety: cur() returns Some(c) where c is a valid char +// self.input.bump(1); +// } + +// continue; +// } + +// // e.g. (val for a) = 10 where radix = 16 +// let val = if let Some(val) = c.to_digit(RADIX as _) { +// val +// } else { +// return Ok(total); +// }; + +// self.bump(); + +// let (t, cont) = op(total, RADIX, val)?; + +// total = t; + +// if !cont { +// return Ok(total); +// } + +// prev = Some(c); +// } + +// Ok(total) +// } + +// fn make_legacy_octal(&mut self, start: BytePos, val: f64) -> +// LexResult { self.ensure_not_ident()?; + +// if self.syntax.typescript() && self.target >= EsVersion::Es5 { +// self.emit_error(start, SyntaxError::TS1085); +// } + +// self.emit_strict_mode_error(start, SyntaxError::LegacyOctal); + +// Ok(val) +// } +// } + +// #[cfg(test)] +// mod tests { +// use std::panic; + +// use super::*; + +// fn lex(s: &'static str, f: F) -> Ret +// where +// F: FnOnce(&mut Lexer<'_>) -> Ret, +// { +// crate::with_test_sess(s, |_, input| { +// let mut l = Lexer::new( +// Syntax::Es(Default::default()), +// Default::default(), +// input, +// None, +// ); +// let ret = f(&mut l); +// assert_eq!(l.input.cur()?, None); +// Ok(ret) +// }) +// .unwrap() +// } + +// fn num(s: &'static str) -> (f64, Atom) { +// lex(s, |l| { +// l.read_number(s.starts_with('.')).unwrap().left().unwrap() +// }) +// } + +// fn int(s: &'static str) -> u32 { +// lex(s, |l| { +// l.read_int_u32::(0) +// .unwrap() +// .expect("read_int returned None") +// }) +// } + +// const LONG: &str = 
"1e10000000000000000000000000000000000000000\ +// +// 0000000000000000000000000000000000000000000000000000"; #[test] +// fn num_inf() { +// assert_eq!(num(LONG), (f64::INFINITY, LONG.into())); +// } + +// /// Number >= 2^53 +// #[test] +// fn num_big_exp() { +// assert_eq!((1e30, "1e30".into()), num("1e30")); +// } + +// #[test] +// fn num_very_big_exp() { +// const LARGE_POSITIVE_EXP: &str = +// +// "1e100000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// 00000000000000000000000000000000000000000000000000000"; +// const LARGE_NEGATIVE_EXP: &str = +// +// "1e-10000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// 000000000000000000000000000000000000000000000000000000"; +// const ZERO_WITH_LARGE_POSITIVE_EXP: &str = +// +// "0e100000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// 00000000000000000000000000000000000000000000000000000"; +// const ZERO_WITH_LARGE_NEGATIVE_EXP: &str = +// +// "0e-10000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// 000000000000000000000000000000000000000000000000000000"; +// const 
LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP: &str = +// "10000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// 000000000000000000000000000000000000000000000000000000\ +// +// e-100000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// +// 00000000000000000000000000000000000000000000000000000000000000000\ +// 000000000000000000000000000000000000000000000000000000"; + +// assert_eq!( +// num(LARGE_POSITIVE_EXP), +// (f64::INFINITY, LARGE_POSITIVE_EXP.into()) +// ); +// assert_eq!(num(LARGE_NEGATIVE_EXP), (0.0, +// LARGE_NEGATIVE_EXP.into())); assert_eq!( +// num(ZERO_WITH_LARGE_POSITIVE_EXP), +// (0.0, ZERO_WITH_LARGE_POSITIVE_EXP.into()) +// ); +// assert_eq!( +// num(ZERO_WITH_LARGE_NEGATIVE_EXP), +// (0.0, ZERO_WITH_LARGE_NEGATIVE_EXP.into()) +// ); +// assert_eq!( +// num(LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP), +// (0.0, LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP.into()) +// ); +// } + +// #[test] +// fn num_big_many_zero() { +// assert_eq!( +// ( +// 1_000_000_000_000_000_000_000_000_000_000f64, +// "1000000000000000000000000000000".into() +// ), +// num("1000000000000000000000000000000") +// ); +// assert_eq!( +// (3.402_823_466_385_288_6e38, "34028234663852886e22".into()), +// num("34028234663852886e22"), +// ); +// } + +// #[test] +// fn big_number_with_fract() { +// assert_eq!( +// (77777777777777777.1f64, "77777777777777777.1".into()), +// num("77777777777777777.1") +// ) +// } + +// #[test] +// fn issue_480() { +// assert_eq!((9.09, "9.09".into()), num("9.09")) +// } + +// #[test] +// fn num_legacy_octal() { +// assert_eq!((0o12 as f64, "0012".into()), num("0012")); +// 
assert_eq!((10f64, "012".into()), num("012")); +// } + +// #[test] +// fn read_int_1() { +// assert_eq!(60, int::<10>("60")); +// assert_eq!(0o73, int::<8>("73")); +// } + +// #[test] +// fn read_int_short() { +// assert_eq!(7, int::<10>("7")); +// assert_eq!(10, int::<10>("10")); +// } + +// #[test] +// fn read_radix_number() { +// assert_eq!( +// (0o73 as f64, "0o73".into()), +// lex("0o73", |l| l +// .read_radix_number::<8>() +// .unwrap() +// .left() +// .unwrap()) +// ); +// } + +// #[test] +// fn read_num_sep() { +// assert_eq!(1_000, int::<10>("1_000")); +// assert_eq!(0xaebece, int::<16>("AE_BE_CE")); +// assert_eq!(0b1010000110000101, int::<2>("1010_0001_1000_0101")); +// assert_eq!(0o0666, int::<8>("0_6_6_6")); +// } + +// #[test] +// fn read_bigint() { +// assert_eq!( +// lex( +// "10000000000000000000000000000000000000000000000000000n", +// |l| l.read_number(false).unwrap().right().unwrap() +// ), +// ( +// Box::new( +// "10000000000000000000000000000000000000000000000000000" +// .parse::() +// .unwrap() +// ), +// +// Atom::from("10000000000000000000000000000000000000000000000000000n") +// ), +// ); +// } + +// #[test] +// fn large_bin_number() { +// const LONG: &str = +// +// "0B11111111111111111111111111111111111111111111111101001010100000010111110001111111111" +// ; const VERY_LARGE_BINARY_NUMBER: &str = +// +// "0B1111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 
111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// +// 111111111111111111111111111111111111111111111111111111111111111111\ +// 0010111110001111111111"; +// assert_eq!( +// lex(LONG, |l| l +// .read_radix_number::<2>() +// .unwrap() +// .left() +// .unwrap()), +// (9.671_406_556_917_009e24, LONG.into()) +// ); +// assert_eq!( +// lex(VERY_LARGE_BINARY_NUMBER, |l| l +// .read_radix_number::<2>() +// .unwrap() +// .left() +// .unwrap()), +// (1.0972248137587377e304, VERY_LARGE_BINARY_NUMBER.into()) +// ); +// } + +// #[test] +// fn large_float_number() { +// const LONG: &str = "9.671406556917009e+24"; + +// assert_eq!(num(LONG), (9.671_406_556_917_009e24, LONG.into())); +// } + +// /// Valid even on strict mode. 
+// const VALID_CASES: &[&str] = &[".0", "0.e-1", "0e8", ".8e1", "0.8e1", +// "1.18e1"]; const INVALID_CASES_ON_STRICT: &[&str] = &["08e1", "08.1", +// "08.8e1", "08", "01"]; const INVALID_CASES: &[&str] = &["01.8e1", +// "012e1", "00e1", "00.0"]; + +// fn test_floats(strict: bool, success: bool, cases: &'static [&'static +// str]) { for case in cases { +// println!( +// "Testing {} (when strict = {}); Expects success = {}", +// case, strict, success +// ); +// // lazy way to get expected values +// let expected: f64 = (i64::from_str_radix(case, 8).map(|v| v as +// f64)) .or_else(|_| case.parse::().map(|v| v as f64)) +// .or_else(|_| case.parse::()) +// .unwrap_or_else(|err| { +// panic!( +// "failed to parse '{}' as float using str.parse(): +// {}", case, err +// ) +// }); + +// let vec = panic::catch_unwind(|| { +// crate::with_test_sess(case, |_, input| { +// let mut l = Lexer::new(Syntax::default(), +// Default::default(), input, None); l.ctx.strict = strict; +// Ok(l.map(|ts| ts.token).collect::>()) +// }) +// .unwrap() +// }); + +// if success { +// let vec = match vec { +// Ok(vec) => vec, +// Err(err) => panic::resume_unwind(err), +// }; + +// assert_eq!(vec.len(), 1); + +// let token = vec.into_iter().next().unwrap(); +// let value = match token { +// Token::Num { value, .. 
} => value, +// _ => { +// panic!("expected num token in test") +// } +// }; + +// assert_eq!(expected, value); +// } else if let Ok(vec) = vec { +// assert_ne!( +// vec![Token::Num { +// value: expected, +// raw: expected.to_string().into() +// }], +// vec +// ) +// } +// } +// } + +// // #[test] +// // fn strict_mode() { +// // test_floats(true, true, VALID_CASES); +// // test_floats(true, false, INVALID_CASES_ON_STRICT); +// // test_floats(true, false, INVALID_CASES); +// // } + +// #[test] +// fn non_strict() { +// test_floats(false, true, VALID_CASES); +// test_floats(false, true, INVALID_CASES_ON_STRICT); +// test_floats(false, false, INVALID_CASES); +// } +// } diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 77d8edc324fe..6c8ba9a9dae1 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -246,100 +246,101 @@ impl Lexer<'_> { } fn next_token(&mut self, start: &mut BytePos) -> Result, Error> { - if let Some(start) = self.state.next_regexp { - return Ok(Some(self.read_regexp(start)?)); - } - - if self.state.is_first { - if let Some(shebang) = self.read_shebang()? { - return Ok(Some(Token::Shebang(shebang))); - } - } - - self.state.had_line_break = self.state.is_first; - self.state.is_first = false; - - // skip spaces before getting next character, if we are allowed to. - if self.state.can_skip_space() { - self.skip_space::(); - *start = self.input.cur_pos(); - }; - - match self.input.cur()? { - Some(..) => {} - // End of input. 
- None => { - self.consume_pending_comments(); - - return Ok(None); - } - }; - - // println!( - // "\tContext: ({:?}) {:?}", - // self.input.cur().unwrap(), - // self.state.context.0 - // ); - - self.state.start = *start; - - if self.syntax.jsx() && !self.ctx.in_property_name && !self.ctx.in_type { - //jsx - if self.state.context.current() == Some(TokenContext::JSXExpr) { - return self.read_jsx_token(); - } - - let c = self.input.cur()?; - if let Some(c) = c { - if self.state.context.current() == Some(TokenContext::JSXOpeningTag) - || self.state.context.current() == Some(TokenContext::JSXClosingTag) - { - if c.is_ident_start() { - return self.read_jsx_word().map(Some); - } - - if c == '>' { - unsafe { - // Safety: cur() is Some('>') - self.input.bump(1); - } - return Ok(Some(Token::JSXTagEnd)); - } - - if (c == '\'' || c == '"') - && self.state.context.current() == Some(TokenContext::JSXOpeningTag) - { - return self.read_jsx_str(c).map(Some); - } - } - - if c == '<' && self.state.is_expr_allowed && self.input.peek() != Some('!') { - let had_line_break_before_last = self.had_line_break_before_last(); - let cur_pos = self.input.cur_pos(); - - unsafe { - // Safety: cur() is Some('<') - self.input.bump(1); - } - - if had_line_break_before_last && self.is_str("<<<<<< ") { - let span = Span::new(cur_pos, cur_pos + BytePos(7)); - - self.emit_error_span(span, SyntaxError::TS1185); - self.skip_line_comment(6); - self.skip_space::(); - return self.read_token(); - } - - return Ok(Some(Token::JSXTagStart)); - } - } - } - - if let Some(TokenContext::Tpl {}) = self.state.context.current() { - let start = self.state.tpl_start; - return self.read_tmpl_token(start).map(Some); - } + // if let Some(start) = self.state.next_regexp { + // return Ok(Some(self.read_regexp(start)?)); + // } + + // if self.state.is_first { + // if let Some(shebang) = self.read_shebang()? 
{ + // return Ok(Some(Token::Shebang(shebang))); + // } + // } + + // self.state.had_line_break = self.state.is_first; + // self.state.is_first = false; + + // // skip spaces before getting next character, if we are allowed to. + // if self.state.can_skip_space() { + // self.skip_space::(); + // *start = self.input.cur_pos(); + // }; + + // match self.input.cur()? { + // Some(..) => {} + // // End of input. + // None => { + // self.consume_pending_comments(); + + // return Ok(None); + // } + // }; + + // // println!( + // // "\tContext: ({:?}) {:?}", + // // self.input.cur().unwrap(), + // // self.state.context.0 + // // ); + + // self.state.start = *start; + + // if self.syntax.jsx() && !self.ctx.in_property_name && !self.ctx.in_type { + // //jsx + // if self.state.context.current() == Some(TokenContext::JSXExpr) { + // return self.read_jsx_token(); + // } + + // let c = self.input.cur()?; + // if let Some(c) = c { + // if self.state.context.current() == Some(TokenContext::JSXOpeningTag) + // || self.state.context.current() == Some(TokenContext::JSXClosingTag) + // { + // if c.is_ident_start() { + // return self.read_jsx_word().map(Some); + // } + + // if c == '>' { + // unsafe { + // // Safety: cur() is Some('>') + // self.input.bump(1); + // } + // return Ok(Some(Token::JSXTagEnd)); + // } + + // if (c == '\'' || c == '"') + // && self.state.context.current() == + // Some(TokenContext::JSXOpeningTag) { + // return self.read_jsx_str(c).map(Some); + // } + // } + + // if c == '<' && self.state.is_expr_allowed && self.input.peek() != + // Some('!') { let had_line_break_before_last = + // self.had_line_break_before_last(); let cur_pos = + // self.input.cur_pos(); + + // unsafe { + // // Safety: cur() is Some('<') + // self.input.bump(1); + // } + + // if had_line_break_before_last && self.is_str("<<<<<< ") { + // let span = Span::new(cur_pos, cur_pos + BytePos(7)); + + // self.emit_error_span(span, SyntaxError::TS1185); + // self.skip_line_comment(6); + // 
self.skip_space::(); + // return self.read_token(); + // } + + // return Ok(Some(Token::JSXTagStart)); + // } + // } + // } + + // if let Some(TokenContext::Tpl {}) = self.state.context.current() { + // let start = self.state.tpl_start; + // return self.read_tmpl_token(start).map(Some); + // } self.read_token() } @@ -371,7 +372,7 @@ impl Iterator for Lexer<'_> { } self.state.update(start, token.kind()); - self.state.prev_hi = self.last_pos(); + self.state.prev_hi = self.input.cur_pos(); self.state.had_line_break_before_last = self.had_line_break_before_last(); } diff --git a/crates/swc_ecma_parser/src/lexer/util.rs b/crates/swc_ecma_parser/src/lexer/util.rs index 1d835491154c..f263592aa83c 100644 --- a/crates/swc_ecma_parser/src/lexer/util.rs +++ b/crates/swc_ecma_parser/src/lexer/util.rs @@ -21,7 +21,7 @@ use crate::{ impl Lexer<'_> { pub(super) fn span(&self, start: BytePos) -> Span { - let end = self.last_pos(); + let end = self.input.cur_pos(); if cfg!(debug_assertions) && start > end { unreachable!( "assertion failed: (span.start <= span.end). @@ -110,183 +110,185 @@ impl Lexer<'_> { self.add_module_mode_error(err); } - /// Skip comments or whitespaces. - /// - /// See https://tc39.github.io/ecma262/#sec-white-space - #[inline(never)] - pub(super) fn skip_space(&mut self) -> Result<()> { - loop { - let (offset, newline) = { - let mut skip = SkipWhitespace { - input: self.input.as_str(), - newline: false, - offset: 0, - }; - - skip.scan(); - - (skip.offset, skip.newline) - }; - - self.input.bump(offset as usize); - if newline { - self.state.had_line_break = true; - } - - if LEX_COMMENTS && self.input.is_byte(b'/') { - if self.input.peek()? == Some('/') { - self.skip_line_comment(2); - continue; - } else if self.input.peek()? 
== Some('*') { - self.skip_block_comment(); - continue; - } - } - - break; - } - - Ok(()) - } - - #[inline(never)] - pub(super) fn skip_line_comment(&mut self, start_skip: usize) { - let start = self.input.cur_pos(); - self.input.bump_bytes(start_skip); - let slice_start = self.input.cur_pos(); - - // foo // comment for foo - // bar - // - // foo - // // comment for bar - // bar - // - let is_for_next = self.state.had_line_break || !self.state.can_have_trailing_line_comment(); - - let idx = self - .input - .as_str() - .find(['\r', '\n', '\u{2028}', '\u{2029}']) - .map_or(self.input.as_str().len(), |v| { - self.state.had_line_break = true; - v - }); - - self.input.bump_bytes(idx); - let end = self.input.cur_pos(); - - if let Some(comments) = self.comments_buffer.as_mut() { - let s = unsafe { - // Safety: We know that the start and the end are valid - self.input.slice(slice_start, end) - }; - let cmt = Comment { - kind: CommentKind::Line, - span: Span::new(start, end), - text: self.atoms.atom(s), - }; - - if is_for_next { - comments.push_pending_leading(cmt); - } else { - comments.push(BufferedComment { - kind: BufferedCommentKind::Trailing, - pos: self.state.prev_hi, - comment: cmt, - }); - } - } - - unsafe { - // Safety: We got end from self.input - self.input.reset_to(end); - } - } - - /// Expects current char to be '/' and next char to be '*'. - #[inline(never)] - pub(super) fn skip_block_comment(&mut self) { - let start = self.input.cur_pos(); - - debug_assert_eq!(self.input.cur(), Some('/')); - debug_assert_eq!(self.input.peek(), Some('*')); - - self.input.bump_bytes(2); - - // jsdoc - let slice_start = self.input.cur_pos(); - let mut was_star = if self.input.is_byte(b'*') { - self.bump(); - true - } else { - false - }; - - let mut is_for_next = self.state.had_line_break || !self.state.can_have_trailing_comment(); - - while let Some(c) = self.input.cur()? 
{ - if was_star && c == '/' { - debug_assert_eq!(self.input.cur(), Some('/')); - self.bump(); // '/' - - let end = self.input.cur_pos(); - - self.skip_space::(); - - if self.input.eat(RawToken::Semi)? { - is_for_next = false; - } - - self.store_comment(is_for_next, start, end, slice_start); - - return; - } - if c.is_line_terminator() { - self.state.had_line_break = true; - } - - was_star = c == '*'; - self.bump(); - } - - let end = self.input.end_pos(); - let span = Span::new(end, end); - self.emit_error_span(span, SyntaxError::UnterminatedBlockComment) - } - - #[inline(never)] - fn store_comment( - &mut self, - is_for_next: bool, - start: BytePos, - end: BytePos, - slice_start: BytePos, - ) { - if let Some(comments) = self.comments_buffer.as_mut() { - let src = unsafe { - // Safety: We got slice_start and end from self.input so those are valid. - self.input.slice(slice_start, end) - }; - let s = &src[..src.len() - 2]; - let cmt = Comment { - kind: CommentKind::Block, - span: Span::new(start, end), - text: self.atoms.atom(s), - }; - - let _ = self.input.peek(); - if is_for_next { - comments.push_pending_leading(cmt); - } else { - comments.push(BufferedComment { - kind: BufferedCommentKind::Trailing, - pos: self.state.prev_hi, - comment: cmt, - }); - } - } - } + // /// Skip comments or whitespaces. + // /// + // /// See https://tc39.github.io/ecma262/#sec-white-space + // #[inline(never)] + // pub(super) fn skip_space(&mut self) -> Result<()> { + // loop { + // let (offset, newline) = { + // let mut skip = SkipWhitespace { + // input: self.input.as_str(), + // newline: false, + // offset: 0, + // }; + + // skip.scan(); + + // (skip.offset, skip.newline) + // }; + + // self.input.bump(offset as usize); + // if newline { + // self.state.had_line_break = true; + // } + + // if LEX_COMMENTS && self.input.is_byte(b'/') { + // if self.input.peek()? == Some('/') { + // self.skip_line_comment(2); + // continue; + // } else if self.input.peek()? 
== Some('*') { + // self.skip_block_comment(); + // continue; + // } + // } + + // break; + // } + + // Ok(()) + // } + + // #[inline(never)] + // pub(super) fn skip_line_comment(&mut self, start_skip: usize) { + // let start = self.input.cur_pos(); + // self.input.bump_bytes(start_skip); + // let slice_start = self.input.cur_pos(); + + // // foo // comment for foo + // // bar + // // + // // foo + // // // comment for bar + // // bar + // // + // let is_for_next = self.state.had_line_break || + // !self.state.can_have_trailing_line_comment(); + + // let idx = self + // .input + // .as_str() + // .find(['\r', '\n', '\u{2028}', '\u{2029}']) + // .map_or(self.input.as_str().len(), |v| { + // self.state.had_line_break = true; + // v + // }); + + // self.input.bump_bytes(idx); + // let end = self.input.cur_pos(); + + // if let Some(comments) = self.comments_buffer.as_mut() { + // let s = unsafe { + // // Safety: We know that the start and the end are valid + // self.input.slice(slice_start, end) + // }; + // let cmt = Comment { + // kind: CommentKind::Line, + // span: Span::new(start, end), + // text: self.atoms.atom(s), + // }; + + // if is_for_next { + // comments.push_pending_leading(cmt); + // } else { + // comments.push(BufferedComment { + // kind: BufferedCommentKind::Trailing, + // pos: self.state.prev_hi, + // comment: cmt, + // }); + // } + // } + + // unsafe { + // // Safety: We got end from self.input + // self.input.reset_to(end); + // } + // } + + // /// Expects current char to be '/' and next char to be '*'. 
+ // #[inline(never)] + // pub(super) fn skip_block_comment(&mut self) { + // let start = self.input.cur_pos(); + + // debug_assert_eq!(self.input.cur(), Some('/')); + // debug_assert_eq!(self.input.peek(), Some('*')); + + // self.input.bump_bytes(2); + + // // jsdoc + // let slice_start = self.input.cur_pos(); + // let mut was_star = if self.input.is_byte(b'*') { + // self.bump(); + // true + // } else { + // false + // }; + + // let mut is_for_next = self.state.had_line_break || + // !self.state.can_have_trailing_comment(); + + // while let Some(c) = self.input.cur()? { + // if was_star && c == '/' { + // debug_assert_eq!(self.input.cur(), Some('/')); + // self.bump(); // '/' + + // let end = self.input.cur_pos(); + + // self.skip_space::(); + + // if self.input.eat(RawToken::Semi)? { + // is_for_next = false; + // } + + // self.store_comment(is_for_next, start, end, slice_start); + + // return; + // } + // if c.is_line_terminator() { + // self.state.had_line_break = true; + // } + + // was_star = c == '*'; + // self.bump(); + // } + + // let end = self.input.end_pos(); + // let span = Span::new(end, end); + // self.emit_error_span(span, SyntaxError::UnterminatedBlockComment) + // } + + // #[inline(never)] + // fn store_comment( + // &mut self, + // is_for_next: bool, + // start: BytePos, + // end: BytePos, + // slice_start: BytePos, + // ) { + // if let Some(comments) = self.comments_buffer.as_mut() { + // let src = unsafe { + // // Safety: We got slice_start and end from self.input so those + // are valid. 
self.input.slice(slice_start, end) + // }; + // let s = &src[..src.len() - 2]; + // let cmt = Comment { + // kind: CommentKind::Block, + // span: Span::new(start, end), + // text: self.atoms.atom(s), + // }; + + // let _ = self.input.peek(); + // if is_for_next { + // comments.push_pending_leading(cmt); + // } else { + // comments.push(BufferedComment { + // kind: BufferedCommentKind::Trailing, + // pos: self.state.prev_hi, + // comment: cmt, + // }); + // } + // } + // } } /// Implemented for `char`. diff --git a/crates/swc_ecma_parser/src/lib.rs b/crates/swc_ecma_parser/src/lib.rs index e2ed67dbff1f..0effb8d826ba 100644 --- a/crates/swc_ecma_parser/src/lib.rs +++ b/crates/swc_ecma_parser/src/lib.rs @@ -115,8 +115,8 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr(test, feature(test))] -#![deny(clippy::all)] -#![deny(unused)] +// #![deny(clippy::all)] +// #![deny(unused)] #![allow(clippy::nonminimal_bool)] #![allow(clippy::too_many_arguments)] #![allow(clippy::unnecessary_unwrap)] From 80bd19c023cbd1d4b5f0678326988be335ee86df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 10:16:07 +0900 Subject: [PATCH 060/201] comment regex --- crates/swc_ecma_raw_lexer/src/lib.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 5127a8016414..36f10c319d4e 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -105,6 +105,7 @@ impl Iterator for RawBuffer<'_> { fn next(&mut self) -> Option { let item = self.lexer.next()?; + dbg!(&item); let item = match item { Ok(item) => item, Err(e) => return Some(Err(e)), @@ -331,10 +332,10 @@ pub enum RawToken { #[regex(r"[ \t]+")] Whitespace, - #[regex(r"//.*")] + #[regex(r"//[^\n]*")] LineComment, - #[regex(r"/\*.*\*/")] + #[regex(r"/\*(?:[^*]|\*[^/])*\*/")] BlockComment, #[token(" - self.emit_module_mode_error(start, 
SyntaxError::LegacyCommentInModule); - // self.skip_line_comment(0); - // self.skip_space::(); - return self.read_token(); - } - RawToken::LConflictMarker | RawToken::RConflictMarker if self.had_line_break_before_last() => { @@ -333,1072 +325,958 @@ impl<'a> Lexer<'a> { | RawToken::RConflictMarker => { // self.skip_line_comment(0); // self.skip_space::(); + self.skip_line_comment(5); + self.skip_space::(); return self.read_token(); } - RawToken::Await => Token::Word(Word::Keyword(Keyword::Await)), - RawToken::Break => Token::Word(Word::Keyword(Keyword::Break)), + _ => {} + } + } - RawToken::Case => Token::Word(Word::Keyword(Keyword::Case)), + /// Read a token given `.`. + /// + /// This is extracted as a method to reduce size of `read_token`. + #[inline(never)] + fn read_token_dot(&mut self) -> LexResult { + // Check for eof + let next = match self.input.peek()? { + Some(next) => next, + None => { + unsafe { + // Safety: cur() is Some(',') + self.input.bump(1); + } + return Ok(tok!('.')); + } + }; + if next.is_ascii_digit() { + return self.read_number(true).map(|v| match v { + Left((value, raw)) => Token::Num { value, raw }, + Right((value, raw)) => Token::BigInt { value, raw }, + }); + } - RawToken::Catch => Token::Word(Word::Keyword(Keyword::Catch)), + unsafe { + // Safety: cur() is Some + // 1st `.` + self.input.bump(1); + } - RawToken::Continue => Token::Word(Word::Keyword(Keyword::Continue)), + if next == '.' && self.input.peek() == Some('.') { + unsafe { + // Safety: peek() was Some - RawToken::Debugger => Token::Word(Word::Keyword(Keyword::Debugger)), + self.input.bump(2); // `..` + } - RawToken::Default_ => Token::Word(Word::Keyword(Keyword::Default_)), + return Ok(tok!("...")); + } - RawToken::Do => Token::Word(Word::Keyword(Keyword::Do)), + Ok(tok!('.')) + } - RawToken::Else => Token::Word(Word::Keyword(Keyword::Else)), + /// Read a token given `0`. + /// + /// This is extracted as a method to reduce size of `read_token`. 
+ #[inline(never)] + fn read_token_zero(&mut self) -> LexResult { + let next = self.input.peek()?; + + let bigint = match next { + Some('x') | Some('X') => self.read_radix_number::<16>(), + Some('o') | Some('O') => self.read_radix_number::<8>(), + Some('b') | Some('B') => self.read_radix_number::<2>(), + _ => { + return self.read_number(false).map(|v| match v { + Left((value, raw)) => Token::Num { value, raw }, + Right((value, raw)) => Token::BigInt { value, raw }, + }); + } + }; - RawToken::Finally => Token::Word(Word::Keyword(Keyword::Finally)), + bigint.map(|v| match v { + Left((value, raw)) => Token::Num { value, raw }, + Right((value, raw)) => Token::BigInt { value, raw }, + }) + } - RawToken::For => Token::Word(Word::Keyword(Keyword::For)), + /// Read a token given `|` or `&`. + /// + /// This is extracted as a method to reduce size of `read_token`. + #[inline(never)] + fn read_token_logical(&mut self, c: u8) -> LexResult { + let had_line_break_before_last = self.had_line_break_before_last(); + let start = self.input.cur_pos(); - RawToken::Function => Token::Word(Word::Keyword(Keyword::Function)), + unsafe { + // Safety: cur() is Some(c as char) + self.input.bump(1); + } + let token = if c == b'&' { + BinOpToken::BitAnd + } else { + BinOpToken::BitOr + }; - RawToken::If => Token::Word(Word::Keyword(Keyword::If)), + // '|=', '&=' + if self.input.eat_byte(b'=') { + return Ok(Token::AssignOp(match token { + BinOpToken::BitAnd => AssignOp::BitAndAssign, + BinOpToken::BitOr => AssignOp::BitOrAssign, + _ => unreachable!(), + })); + } - RawToken::Return => Token::Word(Word::Keyword(Keyword::Return)), + // '||', '&&' + if self.input.cur() == Some(c as char) { + unsafe { + // Safety: cur() is Some(c) + self.input.bump(1); + } - RawToken::Switch => Token::Word(Word::Keyword(Keyword::Switch)), + if self.input.cur() == Some('=') { + unsafe { + // Safety: cur() is Some('=') + self.input.bump(1); + } + return Ok(Token::AssignOp(match token { + BinOpToken::BitAnd => 
op!("&&="), + BinOpToken::BitOr => op!("||="), + _ => unreachable!(), + })); + } - RawToken::Throw => Token::Word(Word::Keyword(Keyword::Throw)), + // ||||||| + // ^ + if had_line_break_before_last && token == BinOpToken::BitOr && self.is_str("||||| ") { + let span = fixed_len_span(start, 7); + self.emit_error_span(span, SyntaxError::TS1185); + self.skip_line_comment(5); + self.skip_space::(); + return self.error_span(span, SyntaxError::TS1185); + } - RawToken::Try => Token::Word(Word::Keyword(Keyword::Try)), + return Ok(Token::BinOp(match token { + BinOpToken::BitAnd => BinOpToken::LogicalAnd, + BinOpToken::BitOr => BinOpToken::LogicalOr, + _ => unreachable!(), + })); + } - RawToken::Var => Token::Word(Word::Keyword(Keyword::Var)), + Ok(Token::BinOp(token)) + } - RawToken::Let => Token::Word(Word::Keyword(Keyword::Let)), + /// Read an escaped character for string literal. + /// + /// In template literal, we should preserve raw string. + fn read_escaped_char(&mut self, in_template: bool) -> LexResult>> { + debug_assert_eq!(self.input.cur(), Some('\\')); - RawToken::Const => Token::Word(Word::Keyword(Keyword::Const)), + let start = self.input.cur_pos(); - RawToken::While => Token::Word(Word::Keyword(Keyword::While)), + self.bump(); // '\' - RawToken::With => Token::Word(Word::Keyword(Keyword::With)), + let c = match self.input.cur()? { + Some(c) => c, + None => self.error_span(pos_span(start), SyntaxError::InvalidStrEscape)?, + }; - RawToken::New => Token::Word(Word::Keyword(Keyword::New)), + macro_rules! 
push_c_and_ret { + ($c:expr) => {{ + $c + }}; + } - RawToken::This => Token::Word(Word::Keyword(Keyword::This)), + let c = match c { + '\\' => push_c_and_ret!('\\'), + 'n' => push_c_and_ret!('\n'), + 'r' => push_c_and_ret!('\r'), + 't' => push_c_and_ret!('\t'), + 'b' => push_c_and_ret!('\u{0008}'), + 'v' => push_c_and_ret!('\u{000b}'), + 'f' => push_c_and_ret!('\u{000c}'), + '\r' => { + self.bump(); // remove '\r' - RawToken::Super => Token::Word(Word::Keyword(Keyword::Super)), + self.input.eat(RawToken::NewLine); - RawToken::Class => Token::Word(Word::Keyword(Keyword::Class)), + return Ok(None); + } + '\n' | '\u{2028}' | '\u{2029}' => { + self.bump(); - RawToken::Extends => Token::Word(Word::Keyword(Keyword::Extends)), + return Ok(None); + } - RawToken::Export => Token::Word(Word::Keyword(Keyword::Export)), + // read hexadecimal escape sequences + 'x' => { + self.bump(); // 'x' + + match self.read_int_u32::<16>(2)? { + Some(val) => return Ok(Some(vec![Char::from(val)])), + None => self.error( + start, + SyntaxError::BadCharacterEscapeSequence { + expected: "2 hex characters", + }, + )?, + } + } - RawToken::Import => Token::Word(Word::Keyword(Keyword::Import)), + // read unicode escape sequences + 'u' => match self.read_unicode_escape() { + Ok(chars) => return Ok(Some(chars)), + Err(err) => self.error(start, err.into_kind())?, + }, - RawToken::Yield => Token::Word(Word::Keyword(Keyword::Yield)), + // octal escape sequences + '0'..='7' => { + self.bump(); - RawToken::In => Token::Word(Word::Keyword(Keyword::In)), + let first_c = if c == '0' { + match self.input.cur()? { + Some(next) if next.is_digit(8) => c, + // \0 is not an octal literal nor decimal literal. + _ => return Ok(Some(vec!['\u{0000}'.into()])), + } + } else { + c + }; + + // TODO: Show template instead of strict mode + if in_template { + self.error(start, SyntaxError::LegacyOctal)? 
+ } - RawToken::InstanceOf => Token::Word(Word::Keyword(Keyword::InstanceOf)), + self.emit_strict_mode_error(start, SyntaxError::LegacyOctal); + + let mut value: u8 = first_c.to_digit(8).unwrap() as u8; + + macro_rules! one { + ($check:expr) => {{ + let cur = self.input.cur(); + + match cur.and_then(|c| c.to_digit(8)) { + Some(v) => { + value = if $check { + let new_val = value + .checked_mul(8) + .and_then(|value| value.checked_add(v as u8)); + match new_val { + Some(val) => val, + None => return Ok(Some(vec![Char::from(value as char)])), + } + } else { + value * 8 + v as u8 + }; + + self.bump(); + } + _ => return Ok(Some(vec![Char::from(value as u32)])), + } + }}; + } - RawToken::TypeOf => Token::Word(Word::Keyword(Keyword::TypeOf)), + one!(false); + one!(true); - RawToken::Void => Token::Word(Word::Keyword(Keyword::Void)), + return Ok(Some(vec![Char::from(value as char)])); + } + _ => c, + }; - RawToken::Delete => Token::Word(Word::Keyword(Keyword::Delete)), + unsafe { + // Safety: cur() is Some(c) if this method is called. + self.input.bump(1); + } + + Ok(Some(vec![c.into()])) + } + + fn read_token_plus_minus(&mut self, c: u8) -> LexResult> { + let start = self.input.cur_pos(); - RawToken::Abstract => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Abstract))), + unsafe { + // Safety: cur() is Some(c), if this method is called. 
+ self.input.bump(1); + } + + // '++', '--' + Ok(Some(if self.input.cur() == Some(c as char) { + unsafe { + // Safety: cur() is Some(c) + self.input.bump(1); + } + + // Handle --> + if self.state.had_line_break && c == b'-' && self.input.eat(b'>') { + self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); + self.skip_line_comment(0); + self.skip_space::(); + return self.read_token(); + } + + if c == b'+' { + Token::PlusPlus + } else { + Token::MinusMinus + } + } else if self.input.eat_byte(b'=') { + Token::AssignOp(if c == b'+' { + AssignOp::AddAssign + } else { + AssignOp::SubAssign + }) + } else { + Token::BinOp(if c == b'+' { + BinOpToken::Add + } else { + BinOpToken::Sub + }) + })) + } + + fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { + let start = self.input.cur_pos(); + let had_line_break_before_last = self.had_line_break_before_last(); + + unsafe { + // Safety: cur() is Some(c) if this method is called. + self.input.bump(1); + } + + Ok(Some(if self.input.eat(RawToken::AssignOp)? { + // "==" + + if self.input.eat(RawToken::AssignOp)? { + if c == b'!' { + Token::BinOp(BinOpToken::NotEqEq) + } else { + // ======= + // ^ + if had_line_break_before_last && self.is_str("====") { + self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185); + self.skip_line_comment(4); + self.skip_space::(); + return self.read_token(); + } + + Token::BinOp(BinOpToken::EqEqEq) + } + } else if c == b'!' { + Token::BinOp(BinOpToken::NotEq) + } else { + Token::BinOp(BinOpToken::EqEq) + } + } else if c == b'=' && self.input.eat_byte(b'>') { + // "=>" + + Token::Arrow + } else if c == b'!' { + Token::Bang + } else { + Token::AssignOp(AssignOp::Assign) + })) + } +} + +impl Lexer<'_> { + /// This can be used if there's no keyword starting with the first + /// character. 
+ fn read_ident_unknown(&mut self) -> LexResult { + debug_assert!(self.input.cur()?.is_some()); + + let (word, _) = self + .read_word_as_str_with(|l, s, _, _| Word::Ident(IdentLike::Other(l.atoms.atom(s))))?; + + Ok(Word(word)) + } - RawToken::As => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::As))), + /// This can be used if there's no keyword starting with the first + /// character. + fn read_word_with( + &mut self, + convert: &dyn Fn(&str) -> Option, + ) -> LexResult> { + debug_assert!(self.input.cur()?.is_some()); - RawToken::Async => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Async))), + let start = self.input.cur_pos(); + let (word, has_escape) = self.read_word_as_str_with(|l, s, _, can_be_known| { + if can_be_known { + if let Some(word) = convert(s) { + return word; + } + } - RawToken::From => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::From))), + Word::Ident(IdentLike::Other(l.atoms.atom(s))) + })?; + + // Note: ctx is store in lexer because of this error. + // 'await' and 'yield' may have semantic of reserved word, which means lexer + // should know context or parser should handle this error. Our approach to this + // problem is former one. + if has_escape && self.ctx.is_reserved(&word) { + self.error( + start, + SyntaxError::EscapeInReservedWord { word: word.into() }, + )? + } else { + Ok(Some(Token::Word(word))) + } + } - RawToken::Of => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Of))), + /// This method is optimized for texts without escape sequences. 
+ /// + /// `convert(text, has_escape, can_be_keyword)` + fn read_word_as_str_with(&mut self, convert: F) -> LexResult<(Ret, bool)> + where + F: for<'any> FnOnce(&'any mut Lexer<'_>, &str, bool, bool) -> Ret, + { + debug_assert!(self.input.cur()?.is_some()); + let mut first = true; + let mut can_be_keyword = true; + let mut slice_start = self.input.cur_pos(); + let mut has_escape = false; + + self.with_buf(|l, buf| { + loop { + if let Some(c) = l.input.cur_as_ascii() { + // Performance optimization + if can_be_keyword && (c.is_ascii_uppercase() || c.is_ascii_digit()) { + can_be_keyword = false; + } + + if Ident::is_valid_continue(c as _) { + l.bump(); + continue; + } else if first && Ident::is_valid_start(c as _) { + l.bump(); + first = false; + continue; + } + + // unicode escape + if c == b'\\' { + first = false; + has_escape = true; + let start = l.input.cur_pos(); + l.bump(); + + if !l.is(b'u') { + l.error_span(pos_span(start), SyntaxError::ExpectedUnicodeEscape)? + } + + { + let end = l.input.cur_pos(); + let s = unsafe { + // Safety: start and end are valid position because we got them from + // `self.input` + l.input.slice(slice_start, start) + }; + buf.push_str(s); + unsafe { + // Safety: We got end from `self.input` + l.input.reset_to(end); + } + } + + let chars = l.read_unicode_escape()?; + + if let Some(c) = chars.first() { + let valid = if first { + c.is_ident_start() + } else { + c.is_ident_part() + }; + + if !valid { + l.emit_error(start, SyntaxError::InvalidIdentChar); + } + } + + for c in chars { + buf.extend(c); + } + + slice_start = l.input.cur_pos(); + continue; + } + + // ASCII but not a valid identifier + + break; + } - RawToken::Type => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Type))), + if let Some(c) = l.input.cur() { + if Ident::is_valid_continue(c) { + l.bump(); + continue; + } else if first && Ident::is_valid_start(c) { + l.bump(); + first = false; + continue; + } + } - RawToken::Global => 
Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Global))), + break; + } - RawToken::Static => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Static))), + let end = l.input.cur_pos(); - RawToken::Using => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Using))), + let value = if !has_escape { + // Fast path: raw slice is enough if there's no escape. - RawToken::Readonly => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Readonly))), + let s = unsafe { + // Safety: slice_start and end are valid position because we got them from + // `self.input` + l.input.slice(slice_start, end) + }; + let s = unsafe { + // Safety: We don't use 'static. We just bypass the lifetime check. + transmute::<&str, &'static str>(s) + }; - RawToken::Unique => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Unique))), + convert(l, s, has_escape, can_be_keyword) + } else { + let s = unsafe { + // Safety: slice_start and end are valid position because we got them from + // `self.input` + l.input.slice(slice_start, end) + }; + buf.push_str(s); - RawToken::Keyof => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Keyof))), + convert(l, buf, has_escape, can_be_keyword) + }; - RawToken::Declare => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Declare))), + Ok((value, has_escape)) + }) + } - RawToken::Enum => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Enum))), + fn read_unicode_escape(&mut self) -> LexResult> { + debug_assert_eq!(self.input.cur(), Some('u')); - RawToken::Is => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Is))), + let mut chars = Vec::new(); + let mut is_curly = false; - RawToken::Infer => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Infer))), + self.bump(); // 'u' - RawToken::Symbol => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Symbol))), + if self.input.eat(b'{') { + is_curly = true; + } - RawToken::Undefined => { - Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Undefined))) + let state = 
self.input.cur_pos(); + let c = match self.read_int_u32::<16>(if is_curly { 0 } else { 4 }) { + Ok(Some(val)) => { + if 0x0010_ffff >= val { + char::from_u32(val) + } else { + let start = self.input.cur_pos(); + + self.error( + start, + SyntaxError::BadCharacterEscapeSequence { + expected: if is_curly { + "1-6 hex characters in the range 0 to 10FFFF." + } else { + "4 hex characters" + }, + }, + )? + } } + _ => { + let start = self.input.cur_pos(); - RawToken::Interface => { - Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Interface))) + self.error( + start, + SyntaxError::BadCharacterEscapeSequence { + expected: if is_curly { + "1-6 hex characters" + } else { + "4 hex characters" + }, + }, + )? } + }; - RawToken::Implements => { - Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Implements))) + match c { + Some(c) => { + chars.push(c.into()); } + _ => { + unsafe { + // Safety: state is valid position because we got it from cur_pos() + self.input.reset_to(state); + } - RawToken::Asserts => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Asserts))), + chars.push(Char::from('\\')); + chars.push(Char::from('u')); - RawToken::Require => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Require))), + if is_curly { + chars.push(Char::from('{')); - RawToken::Asserts => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Asserts))), + for _ in 0..6 { + if let Some(c) = self.input.cur() { + if c == '}' { + break; + } - RawToken::Get => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Get))), + self.bump(); - RawToken::Set => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Set))), + chars.push(Char::from(c)); + } else { + break; + } + } - RawToken::Any => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Any))), + chars.push(Char::from('}')); + } else { + for _ in 0..4 { + if let Some(c) = self.input.cur()? 
{ + self.bump(); - RawToken::Intrinsic => { - Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Intrinsic))) + chars.push(Char::from(c)); + } + } + } } + } - RawToken::Unknown => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Unknown))), + if is_curly && !self.input.eat(b'}') { + self.error(state, SyntaxError::InvalidUnicodeEscape)? + } - RawToken::String => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::String))), + Ok(chars) + } - RawToken::Object => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Object))), + /// See https://tc39.github.io/ecma262/#sec-literals-string-literals + fn read_str_lit(&mut self) -> LexResult { + debug_assert!(self.input.cur()? == Some('\'') || self.input.cur()? == Some('"')); + let start = self.input.cur_pos(); + let quote = self.input.cur()?.unwrap() as u8; + + self.bump(); // '"' + + let mut has_escape = false; + let mut slice_start = self.input.cur_pos(); + + self.with_buf(|l, buf| { + loop { + if let Some(c) = l.input.cur_as_ascii() { + if c == quote { + let value_end = l.input.cur_pos(); + + let value = if !has_escape { + let s = unsafe { + // Safety: slice_start and value_end are valid position because we + // got them from `self.input` + l.input.slice(slice_start, value_end) + }; + + l.atoms.atom(s) + } else { + let s = unsafe { + // Safety: slice_start and value_end are valid position because we + // got them from `self.input` + l.input.slice(slice_start, value_end) + }; + buf.push_str(s); + + l.atoms.atom(&**buf) + }; + + unsafe { + // Safety: cur is quote + l.input.bump(1); + } + + let end = l.input.cur_pos(); + + let raw = unsafe { + // Safety: start and end are valid position because we got them from + // `self.input` + l.input.slice(start, end) + }; + let raw = l.atoms.atom(raw); + + return Ok(Token::Str { value, raw }); + } + + if c == b'\\' { + has_escape = true; + + { + let end = l.input.cur_pos(); + let s = unsafe { + // Safety: start and end are valid position because we got them from + // 
`self.input` + l.input.slice(slice_start, end) + }; + buf.push_str(s); + } + + if let Some(chars) = l.read_escaped_char(false)? { + for c in chars { + buf.extend(c); + } + } + + slice_start = l.input.cur_pos(); + continue; + } + + if (c as char).is_line_break() { + break; + } + + unsafe { + // Safety: cur is a ascii character + l.input.bump(1); + } + continue; + } - RawToken::Number => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Number))), + match l.input.cur()? { + Some(c) => { + if c.is_line_break() { + break; + } + unsafe { + // Safety: cur is Some(c) + l.input.bump(1); + } + } + None => break, + } + } - RawToken::Bigint => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Bigint))), + { + let end = l.input.cur_pos(); + let s = unsafe { + // Safety: start and end are valid position because we got them from + // `self.input` + l.input.slice(slice_start, end) + }; + buf.push_str(s); + } - RawToken::Boolean => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Boolean))), + l.emit_error(start, SyntaxError::UnterminatedStrLit); - RawToken::Never => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Never))), + let end = l.input.cur_pos(); - RawToken::Assert => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Assert))), + let raw = unsafe { + // Safety: start and end are valid position because we got them from + // `self.input` + l.input.slice(start, end) + }; + Ok(Token::Str { + value: l.atoms.atom(&*buf), + raw: l.atoms.atom(raw), + }) + }) + } - RawToken::Namespace => { - Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Namespace))) - } + /// Expects current char to be '/' + fn read_regexp(&mut self, start: BytePos) -> LexResult { + unsafe { + // Safety: start is valid position, and cur() is Some('/') + self.input.reset_to(start); + } + + debug_assert_eq!(self.input.cur(), Some('/')); - RawToken::Accessor => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Accessor))), + let start = self.input.cur_pos(); - RawToken::Meta => 
Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Meta))), + self.bump(); - RawToken::Target => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Target))), + let (mut escaped, mut in_class) = (false, false); - RawToken::Satisfies => { - Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Satisfies))) + let content = self.with_buf(|l, buf| { + while let Some(c) = l.input.cur() { + // This is ported from babel. + // Seems like regexp literal cannot contain linebreak. + if c.is_line_terminator() { + let span = l.span(start); + + return Err(Error::new(span, SyntaxError::UnterminatedRegExp)); + } + + if escaped { + escaped = false; + } else { + match c { + '[' => in_class = true, + ']' if in_class => in_class = false, + // Terminates content part of regex literal + '/' if !in_class => break, + _ => {} + } + + escaped = c == '\\'; + } + + l.bump(); + buf.push(c); } - RawToken::Package => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Package))), + Ok(l.atoms.atom(&**buf)) + })?; + + // input is terminated without following `/` + if !self.is(b'/') { + let span = self.span(start); - RawToken::Protected => { - Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Protected))) + return Err(Error::new(span, SyntaxError::UnterminatedRegExp)); + } + + self.bump(); // '/' + + // Spec says "It is a Syntax Error if IdentifierPart contains a Unicode escape + // sequence." TODO: check for escape + + // Need to use `read_word` because '\uXXXX' sequences are allowed + // here (don't ask). + // let flags_start = self.input.cur_pos(); + let flags = { + match self.input.cur() { + Some(c) if c.is_ident_start() => self + .read_word_as_str_with(|l, s, _, _| l.atoms.atom(s)) + .map(Some), + _ => Ok(None), } + }? 
+ .map(|(value, _)| value) + .unwrap_or_default(); - RawToken::Private => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Private))), + Ok(Token::Regex(content, flags)) + } - RawToken::Public => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Public))), - })) + #[cold] + fn read_shebang(&mut self) -> LexResult> { + if self.input.cur() != Some('#') || self.input.peek() != Some('!') { + return Ok(None); + } + unsafe { + // Safety: "#!" + self.input.bump(2); + } + let s = self.input.uncons_while(|c| !c.is_line_terminator()); + Ok(Some(self.atoms.atom(s))) } - // /// Read a token given `.`. - // /// - // /// This is extracted as a method to reduce size of `read_token`. - // #[inline(never)] - // fn read_token_dot(&mut self) -> LexResult { - // // Check for eof - // let next = match self.input.peek()? { - // Some(next) => next, - // None => { - // unsafe { - // // Safety: cur() is Some(',') - // self.input.bump(1); - // } - // return Ok(tok!('.')); - // } - // }; - // if next.is_ascii_digit() { - // return self.read_number(true).map(|v| match v { - // Left((value, raw)) => Token::Num { value, raw }, - // Right((value, raw)) => Token::BigInt { value, raw }, - // }); - // } - - // unsafe { - // // Safety: cur() is Some - // // 1st `.` - // self.input.bump(1); - // } - - // if next == '.' && self.input.peek() == Some('.') { - // unsafe { - // // Safety: peek() was Some - - // self.input.bump(2); // `..` - // } - - // return Ok(tok!("...")); - // } - - // Ok(tok!('.')) - // } - - // /// Read a token given `0`. - // /// - // /// This is extracted as a method to reduce size of `read_token`. 
- // #[inline(never)] - // fn read_token_zero(&mut self) -> LexResult { - // let next = self.input.peek()?; - - // let bigint = match next { - // Some('x') | Some('X') => self.read_radix_number::<16>(), - // Some('o') | Some('O') => self.read_radix_number::<8>(), - // Some('b') | Some('B') => self.read_radix_number::<2>(), - // _ => { - // return self.read_number(false).map(|v| match v { - // Left((value, raw)) => Token::Num { value, raw }, - // Right((value, raw)) => Token::BigInt { value, raw }, - // }); - // } - // }; - - // bigint.map(|v| match v { - // Left((value, raw)) => Token::Num { value, raw }, - // Right((value, raw)) => Token::BigInt { value, raw }, - // }) - // } - - // /// Read a token given `|` or `&`. - // /// - // /// This is extracted as a method to reduce size of `read_token`. - // #[inline(never)] - // fn read_token_logical(&mut self, c: u8) -> LexResult { - // let had_line_break_before_last = self.had_line_break_before_last(); - // let start = self.input.cur_pos(); - - // unsafe { - // // Safety: cur() is Some(c as char) - // self.input.bump(1); - // } - // let token = if c == b'&' { - // BinOpToken::BitAnd - // } else { - // BinOpToken::BitOr - // }; - - // // '|=', '&=' - // if self.input.eat_byte(b'=') { - // return Ok(Token::AssignOp(match token { - // BinOpToken::BitAnd => AssignOp::BitAndAssign, - // BinOpToken::BitOr => AssignOp::BitOrAssign, - // _ => unreachable!(), - // })); - // } - - // // '||', '&&' - // if self.input.cur() == Some(c as char) { - // unsafe { - // // Safety: cur() is Some(c) - // self.input.bump(1); - // } - - // if self.input.cur() == Some('=') { - // unsafe { - // // Safety: cur() is Some('=') - // self.input.bump(1); - // } - // return Ok(Token::AssignOp(match token { - // BinOpToken::BitAnd => op!("&&="), - // BinOpToken::BitOr => op!("||="), - // _ => unreachable!(), - // })); - // } - - // // ||||||| - // // ^ - // if had_line_break_before_last && token == BinOpToken::BitOr && - // self.is_str("||||| ") { let 
span = fixed_len_span(start, - // 7); self.emit_error_span(span, SyntaxError::TS1185); - // self.skip_line_comment(5); - // self.skip_space::(); - // return self.error_span(span, SyntaxError::TS1185); - // } - - // return Ok(Token::BinOp(match token { - // BinOpToken::BitAnd => BinOpToken::LogicalAnd, - // BinOpToken::BitOr => BinOpToken::LogicalOr, - // _ => unreachable!(), - // })); - // } - - // Ok(Token::BinOp(token)) - // } - - // /// Read an escaped character for string literal. - // /// - // /// In template literal, we should preserve raw string. - // fn read_escaped_char(&mut self, in_template: bool) -> - // LexResult>> { todo!(); - // // debug_assert_eq!(self.input.cur(), Some('\\')); - - // // let start = self.input.cur_pos(); - - // // self.bump(); // '\' - - // // let c = match self.input.cur()? { - // // Some(c) => c, - // // None => self.error_span(pos_span(start), - // // SyntaxError::InvalidStrEscape)?, }; - - // // macro_rules! push_c_and_ret { - // // ($c:expr) => {{ - // // $c - // // }}; - // // } - - // // let c = match c { - // // '\\' => push_c_and_ret!('\\'), - // // 'n' => push_c_and_ret!('\n'), - // // 'r' => push_c_and_ret!('\r'), - // // 't' => push_c_and_ret!('\t'), - // // 'b' => push_c_and_ret!('\u{0008}'), - // // 'v' => push_c_and_ret!('\u{000b}'), - // // 'f' => push_c_and_ret!('\u{000c}'), - // // '\r' => { - // // self.bump(); // remove '\r' - - // // self.input.eat(RawToken::NewLine); - - // // return Ok(None); - // // } - // // '\n' | '\u{2028}' | '\u{2029}' => { - // // self.bump(); - - // // return Ok(None); - // // } - - // // // read hexadecimal escape sequences - // // 'x' => { - // // self.bump(); // 'x' - - // // match self.read_int_u32::<16>(2)? 
{ - // // Some(val) => return Ok(Some(vec![Char::from(val)])), - // // None => self.error( - // // start, - // // SyntaxError::BadCharacterEscapeSequence { - // // expected: "2 hex characters", - // // }, - // // )?, - // // } - // // } - - // // // read unicode escape sequences - // // 'u' => match self.read_unicode_escape() { - // // Ok(chars) => return Ok(Some(chars)), - // // Err(err) => self.error(start, err.into_kind())?, - // // }, - - // // // octal escape sequences - // // '0'..='7' => { - // // self.bump(); - - // // let first_c = if c == '0' { - // // match self.input.cur()? { - // // Some(next) if next.is_digit(8) => c, - // // // \0 is not an octal literal nor decimal literal. - // // _ => return Ok(Some(vec!['\u{0000}'.into()])), - // // } - // // } else { - // // c - // // }; - - // // // TODO: Show template instead of strict mode - // // if in_template { - // // self.error(start, SyntaxError::LegacyOctal)? - // // } - - // // self.emit_strict_mode_error(start, - // SyntaxError::LegacyOctal); - - // // let mut value: u8 = first_c.to_digit(8).unwrap() as u8; - - // // macro_rules! one { - // // ($check:expr) => {{ - // // let cur = self.input.cur(); - - // // match cur.and_then(|c| c.to_digit(8)) { - // // Some(v) => { - // // value = if $check { - // // let new_val = value - // // .checked_mul(8) - // // .and_then(|value| - // value.checked_add(v // as u8)); match - // new_val { // Some(val) => val, - // // None => return - // // Ok(Some(vec![Char::from(value as char)])), - // // } } else { - // // value * 8 + v as u8 - // // }; - - // // self.bump(); - // // } - // // _ => return Ok(Some(vec![Char::from(value as - // // u32)])), } - // // }}; - // // } - - // // one!(false); - // // one!(true); - - // // return Ok(Some(vec![Char::from(value as char)])); - // // } - // // _ => c, - // // }; - - // // unsafe { - // // // Safety: cur() is Some(c) if this method is called. 
- // // self.input.bump(1); - // // } - - // // Ok(Some(vec![c.into()])) - // } - - // fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { - // let start = self.input.cur_pos(); - // let had_line_break_before_last = self.had_line_break_before_last(); - - // unsafe { - // // Safety: cur() is Some(c) if this method is called. - // self.input.bump(1); - // } - - // Ok(Some(if self.input.eat(RawToken::AssignOp)? { - // // "==" - - // if self.input.eat(RawToken::AssignOp)? { - // if c == b'!' { - // Token::BinOp(BinOpToken::NotEqEq) - // } else { - // // ======= - // // ^ - // if had_line_break_before_last && self.is_str("====") { - // self.emit_error_span(fixed_len_span(start, 7), - // SyntaxError::TS1185); self.skip_line_comment(4); - // self.skip_space::(); - // return self.read_token(); - // } - - // Token::BinOp(BinOpToken::EqEqEq) - // } - // } else if c == b'!' { - // Token::BinOp(BinOpToken::NotEq) - // } else { - // Token::BinOp(BinOpToken::EqEq) - // } - // } else if c == b'=' && self.input.eat_byte(b'>') { - // // "=>" - - // Token::Arrow - // } else if c == b'!' { - // Token::Bang - // } else { - // Token::AssignOp(AssignOp::Assign) - // })) - // } - // } - - // impl Lexer<'_> { - // /// This can be used if there's no keyword starting with the first - // /// character. - // fn read_ident_unknown(&mut self) -> LexResult { - // debug_assert!(self.input.cur()?.is_some()); - - // let (word, _) = self - // .read_word_as_str_with(|l, s, _, _| - // Word::Ident(IdentLike::Other(l.atoms.atom(s))))?; - - // Ok(Word(word)) - // } - - // /// This can be used if there's no keyword starting with the first - // /// character. 
- // fn read_word_with( - // &mut self, - // convert: &dyn Fn(&str) -> Option, - // ) -> LexResult> { - // debug_assert!(self.input.cur()?.is_some()); - - // let start = self.input.cur_pos(); - // let (word, has_escape) = self.read_word_as_str_with(|l, s, _, - // can_be_known| { if can_be_known { - // if let Some(word) = convert(s) { - // return word; - // } - // } - - // Word::Ident(IdentLike::Other(l.atoms.atom(s))) - // })?; - - // // Note: ctx is store in lexer because of this error. - // // 'await' and 'yield' may have semantic of reserved word, which - // means lexer // should know context or parser should handle this - // error. Our approach to this // problem is former one. - // if has_escape && self.ctx.is_reserved(&word) { - // self.error( - // start, - // SyntaxError::EscapeInReservedWord { word: word.into() }, - // )? - // } else { - // Ok(Some(Token::Word(word))) - // } - // } - - // /// This method is optimized for texts without escape sequences. - // /// - // /// `convert(text, has_escape, can_be_keyword)` - // fn read_word_as_str_with(&mut self, convert: F) -> - // LexResult<(Ret, bool)> where - // F: for<'any> FnOnce(&'any mut Lexer<'_>, &str, bool, bool) -> Ret, - // { - // debug_assert!(self.input.cur()?.is_some()); - // let mut first = true; - // let mut can_be_keyword = true; - // let mut slice_start = self.input.cur_pos(); - // let mut has_escape = false; - - // self.with_buf(|l, buf| { - // loop { - // if let Some(c) = l.input.cur_as_ascii() { - // // Performance optimization - // if can_be_keyword && (c.is_ascii_uppercase() || - // c.is_ascii_digit()) { can_be_keyword = false; - // } - - // if Ident::is_valid_continue(c as _) { - // l.bump(); - // continue; - // } else if first && Ident::is_valid_start(c as _) { - // l.bump(); - // first = false; - // continue; - // } - - // // unicode escape - // if c == b'\\' { - // first = false; - // has_escape = true; - // let start = l.input.cur_pos(); - // l.bump(); - - // if !l.is(b'u') { - // 
l.error_span(pos_span(start), - // SyntaxError::ExpectedUnicodeEscape)? } - - // { - // let end = l.input.cur_pos(); - // let s = unsafe { - // // Safety: start and end are valid position - // because we got them from // `self.input` - // l.input.slice(slice_start, start) - // }; - // buf.push_str(s); - // unsafe { - // // Safety: We got end from `self.input` - // l.input.reset_to(end); - // } - // } - - // let chars = l.read_unicode_escape()?; - - // if let Some(c) = chars.first() { - // let valid = if first { - // c.is_ident_start() - // } else { - // c.is_ident_part() - // }; - - // if !valid { - // l.emit_error(start, - // SyntaxError::InvalidIdentChar); } - // } - - // for c in chars { - // buf.extend(c); - // } - - // slice_start = l.input.cur_pos(); - // continue; - // } - - // // ASCII but not a valid identifier - - // break; - // } - - // if let Some(c) = l.input.cur() { - // if Ident::is_valid_continue(c) { - // l.bump(); - // continue; - // } else if first && Ident::is_valid_start(c) { - // l.bump(); - // first = false; - // continue; - // } - // } - - // break; - // } - - // let end = l.input.cur_pos(); - - // let value = if !has_escape { - // // Fast path: raw slice is enough if there's no escape. - - // let s = unsafe { - // // Safety: slice_start and end are valid position because - // we got them from // `self.input` - // l.input.slice(slice_start, end) - // }; - // let s = unsafe { - // // Safety: We don't use 'static. We just bypass the - // lifetime check. 
transmute::<&str, &'static str>(s) - // }; - - // convert(l, s, has_escape, can_be_keyword) - // } else { - // let s = unsafe { - // // Safety: slice_start and end are valid position because - // we got them from // `self.input` - // l.input.slice(slice_start, end) - // }; - // buf.push_str(s); - - // convert(l, buf, has_escape, can_be_keyword) - // }; - - // Ok((value, has_escape)) - // }) - // } - - // fn read_unicode_escape(&mut self) -> LexResult> { - // debug_assert_eq!(self.input.cur(), Some('u')); - - // let mut chars = Vec::new(); - // let mut is_curly = false; - - // self.bump(); // 'u' - - // if self.input.eat(b'{') { - // is_curly = true; - // } - - // let state = self.input.cur_pos(); - // let c = match self.read_int_u32::<16>(if is_curly { 0 } else { 4 }) { - // Ok(Some(val)) => { - // if 0x0010_ffff >= val { - // char::from_u32(val) - // } else { - // let start = self.input.cur_pos(); - - // self.error( - // start, - // SyntaxError::BadCharacterEscapeSequence { - // expected: if is_curly { - // "1-6 hex characters in the range 0 to - // 10FFFF." } else { - // "4 hex characters" - // }, - // }, - // )? - // } - // } - // _ => { - // let start = self.input.cur_pos(); - - // self.error( - // start, - // SyntaxError::BadCharacterEscapeSequence { - // expected: if is_curly { - // "1-6 hex characters" - // } else { - // "4 hex characters" - // }, - // }, - // )? 
- // } - // }; - - // match c { - // Some(c) => { - // chars.push(c.into()); - // } - // _ => { - // unsafe { - // // Safety: state is valid position because we got it from - // cur_pos() self.input.reset_to(state); - // } - - // chars.push(Char::from('\\')); - // chars.push(Char::from('u')); - - // if is_curly { - // chars.push(Char::from('{')); - - // for _ in 0..6 { - // if let Some(c) = self.input.cur() { - // if c == '}' { - // break; - // } - - // self.bump(); - - // chars.push(Char::from(c)); - // } else { - // break; - // } - // } - - // chars.push(Char::from('}')); - // } else { - // for _ in 0..4 { - // if let Some(c) = self.input.cur()? { - // self.bump(); - - // chars.push(Char::from(c)); - // } - // } - // } - // } - // } - - // if is_curly && !self.input.eat(b'}') { - // self.error(state, SyntaxError::InvalidUnicodeEscape)? - // } - - // Ok(chars) - // } - - // /// See https://tc39.github.io/ecma262/#sec-literals-string-literals - // fn read_str_lit(&mut self) -> LexResult { - // debug_assert!(self.input.cur()? == Some('\'') || self.input.cur()? 
== - // Some('"')); let start = self.input.cur_pos(); - // let quote = self.input.cur()?.unwrap() as u8; - - // self.bump(); // '"' - - // let mut has_escape = false; - // let mut slice_start = self.input.cur_pos(); - - // self.with_buf(|l, buf| { - // loop { - // if let Some(c) = l.input.cur_as_ascii() { - // if c == quote { - // let value_end = l.input.cur_pos(); - - // let value = if !has_escape { - // let s = unsafe { - // // Safety: slice_start and value_end are - // valid position because we // got them - // from `self.input` - // l.input.slice(slice_start, value_end) }; - - // l.atoms.atom(s) - // } else { - // let s = unsafe { - // // Safety: slice_start and value_end are - // valid position because we // got them - // from `self.input` - // l.input.slice(slice_start, value_end) }; - // buf.push_str(s); - - // l.atoms.atom(&**buf) - // }; - - // unsafe { - // // Safety: cur is quote - // l.input.bump(1); - // } - - // let end = l.input.cur_pos(); - - // let raw = unsafe { - // // Safety: start and end are valid position - // because we got them from // `self.input` - // l.input.slice(start, end) - // }; - // let raw = l.atoms.atom(raw); - - // return Ok(Token::Str { value, raw }); - // } - - // if c == b'\\' { - // has_escape = true; - - // { - // let end = l.input.cur_pos(); - // let s = unsafe { - // // Safety: start and end are valid position - // because we got them from // `self.input` - // l.input.slice(slice_start, end) - // }; - // buf.push_str(s); - // } - - // if let Some(chars) = l.read_escaped_char(false)? { - // for c in chars { - // buf.extend(c); - // } - // } - - // slice_start = l.input.cur_pos(); - // continue; - // } - - // if (c as char).is_line_break() { - // break; - // } - - // unsafe { - // // Safety: cur is a ascii character - // l.input.bump(1); - // } - // continue; - // } - - // match l.input.cur()? 
{ - // Some(c) => { - // if c.is_line_break() { - // break; - // } - // unsafe { - // // Safety: cur is Some(c) - // l.input.bump(1); - // } - // } - // None => break, - // } - // } - - // { - // let end = l.input.cur_pos(); - // let s = unsafe { - // // Safety: start and end are valid position because we - // got them from // `self.input` - // l.input.slice(slice_start, end) - // }; - // buf.push_str(s); - // } - - // l.emit_error(start, SyntaxError::UnterminatedStrLit); - - // let end = l.input.cur_pos(); - - // let raw = unsafe { - // // Safety: start and end are valid position because we got - // them from // `self.input` - // l.input.slice(start, end) - // }; - // Ok(Token::Str { - // value: l.atoms.atom(&*buf), - // raw: l.atoms.atom(raw), - // }) - // }) - // } - - // /// Expects current char to be '/' - // fn read_regexp(&mut self, start: BytePos) -> LexResult { - // unsafe { - // // Safety: start is valid position, and cur() is Some('/') - // self.input.reset_to(start); - // } - - // debug_assert_eq!(self.input.cur()?, Some('/')); - - // let start = self.input.cur_pos(); - - // self.bump(); - - // let (mut escaped, mut in_class) = (false, false); - - // let content = self.with_buf(|l, buf| { - // while let Some(c) = l.input.cur() { - // // This is ported from babel. - // // Seems like regexp literal cannot contain linebreak. 
- // if c.is_line_terminator() { - // let span = l.span(start); - - // return Err(Error::new(span, - // SyntaxError::UnterminatedRegExp)); } - - // if escaped { - // escaped = false; - // } else { - // match c { - // '[' => in_class = true, - // ']' if in_class => in_class = false, - // // Terminates content part of regex literal - // '/' if !in_class => break, - // _ => {} - // } - - // escaped = c == '\\'; - // } - - // l.bump(); - // buf.push(c); - // } - - // Ok(l.atoms.atom(&**buf)) - // })?; - - // // input is terminated without following `/` - // if !self.is(b'/') { - // let span = self.span(start); - - // return Err(Error::new(span, SyntaxError::UnterminatedRegExp)); - // } - - // self.bump(); // '/' - - // // Spec says "It is a Syntax Error if IdentifierPart contains a - // Unicode escape // sequence." TODO: check for escape - - // // Need to use `read_word` because '\uXXXX' sequences are allowed - // // here (don't ask). - // // let flags_start = self.input.cur_pos(); - // let flags = { - // match self.input.cur() { - // Some(c) if c.is_ident_start() => self - // .read_word_as_str_with(|l, s, _, _| l.atoms.atom(s)) - // .map(Some), - // _ => Ok(None), - // } - // }? - // .map(|(value, _)| value) - // .unwrap_or_default(); - - // Ok(Token::Regex(content, flags)) - // } - - // fn read_tmpl_token(&mut self, start_of_tpl: BytePos) -> LexResult - // { let start = self.input.cur_pos(); - - // let mut cooked = Ok(String::new()); - // let mut cooked_slice_start = start; - // let raw_slice_start = start; - - // macro_rules! consume_cooked { - // () => {{ - // if let Ok(cooked) = &mut cooked { - // let last_pos = self.input.cur_pos(); - // cooked.push_str(unsafe { - // // Safety: Both of start and last_pos are valid - // position because we got them // from `self.input` - // self.input.slice(cooked_slice_start, last_pos) - // }); - // } - // }}; - // } - - // while let Some(c) = self.input.cur()? { - // if c == '`' || (c == '$' && self.input.peek()? 
== Some('{')) { - // if start == self.input.cur_pos() && - // self.state.last_was_tpl_element() { if c == '$' { - // self.bump(); - // self.bump(); - // return Ok(tok!("${")); - // } else { - // self.bump(); - // return Ok(tok!('`')); - // } - // } - - // // If we don't have any escape - // let cooked = if cooked_slice_start == raw_slice_start { - // let last_pos = self.input.cur_pos(); - // let s = unsafe { - // // Safety: Both of start and last_pos are valid - // position because we got them // from `self.input` - // self.input.slice(cooked_slice_start, last_pos) - // }; - - // Ok(self.atoms.atom(s)) - // } else { - // consume_cooked!(); - - // cooked.map(|s| self.atoms.atom(s)) - // }; - - // // TODO: Handle error - // let end = self.input.cur_pos(); - // let raw = unsafe { - // // Safety: Both of start and last_pos are valid position - // because we got them // from `self.input` - // self.input.slice(raw_slice_start, end) - // }; - // return Ok(Token::Template { - // cooked, - // raw: self.atoms.atom(raw), - // }); - // } - - // if c == '\\' { - // consume_cooked!(); - - // match self.read_escaped_char(true) { - // Ok(Some(chars)) => { - // if let Ok(ref mut cooked) = cooked { - // for c in chars { - // cooked.extend(c); - // } - // } - // } - // Ok(None) => {} - // Err(error) => { - // cooked = Err(error); - // } - // } - - // cooked_slice_start = self.input.cur_pos(); - // } else if c.is_line_terminator() { - // self.state.had_line_break = true; - - // consume_cooked!(); - - // let c = if c == '\r' && self.input.peek() == Some('\n') { - // self.bump(); // '\r' - // '\n' - // } else { - // match c { - // '\n' => '\n', - // '\r' => '\n', - // '\u{2028}' => '\u{2028}', - // '\u{2029}' => '\u{2029}', - // _ => unreachable!(), - // } - // }; - - // self.bump(); - - // if let Ok(ref mut cooked) = cooked { - // cooked.push(c); - // } - // cooked_slice_start = self.input.cur_pos(); - // } else { - // self.bump(); - // } - // } - - // self.error(start_of_tpl, 
SyntaxError::UnterminatedTpl)? - // } + fn read_tmpl_token(&mut self, start_of_tpl: BytePos) -> LexResult { + let start = self.input.cur_pos(); + + let mut cooked = Ok(String::new()); + let mut cooked_slice_start = start; + let raw_slice_start = start; + + macro_rules! consume_cooked { + () => {{ + if let Ok(cooked) = &mut cooked { + let last_pos = self.input.cur_pos(); + cooked.push_str(unsafe { + // Safety: Both of start and last_pos are valid position because we got them + // from `self.input` + self.input.slice(cooked_slice_start, last_pos) + }); + } + }}; + } + + while let Some(c) = self.input.cur() { + if c == '`' || (c == '$' && self.input.peek() == Some('{')) { + if start == self.input.cur_pos() && self.state.last_was_tpl_element() { + if c == '$' { + self.bump(); + self.bump(); + return Ok(tok!("${")); + } else { + self.bump(); + return Ok(tok!('`')); + } + } + + // If we don't have any escape + let cooked = if cooked_slice_start == raw_slice_start { + let last_pos = self.input.cur_pos(); + let s = unsafe { + // Safety: Both of start and last_pos are valid position because we got them + // from `self.input` + self.input.slice(cooked_slice_start, last_pos) + }; + + Ok(self.atoms.atom(s)) + } else { + consume_cooked!(); + + cooked.map(|s| self.atoms.atom(s)) + }; + + // TODO: Handle error + let end = self.input.cur_pos(); + let raw = unsafe { + // Safety: Both of start and last_pos are valid position because we got them + // from `self.input` + self.input.slice(raw_slice_start, end) + }; + return Ok(Token::Template { + cooked, + raw: self.atoms.atom(raw), + }); + } + + if c == '\\' { + consume_cooked!(); + + match self.read_escaped_char(true) { + Ok(Some(chars)) => { + if let Ok(ref mut cooked) = cooked { + for c in chars { + cooked.extend(c); + } + } + } + Ok(None) => {} + Err(error) => { + cooked = Err(error); + } + } + + cooked_slice_start = self.input.cur_pos(); + } else if c.is_line_terminator() { + self.state.had_line_break = true; + + 
consume_cooked!(); + + let c = if c == '\r' && self.input.peek() == Some('\n') { + self.bump(); // '\r' + '\n' + } else { + match c { + '\n' => '\n', + '\r' => '\n', + '\u{2028}' => '\u{2028}', + '\u{2029}' => '\u{2029}', + _ => unreachable!(), + } + }; + + self.bump(); + + if let Ok(ref mut cooked) = cooked { + cooked.push(c); + } + cooked_slice_start = self.input.cur_pos(); + } else { + self.bump(); + } + } + + self.error(start_of_tpl, SyntaxError::UnterminatedTpl)? + } #[inline] #[allow(clippy::misnamed_getters)] diff --git a/crates/swc_ecma_parser/src/lexer/number.rs b/crates/swc_ecma_parser/src/lexer/number.rs index b60397bbeed6..1745eb75b5ea 100644 --- a/crates/swc_ecma_parser/src/lexer/number.rs +++ b/crates/swc_ecma_parser/src/lexer/number.rs @@ -27,855 +27,811 @@ impl LazyBigInt { } } -// impl Lexer<'_> { -// /// Reads an integer, octal integer, or floating-point number -// pub(super) fn read_number( -// &mut self, -// starts_with_dot: bool, -// ) -> LexResult, Atom)>> { -// debug_assert!(self.input.cur()?.is_some()); - -// if starts_with_dot { -// debug_assert_eq!( -// self.input.cur(), -// Some('.'), -// "read_number(starts_with_dot = true) expects current char to -// be '.'" ); -// } - -// let start = self.input.cur_pos(); - -// let val = if starts_with_dot { -// // first char is '.' -// 0f64 -// } else { -// let starts_with_zero = self.input.cur().unwrap() == '0'; - -// // Use read_number_no_dot to support long numbers. -// let (val, s, not_octal) = -// self.read_number_no_dot_as_str::<10>()?; - -// if self.input.eat(b'n') { -// let end = self.input.cur_pos(); -// let raw = unsafe { -// // Safety: We got both start and end position from -// `self.input` self.input.slice(start, end) -// }; - -// return Ok(Either::Right(( -// Box::new(s.into_value()), -// self.atoms.atom(raw), -// ))); -// } - -// if starts_with_zero { -// // TODO: I guess it would be okay if I don't use -ffast-math -// // (or something like that), but needs review. 
-// if val == 0.0f64 { -// // If only one zero is used, it's decimal. -// // And if multiple zero is used, it's octal. -// // -// // e.g. `0` is decimal (so it can be part of float) -// // -// // e.g. `000` is octal -// if start.0 != self.last_pos().0 - 1 { -// // `-1` is utf 8 length of `0` - -// let end = self.input.cur_pos(); -// let raw = unsafe { -// // Safety: We got both start and end position -// from `self.input` self.input.slice(start, end) -// }; -// let raw = self.atoms.atom(raw); -// return self -// .make_legacy_octal(start, 0f64) -// .map(|value| Either::Left((value, raw))); -// } -// } else { -// // strict mode hates non-zero decimals starting with -// zero. // e.g. 08.1 is strict mode violation but 0.1 is -// valid float. - -// if val.fract() == 0.0 { -// let val_str = &s.value; - -// // if it contains '8' or '9', it's decimal. -// if not_octal { -// // Continue parsing -// self.emit_strict_mode_error(start, -// SyntaxError::LegacyDecimal); } else { -// // It's Legacy octal, and we should reinterpret -// value. let val = -// BigIntValue::from_str_radix(val_str, 8) -// .unwrap_or_else(|err| { panic!( -// "failed to parse {} using -// `from_str_radix`: {:?}", val_str, err -// ) -// }) -// .to_f64() -// .unwrap_or_else(|| { -// panic!("failed to parse {} into float -// using BigInt", val_str) }); - -// let end = self.input.cur_pos(); -// let raw = unsafe { -// // Safety: We got both start and end position -// from `self.input` self.input.slice(start, -// end) }; -// let raw = self.atoms.atom(raw); - -// return self -// .make_legacy_octal(start, val) -// .map(|value| Either::Left((value, raw))); -// } -// } -// } -// } - -// val -// }; - -// // At this point, number cannot be an octal literal. - -// let mut val: f64 = val; - -// // `0.a`, `08.a`, `102.a` are invalid. 
-// // -// // `.1.a`, `.1e-4.a` are valid, -// if self.input.cur() == Some('.') { -// self.bump(); - -// if starts_with_dot { -// debug_assert!(self.input.cur()?.is_some()); -// debug_assert!(self.input.cur().unwrap().is_ascii_digit()); -// } - -// // Read numbers after dot -// self.read_int::<10>(0)?; - -// val = { -// let end = self.input.cur_pos(); -// let raw = unsafe { -// // Safety: We got both start and end position from -// `self.input` self.input.slice(start, end) -// }; - -// // Remove number separator from number -// if raw.contains('_') { -// Cow::Owned(raw.replace('_', "")) -// } else { -// Cow::Borrowed(raw) -// } -// .parse() -// .expect("failed to parse float using rust's impl") -// }; -// } - -// // Handle 'e' and 'E' -// // -// // .5e1 = 5 -// // 1e2 = 100 -// // 1e+2 = 100 -// // 1e-2 = 0.01 -// match self.input.cur() { -// Some('e') | Some('E') => { -// self.bump(); - -// let next = match self.input.cur() { -// Some(next) => next, -// None => { -// let pos = self.input.cur_pos(); -// self.error(pos, -// SyntaxError::NumLitTerminatedWithExp)? 
} -// }; - -// let positive = if next == '+' || next == '-' { -// self.bump(); // remove '+', '-' - -// next == '+' -// } else { -// true -// }; - -// let exp = self.read_number_no_dot::<10>()?; - -// val = if exp == f64::INFINITY { -// if positive && val != 0.0 { -// f64::INFINITY -// } else { -// 0.0 -// } -// } else { -// let end = self.input.cur_pos(); -// let raw = unsafe { -// // Safety: We got both start and end position from -// `self.input` self.input.slice(start, end) -// }; - -// if raw.contains('_') { -// Cow::Owned(raw.replace('_', "")) -// } else { -// Cow::Borrowed(raw) -// } -// .parse() -// .expect("failed to parse float literal") -// } -// } -// _ => {} -// } - -// self.ensure_not_ident()?; - -// let end = self.input.cur_pos(); -// let raw_str = unsafe { -// // Safety: We got both start and end position from `self.input` -// self.input.slice(start, end) -// }; -// Ok(Either::Left((val, raw_str.into()))) -// } - -// /// Returns `Left(value)` or `Right(BigInt)` -// pub(super) fn read_radix_number( -// &mut self, -// ) -> LexResult, Atom)>> { -// debug_assert!( -// RADIX == 2 || RADIX == 8 || RADIX == 16, -// "radix should be one of 2, 8, 16, but got {}", -// RADIX -// ); -// debug_assert_eq!(self.input.cur(), Some('0')); - -// let start = self.input.cur_pos(); - -// self.bump(); - -// match self.input.cur()? { -// Some(..) 
=> { -// self.bump(); -// } -// _ => { -// unreachable!(); -// } -// } - -// let (val, s, _) = self.read_number_no_dot_as_str::()?; - -// if self.input.eat(b'n') { -// let end = self.input.cur_pos(); -// let raw = unsafe { -// // Safety: We got both start and end position from -// `self.input` self.input.slice(start, end) -// }; - -// return Ok(Either::Right(( -// Box::new(s.into_value()), -// self.atoms.atom(raw), -// ))); -// } - -// self.ensure_not_ident()?; - -// let end = self.input.cur_pos(); -// let raw = unsafe { -// // Safety: We got both start and end position from `self.input` -// self.input.slice(start, end) -// }; - -// Ok(Either::Left((val, self.atoms.atom(raw)))) -// } - -// /// This can read long integers like -// /// "13612536612375123612312312312312312312312". -// fn read_number_no_dot(&mut self) -> LexResult { -// debug_assert!( -// RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, -// "radix for read_number_no_dot should be one of 2, 8, 10, 16, but -// got {}", RADIX -// ); -// let start = self.input.cur_pos(); - -// let mut read_any = false; - -// let res = self.read_digits::<_, f64, RADIX>( -// |total, radix, v| { -// read_any = true; - -// Ok((f64::mul_add(total, radix as f64, v as f64), true)) -// }, -// true, -// ); - -// if !read_any { -// self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?; -// } -// res -// } - -// /// This can read long integers like -// /// "13612536612375123612312312312312312312312". -// /// -// /// - Returned `bool` is `true` is there was `8` or `9`. 
-// fn read_number_no_dot_as_str( -// &mut self, -// ) -> LexResult<(f64, LazyBigInt, bool)> { -// debug_assert!( -// RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, -// "radix for read_number_no_dot should be one of 2, 8, 10, 16, but -// got {}", RADIX -// ); -// let start = self.input.cur_pos(); - -// let mut non_octal = false; -// let mut read_any = false; - -// self.read_digits::<_, f64, RADIX>( -// |total, radix, v| { -// read_any = true; - -// if v == 8 || v == 9 { -// non_octal = true; -// } - -// Ok((f64::mul_add(total, radix as f64, v as f64), true)) -// }, -// true, -// )?; - -// if !read_any { -// self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?; -// } - -// let end = self.input.cur_pos(); -// let raw = unsafe { -// // Safety: We got both start and end position from `self.input` -// self.input.slice(start, end) -// }; -// // Remove number separator from number -// let raw_number_str = raw.replace('_', ""); -// let parsed_float = BigIntValue::from_str_radix(&raw_number_str, RADIX -// as u32) .expect("failed to parse float using BigInt") -// .to_f64() -// .expect("failed to parse float using BigInt"); -// Ok((parsed_float, LazyBigInt::new(raw_number_str), non_octal)) -// } - -// /// Ensure that ident cannot directly follow numbers. -// fn ensure_not_ident(&mut self) -> LexResult<()> { -// match self.input.cur() { -// Some(c) if c.is_ident_start() => { -// let span = pos_span(self.input.cur_pos()); -// self.error_span(span, SyntaxError::IdentAfterNum)? -// } -// _ => Ok(()), -// } -// } - -// /// Read an integer in the given radix. Return `None` if zero digits -// /// were read, the integer value otherwise. -// /// When `len` is not zero, this -// /// will return `None` unless the integer has exactly `len` digits. 
-// pub(super) fn read_int(&mut self, len: u8) -> -// LexResult> { let mut count = 0u16; -// let v = self.read_digits::<_, Option, RADIX>( -// |opt: Option, radix, val| { -// count += 1; -// let total = opt.unwrap_or_default() * radix as f64 + val as -// f64; - -// Ok((Some(total), count != len as u16)) -// }, -// true, -// )?; -// if len != 0 && count != len as u16 { -// Ok(None) -// } else { -// Ok(v) -// } -// } - -// pub(super) fn read_int_u32(&mut self, len: u8) -> -// LexResult> { let start = self.state.start; - -// let mut count = 0; -// let v = self.read_digits::<_, Option, RADIX>( -// |opt: Option, radix, val| { -// count += 1; - -// let total = opt -// .unwrap_or_default() -// .checked_mul(radix as u32) -// .and_then(|v| v.checked_add(val)) -// .ok_or_else(|| { -// let span = Span::new(start, start); -// Error::new(span, SyntaxError::InvalidUnicodeEscape) -// })?; - -// Ok((Some(total), count != len)) -// }, -// true, -// )?; -// if len != 0 && count != len { -// Ok(None) -// } else { -// Ok(v) -// } -// } - -// /// `op`- |total, radix, value| -> (total * radix + value, continue) -// fn read_digits( -// &mut self, -// mut op: F, -// allow_num_separator: bool, -// ) -> LexResult -// where -// F: FnMut(Ret, u8, u32) -> LexResult<(Ret, bool)>, -// Ret: Copy + Default, -// { -// debug_assert!( -// RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, -// "radix for read_int should be one of 2, 8, 10, 16, but got {}", -// RADIX -// ); - -// if cfg!(feature = "debug") { -// trace!( -// "read_digits(radix = {}), cur = {:?}", -// RADIX, -// self.input.cur() -// ); -// } - -// let start = self.input.cur_pos(); -// let mut total: Ret = Default::default(); -// let mut prev = None; - -// while let Some(c) = self.input.cur() { -// if allow_num_separator && c == '_' { -// let is_allowed = |c: Option| { -// if c.is_none() { -// return false; -// } - -// let c = c.unwrap(); - -// c.is_digit(RADIX as _) -// }; -// let is_forbidden = |c: Option| { -// if c.is_none() { 
-// return true; -// } - -// if RADIX == 16 { -// matches!(c.unwrap(), '.' | 'X' | '_' | 'x') -// } else { -// matches!(c.unwrap(), '.' | 'B' | 'E' | 'O' | '_' | -// 'b' | 'e' | 'o') } -// }; - -// let next = self.input.peek(); - -// if !is_allowed(next) || is_forbidden(prev) || -// is_forbidden(next) { self.emit_error( -// start, -// -// SyntaxError::NumericSeparatorIsAllowedOnlyBetweenTwoDigits, -// ); } - -// // Ignore this _ character -// unsafe { -// // Safety: cur() returns Some(c) where c is a valid char -// self.input.bump(1); -// } - -// continue; -// } - -// // e.g. (val for a) = 10 where radix = 16 -// let val = if let Some(val) = c.to_digit(RADIX as _) { -// val -// } else { -// return Ok(total); -// }; - -// self.bump(); - -// let (t, cont) = op(total, RADIX, val)?; - -// total = t; - -// if !cont { -// return Ok(total); -// } - -// prev = Some(c); -// } - -// Ok(total) -// } - -// fn make_legacy_octal(&mut self, start: BytePos, val: f64) -> -// LexResult { self.ensure_not_ident()?; - -// if self.syntax.typescript() && self.target >= EsVersion::Es5 { -// self.emit_error(start, SyntaxError::TS1085); -// } - -// self.emit_strict_mode_error(start, SyntaxError::LegacyOctal); - -// Ok(val) -// } -// } - -// #[cfg(test)] -// mod tests { -// use std::panic; - -// use super::*; - -// fn lex(s: &'static str, f: F) -> Ret -// where -// F: FnOnce(&mut Lexer<'_>) -> Ret, -// { -// crate::with_test_sess(s, |_, input| { -// let mut l = Lexer::new( -// Syntax::Es(Default::default()), -// Default::default(), -// input, -// None, -// ); -// let ret = f(&mut l); -// assert_eq!(l.input.cur()?, None); -// Ok(ret) -// }) -// .unwrap() -// } - -// fn num(s: &'static str) -> (f64, Atom) { -// lex(s, |l| { -// l.read_number(s.starts_with('.')).unwrap().left().unwrap() -// }) -// } - -// fn int(s: &'static str) -> u32 { -// lex(s, |l| { -// l.read_int_u32::(0) -// .unwrap() -// .expect("read_int returned None") -// }) -// } - -// const LONG: &str = 
"1e10000000000000000000000000000000000000000\ -// -// 0000000000000000000000000000000000000000000000000000"; #[test] -// fn num_inf() { -// assert_eq!(num(LONG), (f64::INFINITY, LONG.into())); -// } - -// /// Number >= 2^53 -// #[test] -// fn num_big_exp() { -// assert_eq!((1e30, "1e30".into()), num("1e30")); -// } - -// #[test] -// fn num_very_big_exp() { -// const LARGE_POSITIVE_EXP: &str = -// -// "1e100000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// 00000000000000000000000000000000000000000000000000000"; -// const LARGE_NEGATIVE_EXP: &str = -// -// "1e-10000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// 000000000000000000000000000000000000000000000000000000"; -// const ZERO_WITH_LARGE_POSITIVE_EXP: &str = -// -// "0e100000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// 00000000000000000000000000000000000000000000000000000"; -// const ZERO_WITH_LARGE_NEGATIVE_EXP: &str = -// -// "0e-10000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// 000000000000000000000000000000000000000000000000000000"; -// const 
LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP: &str = -// "10000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// 000000000000000000000000000000000000000000000000000000\ -// -// e-100000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// -// 00000000000000000000000000000000000000000000000000000000000000000\ -// 000000000000000000000000000000000000000000000000000000"; - -// assert_eq!( -// num(LARGE_POSITIVE_EXP), -// (f64::INFINITY, LARGE_POSITIVE_EXP.into()) -// ); -// assert_eq!(num(LARGE_NEGATIVE_EXP), (0.0, -// LARGE_NEGATIVE_EXP.into())); assert_eq!( -// num(ZERO_WITH_LARGE_POSITIVE_EXP), -// (0.0, ZERO_WITH_LARGE_POSITIVE_EXP.into()) -// ); -// assert_eq!( -// num(ZERO_WITH_LARGE_NEGATIVE_EXP), -// (0.0, ZERO_WITH_LARGE_NEGATIVE_EXP.into()) -// ); -// assert_eq!( -// num(LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP), -// (0.0, LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP.into()) -// ); -// } - -// #[test] -// fn num_big_many_zero() { -// assert_eq!( -// ( -// 1_000_000_000_000_000_000_000_000_000_000f64, -// "1000000000000000000000000000000".into() -// ), -// num("1000000000000000000000000000000") -// ); -// assert_eq!( -// (3.402_823_466_385_288_6e38, "34028234663852886e22".into()), -// num("34028234663852886e22"), -// ); -// } - -// #[test] -// fn big_number_with_fract() { -// assert_eq!( -// (77777777777777777.1f64, "77777777777777777.1".into()), -// num("77777777777777777.1") -// ) -// } - -// #[test] -// fn issue_480() { -// assert_eq!((9.09, "9.09".into()), num("9.09")) -// } - -// #[test] -// fn num_legacy_octal() { -// assert_eq!((0o12 as f64, "0012".into()), num("0012")); -// 
assert_eq!((10f64, "012".into()), num("012")); -// } - -// #[test] -// fn read_int_1() { -// assert_eq!(60, int::<10>("60")); -// assert_eq!(0o73, int::<8>("73")); -// } - -// #[test] -// fn read_int_short() { -// assert_eq!(7, int::<10>("7")); -// assert_eq!(10, int::<10>("10")); -// } - -// #[test] -// fn read_radix_number() { -// assert_eq!( -// (0o73 as f64, "0o73".into()), -// lex("0o73", |l| l -// .read_radix_number::<8>() -// .unwrap() -// .left() -// .unwrap()) -// ); -// } - -// #[test] -// fn read_num_sep() { -// assert_eq!(1_000, int::<10>("1_000")); -// assert_eq!(0xaebece, int::<16>("AE_BE_CE")); -// assert_eq!(0b1010000110000101, int::<2>("1010_0001_1000_0101")); -// assert_eq!(0o0666, int::<8>("0_6_6_6")); -// } - -// #[test] -// fn read_bigint() { -// assert_eq!( -// lex( -// "10000000000000000000000000000000000000000000000000000n", -// |l| l.read_number(false).unwrap().right().unwrap() -// ), -// ( -// Box::new( -// "10000000000000000000000000000000000000000000000000000" -// .parse::() -// .unwrap() -// ), -// -// Atom::from("10000000000000000000000000000000000000000000000000000n") -// ), -// ); -// } - -// #[test] -// fn large_bin_number() { -// const LONG: &str = -// -// "0B11111111111111111111111111111111111111111111111101001010100000010111110001111111111" -// ; const VERY_LARGE_BINARY_NUMBER: &str = -// -// "0B1111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 
111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// -// 111111111111111111111111111111111111111111111111111111111111111111\ -// 0010111110001111111111"; -// assert_eq!( -// lex(LONG, |l| l -// .read_radix_number::<2>() -// .unwrap() -// .left() -// .unwrap()), -// (9.671_406_556_917_009e24, LONG.into()) -// ); -// assert_eq!( -// lex(VERY_LARGE_BINARY_NUMBER, |l| l -// .read_radix_number::<2>() -// .unwrap() -// .left() -// .unwrap()), -// (1.0972248137587377e304, VERY_LARGE_BINARY_NUMBER.into()) -// ); -// } - -// #[test] -// fn large_float_number() { -// const LONG: &str = "9.671406556917009e+24"; - -// assert_eq!(num(LONG), (9.671_406_556_917_009e24, LONG.into())); -// } - -// /// Valid even on strict mode. 
-// const VALID_CASES: &[&str] = &[".0", "0.e-1", "0e8", ".8e1", "0.8e1", -// "1.18e1"]; const INVALID_CASES_ON_STRICT: &[&str] = &["08e1", "08.1", -// "08.8e1", "08", "01"]; const INVALID_CASES: &[&str] = &["01.8e1", -// "012e1", "00e1", "00.0"]; - -// fn test_floats(strict: bool, success: bool, cases: &'static [&'static -// str]) { for case in cases { -// println!( -// "Testing {} (when strict = {}); Expects success = {}", -// case, strict, success -// ); -// // lazy way to get expected values -// let expected: f64 = (i64::from_str_radix(case, 8).map(|v| v as -// f64)) .or_else(|_| case.parse::().map(|v| v as f64)) -// .or_else(|_| case.parse::()) -// .unwrap_or_else(|err| { -// panic!( -// "failed to parse '{}' as float using str.parse(): -// {}", case, err -// ) -// }); - -// let vec = panic::catch_unwind(|| { -// crate::with_test_sess(case, |_, input| { -// let mut l = Lexer::new(Syntax::default(), -// Default::default(), input, None); l.ctx.strict = strict; -// Ok(l.map(|ts| ts.token).collect::>()) -// }) -// .unwrap() -// }); - -// if success { -// let vec = match vec { -// Ok(vec) => vec, -// Err(err) => panic::resume_unwind(err), -// }; - -// assert_eq!(vec.len(), 1); - -// let token = vec.into_iter().next().unwrap(); -// let value = match token { -// Token::Num { value, .. 
} => value, -// _ => { -// panic!("expected num token in test") -// } -// }; - -// assert_eq!(expected, value); -// } else if let Ok(vec) = vec { -// assert_ne!( -// vec![Token::Num { -// value: expected, -// raw: expected.to_string().into() -// }], -// vec -// ) -// } -// } -// } - -// // #[test] -// // fn strict_mode() { -// // test_floats(true, true, VALID_CASES); -// // test_floats(true, false, INVALID_CASES_ON_STRICT); -// // test_floats(true, false, INVALID_CASES); -// // } - -// #[test] -// fn non_strict() { -// test_floats(false, true, VALID_CASES); -// test_floats(false, true, INVALID_CASES_ON_STRICT); -// test_floats(false, false, INVALID_CASES); -// } -// } +impl Lexer<'_> { + /// Reads an integer, octal integer, or floating-point number + pub(super) fn read_number( + &mut self, + starts_with_dot: bool, + ) -> LexResult, Atom)>> { + debug_assert!(self.input.cur()?.is_some()); + + if starts_with_dot { + debug_assert_eq!( + self.input.cur(), + Some('.'), + "read_number(starts_with_dot = true) expects current char to be '.'" + ); + } + + let start = self.input.cur_pos(); + + let val = if starts_with_dot { + // first char is '.' + 0f64 + } else { + let starts_with_zero = self.input.cur().unwrap() == '0'; + + // Use read_number_no_dot to support long numbers. + let (val, s, not_octal) = self.read_number_no_dot_as_str::<10>()?; + + if self.input.eat(b'n') { + let end = self.input.cur_pos(); + let raw = unsafe { + // Safety: We got both start and end position from `self.input` + self.input.slice(start, end) + }; + + return Ok(Either::Right(( + Box::new(s.into_value()), + self.atoms.atom(raw), + ))); + } + + if starts_with_zero { + // TODO: I guess it would be okay if I don't use -ffast-math + // (or something like that), but needs review. + if val == 0.0f64 { + // If only one zero is used, it's decimal. + // And if multiple zero is used, it's octal. + // + // e.g. `0` is decimal (so it can be part of float) + // + // e.g. 
`000` is octal + if start.0 != self.last_pos().0 - 1 { + // `-1` is utf 8 length of `0` + + let end = self.input.cur_pos(); + let raw = unsafe { + // Safety: We got both start and end position from `self.input` + self.input.slice(start, end) + }; + let raw = self.atoms.atom(raw); + return self + .make_legacy_octal(start, 0f64) + .map(|value| Either::Left((value, raw))); + } + } else { + // strict mode hates non-zero decimals starting with zero. + // e.g. 08.1 is strict mode violation but 0.1 is valid float. + + if val.fract() == 0.0 { + let val_str = &s.value; + + // if it contains '8' or '9', it's decimal. + if not_octal { + // Continue parsing + self.emit_strict_mode_error(start, SyntaxError::LegacyDecimal); + } else { + // It's Legacy octal, and we should reinterpret value. + let val = BigIntValue::from_str_radix(val_str, 8) + .unwrap_or_else(|err| { + panic!( + "failed to parse {} using `from_str_radix`: {:?}", + val_str, err + ) + }) + .to_f64() + .unwrap_or_else(|| { + panic!("failed to parse {} into float using BigInt", val_str) + }); + + let end = self.input.cur_pos(); + let raw = unsafe { + // Safety: We got both start and end position from `self.input` + self.input.slice(start, end) + }; + let raw = self.atoms.atom(raw); + + return self + .make_legacy_octal(start, val) + .map(|value| Either::Left((value, raw))); + } + } + } + } + + val + }; + + // At this point, number cannot be an octal literal. + + let mut val: f64 = val; + + // `0.a`, `08.a`, `102.a` are invalid. 
+ // + // `.1.a`, `.1e-4.a` are valid, + if self.input.cur() == Some('.') { + self.bump(); + + if starts_with_dot { + debug_assert!(self.input.cur()?.is_some()); + debug_assert!(self.input.cur().unwrap().is_ascii_digit()); + } + + // Read numbers after dot + self.read_int::<10>(0)?; + + val = { + let end = self.input.cur_pos(); + let raw = unsafe { + // Safety: We got both start and end position from `self.input` + self.input.slice(start, end) + }; + + // Remove number separator from number + if raw.contains('_') { + Cow::Owned(raw.replace('_', "")) + } else { + Cow::Borrowed(raw) + } + .parse() + .expect("failed to parse float using rust's impl") + }; + } + + // Handle 'e' and 'E' + // + // .5e1 = 5 + // 1e2 = 100 + // 1e+2 = 100 + // 1e-2 = 0.01 + match self.input.cur() { + Some('e') | Some('E') => { + self.bump(); + + let next = match self.input.cur() { + Some(next) => next, + None => { + let pos = self.input.cur_pos(); + self.error(pos, SyntaxError::NumLitTerminatedWithExp)? + } + }; + + let positive = if next == '+' || next == '-' { + self.bump(); // remove '+', '-' + + next == '+' + } else { + true + }; + + let exp = self.read_number_no_dot::<10>()?; + + val = if exp == f64::INFINITY { + if positive && val != 0.0 { + f64::INFINITY + } else { + 0.0 + } + } else { + let end = self.input.cur_pos(); + let raw = unsafe { + // Safety: We got both start and end position from `self.input` + self.input.slice(start, end) + }; + + if raw.contains('_') { + Cow::Owned(raw.replace('_', "")) + } else { + Cow::Borrowed(raw) + } + .parse() + .expect("failed to parse float literal") + } + } + _ => {} + } + + self.ensure_not_ident()?; + + let end = self.input.cur_pos(); + let raw_str = unsafe { + // Safety: We got both start and end position from `self.input` + self.input.slice(start, end) + }; + Ok(Either::Left((val, raw_str.into()))) + } + + /// Returns `Left(value)` or `Right(BigInt)` + pub(super) fn read_radix_number( + &mut self, + ) -> LexResult, Atom)>> { + 
debug_assert!( + RADIX == 2 || RADIX == 8 || RADIX == 16, + "radix should be one of 2, 8, 16, but got {}", + RADIX + ); + debug_assert_eq!(self.input.cur(), Some('0')); + + let start = self.input.cur_pos(); + + self.bump(); + + match self.input.cur()? { + Some(..) => { + self.bump(); + } + _ => { + unreachable!(); + } + } + + let (val, s, _) = self.read_number_no_dot_as_str::()?; + + if self.input.eat(b'n') { + let end = self.input.cur_pos(); + let raw = unsafe { + // Safety: We got both start and end position from `self.input` + self.input.slice(start, end) + }; + + return Ok(Either::Right(( + Box::new(s.into_value()), + self.atoms.atom(raw), + ))); + } + + self.ensure_not_ident()?; + + let end = self.input.cur_pos(); + let raw = unsafe { + // Safety: We got both start and end position from `self.input` + self.input.slice(start, end) + }; + + Ok(Either::Left((val, self.atoms.atom(raw)))) + } + + /// This can read long integers like + /// "13612536612375123612312312312312312312312". + fn read_number_no_dot(&mut self) -> LexResult { + debug_assert!( + RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, + "radix for read_number_no_dot should be one of 2, 8, 10, 16, but got {}", + RADIX + ); + let start = self.input.cur_pos(); + + let mut read_any = false; + + let res = self.read_digits::<_, f64, RADIX>( + |total, radix, v| { + read_any = true; + + Ok((f64::mul_add(total, radix as f64, v as f64), true)) + }, + true, + ); + + if !read_any { + self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?; + } + res + } + + /// This can read long integers like + /// "13612536612375123612312312312312312312312". + /// + /// - Returned `bool` is `true` is there was `8` or `9`. 
+ fn read_number_no_dot_as_str( + &mut self, + ) -> LexResult<(f64, LazyBigInt, bool)> { + debug_assert!( + RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, + "radix for read_number_no_dot should be one of 2, 8, 10, 16, but got {}", + RADIX + ); + let start = self.input.cur_pos(); + + let mut non_octal = false; + let mut read_any = false; + + self.read_digits::<_, f64, RADIX>( + |total, radix, v| { + read_any = true; + + if v == 8 || v == 9 { + non_octal = true; + } + + Ok((f64::mul_add(total, radix as f64, v as f64), true)) + }, + true, + )?; + + if !read_any { + self.error(start, SyntaxError::ExpectedDigit { radix: RADIX })?; + } + + let end = self.input.cur_pos(); + let raw = unsafe { + // Safety: We got both start and end position from `self.input` + self.input.slice(start, end) + }; + // Remove number separator from number + let raw_number_str = raw.replace('_', ""); + let parsed_float = BigIntValue::from_str_radix(&raw_number_str, RADIX as u32) + .expect("failed to parse float using BigInt") + .to_f64() + .expect("failed to parse float using BigInt"); + Ok((parsed_float, LazyBigInt::new(raw_number_str), non_octal)) + } + + /// Ensure that ident cannot directly follow numbers. + fn ensure_not_ident(&mut self) -> LexResult<()> { + match self.input.cur() { + Some(c) if c.is_ident_start() => { + let span = pos_span(self.input.cur_pos()); + self.error_span(span, SyntaxError::IdentAfterNum)? + } + _ => Ok(()), + } + } + + /// Read an integer in the given radix. Return `None` if zero digits + /// were read, the integer value otherwise. + /// When `len` is not zero, this + /// will return `None` unless the integer has exactly `len` digits. 
+ pub(super) fn read_int(&mut self, len: u8) -> LexResult> { + let mut count = 0u16; + let v = self.read_digits::<_, Option, RADIX>( + |opt: Option, radix, val| { + count += 1; + let total = opt.unwrap_or_default() * radix as f64 + val as f64; + + Ok((Some(total), count != len as u16)) + }, + true, + )?; + if len != 0 && count != len as u16 { + Ok(None) + } else { + Ok(v) + } + } + + pub(super) fn read_int_u32(&mut self, len: u8) -> LexResult> { + let start = self.state.start; + + let mut count = 0; + let v = self.read_digits::<_, Option, RADIX>( + |opt: Option, radix, val| { + count += 1; + + let total = opt + .unwrap_or_default() + .checked_mul(radix as u32) + .and_then(|v| v.checked_add(val)) + .ok_or_else(|| { + let span = Span::new(start, start); + Error::new(span, SyntaxError::InvalidUnicodeEscape) + })?; + + Ok((Some(total), count != len)) + }, + true, + )?; + if len != 0 && count != len { + Ok(None) + } else { + Ok(v) + } + } + + /// `op`- |total, radix, value| -> (total * radix + value, continue) + fn read_digits( + &mut self, + mut op: F, + allow_num_separator: bool, + ) -> LexResult + where + F: FnMut(Ret, u8, u32) -> LexResult<(Ret, bool)>, + Ret: Copy + Default, + { + debug_assert!( + RADIX == 2 || RADIX == 8 || RADIX == 10 || RADIX == 16, + "radix for read_int should be one of 2, 8, 10, 16, but got {}", + RADIX + ); + + if cfg!(feature = "debug") { + trace!( + "read_digits(radix = {}), cur = {:?}", + RADIX, + self.input.cur() + ); + } + + let start = self.input.cur_pos(); + let mut total: Ret = Default::default(); + let mut prev = None; + + while let Some(c) = self.input.cur() { + if allow_num_separator && c == '_' { + let is_allowed = |c: Option| { + if c.is_none() { + return false; + } + + let c = c.unwrap(); + + c.is_digit(RADIX as _) + }; + let is_forbidden = |c: Option| { + if c.is_none() { + return true; + } + + if RADIX == 16 { + matches!(c.unwrap(), '.' | 'X' | '_' | 'x') + } else { + matches!(c.unwrap(), '.' 
| 'B' | 'E' | 'O' | '_' | 'b' | 'e' | 'o') + } + }; + + let next = self.input.peek(); + + if !is_allowed(next) || is_forbidden(prev) || is_forbidden(next) { + self.emit_error( + start, + SyntaxError::NumericSeparatorIsAllowedOnlyBetweenTwoDigits, + ); + } + + // Ignore this _ character + unsafe { + // Safety: cur() returns Some(c) where c is a valid char + self.input.bump(1); + } + + continue; + } + + // e.g. (val for a) = 10 where radix = 16 + let val = if let Some(val) = c.to_digit(RADIX as _) { + val + } else { + return Ok(total); + }; + + self.bump(); + + let (t, cont) = op(total, RADIX, val)?; + + total = t; + + if !cont { + return Ok(total); + } + + prev = Some(c); + } + + Ok(total) + } + + fn make_legacy_octal(&mut self, start: BytePos, val: f64) -> LexResult { + self.ensure_not_ident()?; + + if self.syntax.typescript() && self.target >= EsVersion::Es5 { + self.emit_error(start, SyntaxError::TS1085); + } + + self.emit_strict_mode_error(start, SyntaxError::LegacyOctal); + + Ok(val) + } +} + +#[cfg(test)] +mod tests { + use std::panic; + + use super::*; + + fn lex(s: &'static str, f: F) -> Ret + where + F: FnOnce(&mut Lexer<'_>) -> Ret, + { + crate::with_test_sess(s, |_, input| { + let mut l = Lexer::new( + Syntax::Es(Default::default()), + Default::default(), + input, + None, + ); + let ret = f(&mut l); + assert_eq!(l.input.cur(), None); + Ok(ret) + }) + .unwrap() + } + + fn num(s: &'static str) -> (f64, Atom) { + lex(s, |l| { + l.read_number(s.starts_with('.')).unwrap().left().unwrap() + }) + } + + fn int(s: &'static str) -> u32 { + lex(s, |l| { + l.read_int_u32::(0) + .unwrap() + .expect("read_int returned None") + }) + } + + const LONG: &str = "1e10000000000000000000000000000000000000000\ + 0000000000000000000000000000000000000000000000000000"; + #[test] + fn num_inf() { + assert_eq!(num(LONG), (f64::INFINITY, LONG.into())); + } + + /// Number >= 2^53 + #[test] + fn num_big_exp() { + assert_eq!((1e30, "1e30".into()), num("1e30")); + } + + #[test] + fn 
num_very_big_exp() { + const LARGE_POSITIVE_EXP: &str = + "1e100000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000"; + const LARGE_NEGATIVE_EXP: &str = + "1e-10000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 000000000000000000000000000000000000000000000000000000"; + const ZERO_WITH_LARGE_POSITIVE_EXP: &str = + "0e100000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000"; + const ZERO_WITH_LARGE_NEGATIVE_EXP: &str = + "0e-10000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 000000000000000000000000000000000000000000000000000000"; + const LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP: &str = + "10000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 000000000000000000000000000000000000000000000000000000\ + e-100000000000000000000000000000000000000000000000000000000000000\ + 
00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 00000000000000000000000000000000000000000000000000000000000000000\ + 000000000000000000000000000000000000000000000000000000"; + + assert_eq!( + num(LARGE_POSITIVE_EXP), + (f64::INFINITY, LARGE_POSITIVE_EXP.into()) + ); + assert_eq!(num(LARGE_NEGATIVE_EXP), (0.0, LARGE_NEGATIVE_EXP.into())); + assert_eq!( + num(ZERO_WITH_LARGE_POSITIVE_EXP), + (0.0, ZERO_WITH_LARGE_POSITIVE_EXP.into()) + ); + assert_eq!( + num(ZERO_WITH_LARGE_NEGATIVE_EXP), + (0.0, ZERO_WITH_LARGE_NEGATIVE_EXP.into()) + ); + assert_eq!( + num(LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP), + (0.0, LARGE_MANTISSA_WITH_LARGE_NEGATIVE_EXP.into()) + ); + } + + #[test] + fn num_big_many_zero() { + assert_eq!( + ( + 1_000_000_000_000_000_000_000_000_000_000f64, + "1000000000000000000000000000000".into() + ), + num("1000000000000000000000000000000") + ); + assert_eq!( + (3.402_823_466_385_288_6e38, "34028234663852886e22".into()), + num("34028234663852886e22"), + ); + } + + #[test] + fn big_number_with_fract() { + assert_eq!( + (77777777777777777.1f64, "77777777777777777.1".into()), + num("77777777777777777.1") + ) + } + + #[test] + fn issue_480() { + assert_eq!((9.09, "9.09".into()), num("9.09")) + } + + #[test] + fn num_legacy_octal() { + assert_eq!((0o12 as f64, "0012".into()), num("0012")); + assert_eq!((10f64, "012".into()), num("012")); + } + + #[test] + fn read_int_1() { + assert_eq!(60, int::<10>("60")); + assert_eq!(0o73, int::<8>("73")); + } + + #[test] + fn read_int_short() { + assert_eq!(7, int::<10>("7")); + assert_eq!(10, int::<10>("10")); + } + + #[test] + fn read_radix_number() { + assert_eq!( + (0o73 as f64, "0o73".into()), + lex("0o73", |l| l + .read_radix_number::<8>() + .unwrap() + .left() + .unwrap()) + ); + } + + #[test] + fn read_num_sep() { + assert_eq!(1_000, int::<10>("1_000")); + assert_eq!(0xaebece, int::<16>("AE_BE_CE")); + 
assert_eq!(0b1010000110000101, int::<2>("1010_0001_1000_0101")); + assert_eq!(0o0666, int::<8>("0_6_6_6")); + } + + #[test] + fn read_bigint() { + assert_eq!( + lex( + "10000000000000000000000000000000000000000000000000000n", + |l| l.read_number(false).unwrap().right().unwrap() + ), + ( + Box::new( + "10000000000000000000000000000000000000000000000000000" + .parse::() + .unwrap() + ), + Atom::from("10000000000000000000000000000000000000000000000000000n") + ), + ); + } + + #[test] + fn large_bin_number() { + const LONG: &str = + "0B11111111111111111111111111111111111111111111111101001010100000010111110001111111111"; + const VERY_LARGE_BINARY_NUMBER: &str = + "0B1111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 111111111111111111111111111111111111111111111111111111111111111111\ + 0010111110001111111111"; + assert_eq!( + lex(LONG, |l| l + .read_radix_number::<2>() + .unwrap() + .left() + .unwrap()), + (9.671_406_556_917_009e24, LONG.into()) + ); + assert_eq!( + lex(VERY_LARGE_BINARY_NUMBER, |l| l + .read_radix_number::<2>() + .unwrap() + .left() + 
.unwrap()), + (1.0972248137587377e304, VERY_LARGE_BINARY_NUMBER.into()) + ); + } + + #[test] + fn large_float_number() { + const LONG: &str = "9.671406556917009e+24"; + + assert_eq!(num(LONG), (9.671_406_556_917_009e24, LONG.into())); + } + + /// Valid even on strict mode. + const VALID_CASES: &[&str] = &[".0", "0.e-1", "0e8", ".8e1", "0.8e1", "1.18e1"]; + const INVALID_CASES_ON_STRICT: &[&str] = &["08e1", "08.1", "08.8e1", "08", "01"]; + const INVALID_CASES: &[&str] = &["01.8e1", "012e1", "00e1", "00.0"]; + + fn test_floats(strict: bool, success: bool, cases: &'static [&'static str]) { + for case in cases { + println!( + "Testing {} (when strict = {}); Expects success = {}", + case, strict, success + ); + // lazy way to get expected values + let expected: f64 = (i64::from_str_radix(case, 8).map(|v| v as f64)) + .or_else(|_| case.parse::().map(|v| v as f64)) + .or_else(|_| case.parse::()) + .unwrap_or_else(|err| { + panic!( + "failed to parse '{}' as float using str.parse(): {}", + case, err + ) + }); + + let vec = panic::catch_unwind(|| { + crate::with_test_sess(case, |_, input| { + let mut l = Lexer::new(Syntax::default(), Default::default(), input, None); + l.ctx.strict = strict; + Ok(l.map(|ts| ts.token).collect::>()) + }) + .unwrap() + }); + + if success { + let vec = match vec { + Ok(vec) => vec, + Err(err) => panic::resume_unwind(err), + }; + + assert_eq!(vec.len(), 1); + + let token = vec.into_iter().next().unwrap(); + let value = match token { + Token::Num { value, .. 
} => value, + _ => { + panic!("expected num token in test") + } + }; + + assert_eq!(expected, value); + } else if let Ok(vec) = vec { + assert_ne!( + vec![Token::Num { + value: expected, + raw: expected.to_string().into() + }], + vec + ) + } + } + } + + // #[test] + // fn strict_mode() { + // test_floats(true, true, VALID_CASES); + // test_floats(true, false, INVALID_CASES_ON_STRICT); + // test_floats(true, false, INVALID_CASES); + // } + + #[test] + fn non_strict() { + test_floats(false, true, VALID_CASES); + test_floats(false, true, INVALID_CASES_ON_STRICT); + test_floats(false, false, INVALID_CASES); + } +} diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 6c8ba9a9dae1..77d8edc324fe 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -246,101 +246,100 @@ impl Lexer<'_> { } fn next_token(&mut self, start: &mut BytePos) -> Result, Error> { - // if let Some(start) = self.state.next_regexp { - // return Ok(Some(self.read_regexp(start)?)); - // } - - // if self.state.is_first { - // if let Some(shebang) = self.read_shebang()? { - // return Ok(Some(Token::Shebang(shebang))); - // } - // } - - // self.state.had_line_break = self.state.is_first; - // self.state.is_first = false; - - // // skip spaces before getting next character, if we are allowed to. - // if self.state.can_skip_space() { - // self.skip_space::(); - // *start = self.input.cur_pos(); - // }; - - // match self.input.cur()? { - // Some(..) => {} - // // End of input. 
- // None => { - // self.consume_pending_comments(); - - // return Ok(None); - // } - // }; - - // // println!( - // // "\tContext: ({:?}) {:?}", - // // self.input.cur().unwrap(), - // // self.state.context.0 - // // ); - - // self.state.start = *start; - - // if self.syntax.jsx() && !self.ctx.in_property_name && !self.ctx.in_type { - // //jsx - // if self.state.context.current() == Some(TokenContext::JSXExpr) { - // return self.read_jsx_token(); - // } - - // let c = self.input.cur()?; - // if let Some(c) = c { - // if self.state.context.current() == Some(TokenContext::JSXOpeningTag) - // || self.state.context.current() == Some(TokenContext::JSXClosingTag) - // { - // if c.is_ident_start() { - // return self.read_jsx_word().map(Some); - // } - - // if c == '>' { - // unsafe { - // // Safety: cur() is Some('>') - // self.input.bump(1); - // } - // return Ok(Some(Token::JSXTagEnd)); - // } - - // if (c == '\'' || c == '"') - // && self.state.context.current() == - // Some(TokenContext::JSXOpeningTag) { - // return self.read_jsx_str(c).map(Some); - // } - // } - - // if c == '<' && self.state.is_expr_allowed && self.input.peek() != - // Some('!') { let had_line_break_before_last = - // self.had_line_break_before_last(); let cur_pos = - // self.input.cur_pos(); - - // unsafe { - // // Safety: cur() is Some('<') - // self.input.bump(1); - // } - - // if had_line_break_before_last && self.is_str("<<<<<< ") { - // let span = Span::new(cur_pos, cur_pos + BytePos(7)); - - // self.emit_error_span(span, SyntaxError::TS1185); - // self.skip_line_comment(6); - // self.skip_space::(); - // return self.read_token(); - // } - - // return Ok(Some(Token::JSXTagStart)); - // } - // } - // } - - // if let Some(TokenContext::Tpl {}) = self.state.context.current() { - // let start = self.state.tpl_start; - // return self.read_tmpl_token(start).map(Some); - // } + if let Some(start) = self.state.next_regexp { + return Ok(Some(self.read_regexp(start)?)); + } + + if self.state.is_first { 
+ if let Some(shebang) = self.read_shebang()? { + return Ok(Some(Token::Shebang(shebang))); + } + } + + self.state.had_line_break = self.state.is_first; + self.state.is_first = false; + + // skip spaces before getting next character, if we are allowed to. + if self.state.can_skip_space() { + self.skip_space::(); + *start = self.input.cur_pos(); + }; + + match self.input.cur()? { + Some(..) => {} + // End of input. + None => { + self.consume_pending_comments(); + + return Ok(None); + } + }; + + // println!( + // "\tContext: ({:?}) {:?}", + // self.input.cur().unwrap(), + // self.state.context.0 + // ); + + self.state.start = *start; + + if self.syntax.jsx() && !self.ctx.in_property_name && !self.ctx.in_type { + //jsx + if self.state.context.current() == Some(TokenContext::JSXExpr) { + return self.read_jsx_token(); + } + + let c = self.input.cur()?; + if let Some(c) = c { + if self.state.context.current() == Some(TokenContext::JSXOpeningTag) + || self.state.context.current() == Some(TokenContext::JSXClosingTag) + { + if c.is_ident_start() { + return self.read_jsx_word().map(Some); + } + + if c == '>' { + unsafe { + // Safety: cur() is Some('>') + self.input.bump(1); + } + return Ok(Some(Token::JSXTagEnd)); + } + + if (c == '\'' || c == '"') + && self.state.context.current() == Some(TokenContext::JSXOpeningTag) + { + return self.read_jsx_str(c).map(Some); + } + } + + if c == '<' && self.state.is_expr_allowed && self.input.peek() != Some('!') { + let had_line_break_before_last = self.had_line_break_before_last(); + let cur_pos = self.input.cur_pos(); + + unsafe { + // Safety: cur() is Some('<') + self.input.bump(1); + } + + if had_line_break_before_last && self.is_str("<<<<<< ") { + let span = Span::new(cur_pos, cur_pos + BytePos(7)); + + self.emit_error_span(span, SyntaxError::TS1185); + self.skip_line_comment(6); + self.skip_space::(); + return self.read_token(); + } + + return Ok(Some(Token::JSXTagStart)); + } + } + } + + if let Some(TokenContext::Tpl {}) = 
self.state.context.current() { + let start = self.state.tpl_start; + return self.read_tmpl_token(start).map(Some); + } self.read_token() } @@ -372,7 +371,7 @@ impl Iterator for Lexer<'_> { } self.state.update(start, token.kind()); - self.state.prev_hi = self.input.cur_pos(); + self.state.prev_hi = self.last_pos(); self.state.had_line_break_before_last = self.had_line_break_before_last(); } diff --git a/crates/swc_ecma_parser/src/lexer/util.rs b/crates/swc_ecma_parser/src/lexer/util.rs index f263592aa83c..1d835491154c 100644 --- a/crates/swc_ecma_parser/src/lexer/util.rs +++ b/crates/swc_ecma_parser/src/lexer/util.rs @@ -21,7 +21,7 @@ use crate::{ impl Lexer<'_> { pub(super) fn span(&self, start: BytePos) -> Span { - let end = self.input.cur_pos(); + let end = self.last_pos(); if cfg!(debug_assertions) && start > end { unreachable!( "assertion failed: (span.start <= span.end). @@ -110,185 +110,183 @@ impl Lexer<'_> { self.add_module_mode_error(err); } - // /// Skip comments or whitespaces. - // /// - // /// See https://tc39.github.io/ecma262/#sec-white-space - // #[inline(never)] - // pub(super) fn skip_space(&mut self) -> Result<()> { - // loop { - // let (offset, newline) = { - // let mut skip = SkipWhitespace { - // input: self.input.as_str(), - // newline: false, - // offset: 0, - // }; - - // skip.scan(); - - // (skip.offset, skip.newline) - // }; - - // self.input.bump(offset as usize); - // if newline { - // self.state.had_line_break = true; - // } - - // if LEX_COMMENTS && self.input.is_byte(b'/') { - // if self.input.peek()? == Some('/') { - // self.skip_line_comment(2); - // continue; - // } else if self.input.peek()? 
== Some('*') { - // self.skip_block_comment(); - // continue; - // } - // } - - // break; - // } - - // Ok(()) - // } - - // #[inline(never)] - // pub(super) fn skip_line_comment(&mut self, start_skip: usize) { - // let start = self.input.cur_pos(); - // self.input.bump_bytes(start_skip); - // let slice_start = self.input.cur_pos(); - - // // foo // comment for foo - // // bar - // // - // // foo - // // // comment for bar - // // bar - // // - // let is_for_next = self.state.had_line_break || - // !self.state.can_have_trailing_line_comment(); - - // let idx = self - // .input - // .as_str() - // .find(['\r', '\n', '\u{2028}', '\u{2029}']) - // .map_or(self.input.as_str().len(), |v| { - // self.state.had_line_break = true; - // v - // }); - - // self.input.bump_bytes(idx); - // let end = self.input.cur_pos(); - - // if let Some(comments) = self.comments_buffer.as_mut() { - // let s = unsafe { - // // Safety: We know that the start and the end are valid - // self.input.slice(slice_start, end) - // }; - // let cmt = Comment { - // kind: CommentKind::Line, - // span: Span::new(start, end), - // text: self.atoms.atom(s), - // }; - - // if is_for_next { - // comments.push_pending_leading(cmt); - // } else { - // comments.push(BufferedComment { - // kind: BufferedCommentKind::Trailing, - // pos: self.state.prev_hi, - // comment: cmt, - // }); - // } - // } - - // unsafe { - // // Safety: We got end from self.input - // self.input.reset_to(end); - // } - // } - - // /// Expects current char to be '/' and next char to be '*'. 
- // #[inline(never)] - // pub(super) fn skip_block_comment(&mut self) { - // let start = self.input.cur_pos(); - - // debug_assert_eq!(self.input.cur(), Some('/')); - // debug_assert_eq!(self.input.peek(), Some('*')); - - // self.input.bump_bytes(2); - - // // jsdoc - // let slice_start = self.input.cur_pos(); - // let mut was_star = if self.input.is_byte(b'*') { - // self.bump(); - // true - // } else { - // false - // }; - - // let mut is_for_next = self.state.had_line_break || - // !self.state.can_have_trailing_comment(); - - // while let Some(c) = self.input.cur()? { - // if was_star && c == '/' { - // debug_assert_eq!(self.input.cur(), Some('/')); - // self.bump(); // '/' - - // let end = self.input.cur_pos(); - - // self.skip_space::(); - - // if self.input.eat(RawToken::Semi)? { - // is_for_next = false; - // } - - // self.store_comment(is_for_next, start, end, slice_start); - - // return; - // } - // if c.is_line_terminator() { - // self.state.had_line_break = true; - // } - - // was_star = c == '*'; - // self.bump(); - // } - - // let end = self.input.end_pos(); - // let span = Span::new(end, end); - // self.emit_error_span(span, SyntaxError::UnterminatedBlockComment) - // } - - // #[inline(never)] - // fn store_comment( - // &mut self, - // is_for_next: bool, - // start: BytePos, - // end: BytePos, - // slice_start: BytePos, - // ) { - // if let Some(comments) = self.comments_buffer.as_mut() { - // let src = unsafe { - // // Safety: We got slice_start and end from self.input so those - // are valid. 
self.input.slice(slice_start, end) - // }; - // let s = &src[..src.len() - 2]; - // let cmt = Comment { - // kind: CommentKind::Block, - // span: Span::new(start, end), - // text: self.atoms.atom(s), - // }; - - // let _ = self.input.peek(); - // if is_for_next { - // comments.push_pending_leading(cmt); - // } else { - // comments.push(BufferedComment { - // kind: BufferedCommentKind::Trailing, - // pos: self.state.prev_hi, - // comment: cmt, - // }); - // } - // } - // } + /// Skip comments or whitespaces. + /// + /// See https://tc39.github.io/ecma262/#sec-white-space + #[inline(never)] + pub(super) fn skip_space(&mut self) -> Result<()> { + loop { + let (offset, newline) = { + let mut skip = SkipWhitespace { + input: self.input.as_str(), + newline: false, + offset: 0, + }; + + skip.scan(); + + (skip.offset, skip.newline) + }; + + self.input.bump(offset as usize); + if newline { + self.state.had_line_break = true; + } + + if LEX_COMMENTS && self.input.is_byte(b'/') { + if self.input.peek()? == Some('/') { + self.skip_line_comment(2); + continue; + } else if self.input.peek()? 
== Some('*') { + self.skip_block_comment(); + continue; + } + } + + break; + } + + Ok(()) + } + + #[inline(never)] + pub(super) fn skip_line_comment(&mut self, start_skip: usize) { + let start = self.input.cur_pos(); + self.input.bump_bytes(start_skip); + let slice_start = self.input.cur_pos(); + + // foo // comment for foo + // bar + // + // foo + // // comment for bar + // bar + // + let is_for_next = self.state.had_line_break || !self.state.can_have_trailing_line_comment(); + + let idx = self + .input + .as_str() + .find(['\r', '\n', '\u{2028}', '\u{2029}']) + .map_or(self.input.as_str().len(), |v| { + self.state.had_line_break = true; + v + }); + + self.input.bump_bytes(idx); + let end = self.input.cur_pos(); + + if let Some(comments) = self.comments_buffer.as_mut() { + let s = unsafe { + // Safety: We know that the start and the end are valid + self.input.slice(slice_start, end) + }; + let cmt = Comment { + kind: CommentKind::Line, + span: Span::new(start, end), + text: self.atoms.atom(s), + }; + + if is_for_next { + comments.push_pending_leading(cmt); + } else { + comments.push(BufferedComment { + kind: BufferedCommentKind::Trailing, + pos: self.state.prev_hi, + comment: cmt, + }); + } + } + + unsafe { + // Safety: We got end from self.input + self.input.reset_to(end); + } + } + + /// Expects current char to be '/' and next char to be '*'. + #[inline(never)] + pub(super) fn skip_block_comment(&mut self) { + let start = self.input.cur_pos(); + + debug_assert_eq!(self.input.cur(), Some('/')); + debug_assert_eq!(self.input.peek(), Some('*')); + + self.input.bump_bytes(2); + + // jsdoc + let slice_start = self.input.cur_pos(); + let mut was_star = if self.input.is_byte(b'*') { + self.bump(); + true + } else { + false + }; + + let mut is_for_next = self.state.had_line_break || !self.state.can_have_trailing_comment(); + + while let Some(c) = self.input.cur()? 
{ + if was_star && c == '/' { + debug_assert_eq!(self.input.cur(), Some('/')); + self.bump(); // '/' + + let end = self.input.cur_pos(); + + self.skip_space::(); + + if self.input.eat(RawToken::Semi)? { + is_for_next = false; + } + + self.store_comment(is_for_next, start, end, slice_start); + + return; + } + if c.is_line_terminator() { + self.state.had_line_break = true; + } + + was_star = c == '*'; + self.bump(); + } + + let end = self.input.end_pos(); + let span = Span::new(end, end); + self.emit_error_span(span, SyntaxError::UnterminatedBlockComment) + } + + #[inline(never)] + fn store_comment( + &mut self, + is_for_next: bool, + start: BytePos, + end: BytePos, + slice_start: BytePos, + ) { + if let Some(comments) = self.comments_buffer.as_mut() { + let src = unsafe { + // Safety: We got slice_start and end from self.input so those are valid. + self.input.slice(slice_start, end) + }; + let s = &src[..src.len() - 2]; + let cmt = Comment { + kind: CommentKind::Block, + span: Span::new(start, end), + text: self.atoms.atom(s), + }; + + let _ = self.input.peek(); + if is_for_next { + comments.push_pending_leading(cmt); + } else { + comments.push(BufferedComment { + kind: BufferedCommentKind::Trailing, + pos: self.state.prev_hi, + comment: cmt, + }); + } + } + } } /// Implemented for `char`. 
diff --git a/crates/swc_ecma_parser/src/lib.rs b/crates/swc_ecma_parser/src/lib.rs index 0effb8d826ba..e2ed67dbff1f 100644 --- a/crates/swc_ecma_parser/src/lib.rs +++ b/crates/swc_ecma_parser/src/lib.rs @@ -115,8 +115,8 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr(test, feature(test))] -// #![deny(clippy::all)] -// #![deny(unused)] +#![deny(clippy::all)] +#![deny(unused)] #![allow(clippy::nonminimal_bool)] #![allow(clippy::too_many_arguments)] #![allow(clippy::unnecessary_unwrap)] From 8d5d0ccfdffe57f32c4b20399e4880b2f620d57b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 19:29:46 +0900 Subject: [PATCH 077/201] mod jsx --- crates/swc_ecma_raw_lexer/src/jsx.rs | 1 + crates/swc_ecma_raw_lexer/src/lib.rs | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 crates/swc_ecma_raw_lexer/src/jsx.rs diff --git a/crates/swc_ecma_raw_lexer/src/jsx.rs b/crates/swc_ecma_raw_lexer/src/jsx.rs new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/crates/swc_ecma_raw_lexer/src/jsx.rs @@ -0,0 +1 @@ + diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index e2711e707be3..e86af0c398ff 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -1,11 +1,13 @@ //! This package is considered internal and should not be used by external //! crates. //! -//! It may updated without proper semver. +//! It may be updated without proper semver bump.
use logos::{Lexer, Logos, Skip}; use swc_common::{input::StringInput, BytePos}; +pub mod jsx; + #[derive(Debug, Clone)] pub struct RawBuffer<'a> { lexer: logos::Lexer<'a, RawToken>, From 17c6b2980ae296aa925228deb37d91e3a72083a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 19:33:03 +0900 Subject: [PATCH 078/201] UnknownChar --- crates/swc_ecma_parser/src/error.rs | 11 +++-------- crates/swc_ecma_raw_lexer/src/jsx.rs | 6 ++++++ crates/swc_ecma_raw_lexer/src/lib.rs | 17 +++++++---------- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/crates/swc_ecma_parser/src/error.rs b/crates/swc_ecma_parser/src/error.rs index c5ca1c9dedd5..dfa2f64ac3ca 100644 --- a/crates/swc_ecma_parser/src/error.rs +++ b/crates/swc_ecma_parser/src/error.rs @@ -804,13 +804,8 @@ fn size_of_error() { assert_eq!(std::mem::size_of::(), 8); } -impl From for Error { - fn from(e: swc_ecma_raw_lexer::LexError) -> Self { - Self::new( - DUMMY_SP, - match e { - swc_ecma_raw_lexer::LexError::UnexepectedCharacter => SyntaxError::UnexpectedToken, - }, - ) +impl From for Error { + fn from(_: swc_ecma_raw_lexer::UnknownChar) -> Self { + Self::new(DUMMY_SP, SyntaxError::UnexpectedToken) } } diff --git a/crates/swc_ecma_raw_lexer/src/jsx.rs b/crates/swc_ecma_raw_lexer/src/jsx.rs index 8b137891791f..bb3b570676e7 100644 --- a/crates/swc_ecma_raw_lexer/src/jsx.rs +++ b/crates/swc_ecma_raw_lexer/src/jsx.rs @@ -1 +1,7 @@ +use logos::Logos; +use crate::UnknownChar; + +#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] +#[logos(error = UnknownChar)] +pub enum JsxToken {} diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index e86af0c398ff..9909eb4622ef 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -45,15 +45,15 @@ impl<'a> RawBuffer<'a> { self.pos } - pub fn cur(&self) -> Result, LexError> { + pub fn cur(&self) -> Result, UnknownChar> { 
self.lexer.clone().next().transpose() } - pub fn peek(&self) -> Result, LexError> { + pub fn peek(&self) -> Result, UnknownChar> { self.lexer.clone().nth(1).transpose() } - pub fn peek_ahead(&self) -> Result, LexError> { + pub fn peek_ahead(&self) -> Result, UnknownChar> { self.lexer.clone().nth(2).transpose() } @@ -79,7 +79,7 @@ impl<'a> RawBuffer<'a> { self.pos = self.pos + BytePos(n as u32); } - pub fn eat(&mut self, token: RawToken) -> Result { + pub fn eat(&mut self, token: RawToken) -> Result { let cur = self.cur()?; if cur == Some(token) { @@ -103,7 +103,7 @@ impl<'a> RawBuffer<'a> { } impl Iterator for RawBuffer<'_> { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { let item = self.lexer.next()?; @@ -121,7 +121,7 @@ impl Iterator for RawBuffer<'_> { } #[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)] -#[logos(error = LexError, extras = TokenState)] +#[logos(error = UnknownChar, extras = TokenState)] pub enum RawToken { #[token("=>")] Arrow, @@ -605,7 +605,4 @@ impl RawToken { } #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] -pub enum LexError { - #[default] - UnexepectedCharacter, -} +pub struct UnknownChar; From 569c32438bf51c97110e0d1293e169cd04adbccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 19:34:50 +0900 Subject: [PATCH 079/201] Remove some --- crates/swc_ecma_parser/src/lexer/mod.rs | 44 ------------------------- 1 file changed, 44 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 0563b74186a2..eab7dfa23d1d 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -682,50 +682,6 @@ impl<'a> Lexer<'a> { } impl Lexer<'_> { - /// This can be used if there's no keyword starting with the first - /// character. 
- fn read_ident_unknown(&mut self) -> LexResult { - debug_assert!(self.input.cur()?.is_some()); - - let (word, _) = self - .read_word_as_str_with(|l, s, _, _| Word::Ident(IdentLike::Other(l.atoms.atom(s))))?; - - Ok(Word(word)) - } - - /// This can be used if there's no keyword starting with the first - /// character. - fn read_word_with( - &mut self, - convert: &dyn Fn(&str) -> Option, - ) -> LexResult> { - debug_assert!(self.input.cur()?.is_some()); - - let start = self.input.cur_pos(); - let (word, has_escape) = self.read_word_as_str_with(|l, s, _, can_be_known| { - if can_be_known { - if let Some(word) = convert(s) { - return word; - } - } - - Word::Ident(IdentLike::Other(l.atoms.atom(s))) - })?; - - // Note: ctx is store in lexer because of this error. - // 'await' and 'yield' may have semantic of reserved word, which means lexer - // should know context or parser should handle this error. Our approach to this - // problem is former one. - if has_escape && self.ctx.is_reserved(&word) { - self.error( - start, - SyntaxError::EscapeInReservedWord { word: word.into() }, - )? - } else { - Ok(Some(Token::Word(word))) - } - } - /// This method is optimized for texts without escape sequences. 
/// /// `convert(text, has_escape, can_be_keyword)` From 1b7d4cf82443d21e96a712b6e62e53d2b74fc294 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 19:37:23 +0900 Subject: [PATCH 080/201] more work --- crates/swc_ecma_parser/src/lexer/mod.rs | 100 ++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 5 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index eab7dfa23d1d..41f0b933cbe9 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -16,7 +16,7 @@ pub use self::{ }; use crate::{ error::{Error, SyntaxError}, - token::{BinOpToken, IdentLike, Token, Word}, + token::{BinOpToken, IdentLike, Keyword, KnownIdent, Token, Word}, Context, Syntax, }; @@ -178,7 +178,7 @@ impl<'a> Lexer<'a> { None => return Ok(None), }; - match cur { + Ok(Some(match cur { RawToken::LegacyCommentOpen => { // XML style comment. ` - if self.state.had_line_break && c == b'-' && self.input.eat(b'>') { - self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); - self.skip_line_comment(0); - self.skip_space::(); - return self.read_token(); - } - - if c == b'+' { - Token::PlusPlus - } else { - Token::MinusMinus - } - } else if self.input.eat_byte(b'=') { - Token::AssignOp(if c == b'+' { - AssignOp::AddAssign - } else { - AssignOp::SubAssign - }) - } else { - Token::BinOp(if c == b'+' { - BinOpToken::Add - } else { - BinOpToken::Sub - }) - })) - } - fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { let start = self.input.cur_pos(); let had_line_break_before_last = self.had_line_break_before_last(); From b157aefe54266c2468c8e9add28ad19247d16cc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 19:38:31 +0900 Subject: [PATCH 084/201] Remove `read_token_bang_or_eq` --- crates/swc_ecma_parser/src/lexer/mod.rs | 43 ------------------------- 1 file changed, 
43 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index eaf1b79346b8..d42ced135a84 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -592,49 +592,6 @@ impl<'a> Lexer<'a> { Ok(Some(vec![c.into()])) } - - fn read_token_bang_or_eq(&mut self, c: u8) -> LexResult> { - let start = self.input.cur_pos(); - let had_line_break_before_last = self.had_line_break_before_last(); - - unsafe { - // Safety: cur() is Some(c) if this method is called. - self.input.bump(1); - } - - Ok(Some(if self.input.eat(RawToken::AssignOp)? { - // "==" - - if self.input.eat(RawToken::AssignOp)? { - if c == b'!' { - Token::BinOp(BinOpToken::NotEqEq) - } else { - // ======= - // ^ - if had_line_break_before_last && self.is_str("====") { - self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185); - self.skip_line_comment(4); - self.skip_space::(); - return self.read_token(); - } - - Token::BinOp(BinOpToken::EqEqEq) - } - } else if c == b'!' { - Token::BinOp(BinOpToken::NotEq) - } else { - Token::BinOp(BinOpToken::EqEq) - } - } else if c == b'=' && self.input.eat_byte(b'>') { - // "=>" - - Token::Arrow - } else if c == b'!' 
{ - Token::Bang - } else { - Token::AssignOp(AssignOp::Assign) - })) - } } impl Lexer<'_> { From 489e768b62aef035ef98b055cefdd7cd332442cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 19:45:26 +0900 Subject: [PATCH 085/201] more lexer work --- crates/swc_ecma_parser/src/lexer/jsx.rs | 4 +- crates/swc_ecma_parser/src/lexer/util.rs | 142 ++--------------------- 2 files changed, 13 insertions(+), 133 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index 69e9b486992f..6d249bbf861c 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -30,7 +30,9 @@ impl Lexer<'_> { let span = Span::new(cur_pos, cur_pos + BytePos(7)); self.emit_error_span(span, SyntaxError::TS1185); - self.skip_line_comment(6); + // Bump conflict marker + self.input.next().transpose()?; + self.skip_space::(); return self.read_token(); } diff --git a/crates/swc_ecma_parser/src/lexer/util.rs b/crates/swc_ecma_parser/src/lexer/util.rs index 1d835491154c..06a3b0c5976a 100644 --- a/crates/swc_ecma_parser/src/lexer/util.rs +++ b/crates/swc_ecma_parser/src/lexer/util.rs @@ -115,146 +115,24 @@ impl Lexer<'_> { /// See https://tc39.github.io/ecma262/#sec-white-space #[inline(never)] pub(super) fn skip_space(&mut self) -> Result<()> { - loop { - let (offset, newline) = { - let mut skip = SkipWhitespace { - input: self.input.as_str(), - newline: false, - offset: 0, - }; - - skip.scan(); - - (skip.offset, skip.newline) - }; - - self.input.bump(offset as usize); - if newline { - self.state.had_line_break = true; + match self.input.cur()? { + Some(RawToken::Whitespace | RawToken::NewLine) => { + self.input.next().transpose()?; } - if LEX_COMMENTS && self.input.is_byte(b'/') { - if self.input.peek()? == Some('/') { - self.skip_line_comment(2); - continue; - } else if self.input.peek()? 
== Some('*') { - self.skip_block_comment(); - continue; - } + Some( + RawToken::LineComment + | RawToken::BlockComment + | RawToken::LegacyCommentOpen + | RawToken::LegacyCommentClose, + ) if LEX_COMMENTS => { + self.input.next().transpose()?; } - - break; } Ok(()) } - #[inline(never)] - pub(super) fn skip_line_comment(&mut self, start_skip: usize) { - let start = self.input.cur_pos(); - self.input.bump_bytes(start_skip); - let slice_start = self.input.cur_pos(); - - // foo // comment for foo - // bar - // - // foo - // // comment for bar - // bar - // - let is_for_next = self.state.had_line_break || !self.state.can_have_trailing_line_comment(); - - let idx = self - .input - .as_str() - .find(['\r', '\n', '\u{2028}', '\u{2029}']) - .map_or(self.input.as_str().len(), |v| { - self.state.had_line_break = true; - v - }); - - self.input.bump_bytes(idx); - let end = self.input.cur_pos(); - - if let Some(comments) = self.comments_buffer.as_mut() { - let s = unsafe { - // Safety: We know that the start and the end are valid - self.input.slice(slice_start, end) - }; - let cmt = Comment { - kind: CommentKind::Line, - span: Span::new(start, end), - text: self.atoms.atom(s), - }; - - if is_for_next { - comments.push_pending_leading(cmt); - } else { - comments.push(BufferedComment { - kind: BufferedCommentKind::Trailing, - pos: self.state.prev_hi, - comment: cmt, - }); - } - } - - unsafe { - // Safety: We got end from self.input - self.input.reset_to(end); - } - } - - /// Expects current char to be '/' and next char to be '*'. 
- #[inline(never)] - pub(super) fn skip_block_comment(&mut self) { - let start = self.input.cur_pos(); - - debug_assert_eq!(self.input.cur(), Some('/')); - debug_assert_eq!(self.input.peek(), Some('*')); - - self.input.bump_bytes(2); - - // jsdoc - let slice_start = self.input.cur_pos(); - let mut was_star = if self.input.is_byte(b'*') { - self.bump(); - true - } else { - false - }; - - let mut is_for_next = self.state.had_line_break || !self.state.can_have_trailing_comment(); - - while let Some(c) = self.input.cur()? { - if was_star && c == '/' { - debug_assert_eq!(self.input.cur(), Some('/')); - self.bump(); // '/' - - let end = self.input.cur_pos(); - - self.skip_space::(); - - if self.input.eat(RawToken::Semi)? { - is_for_next = false; - } - - self.store_comment(is_for_next, start, end, slice_start); - - return; - } - if c.is_line_terminator() { - self.state.had_line_break = true; - } - - was_star = c == '*'; - self.bump(); - } - - let end = self.input.end_pos(); - let span = Span::new(end, end); - self.emit_error_span(span, SyntaxError::UnterminatedBlockComment) - } - #[inline(never)] fn store_comment( &mut self, From bc2eff71d06f68cd5adc382a101a7c74b91b5024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 Dec 2024 19:45:43 +0900 Subject: [PATCH 086/201] Remove mod whitespac --- crates/swc_ecma_parser/src/lexer/mod.rs | 1 - crates/swc_ecma_parser/src/lexer/util.rs | 2 +- .../swc_ecma_parser/src/lexer/whitespace.rs | 100 ------------------ 3 files changed, 1 insertion(+), 102 deletions(-) delete mode 100644 crates/swc_ecma_parser/src/lexer/whitespace.rs diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index d42ced135a84..57fd1da4e12e 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -29,7 +29,6 @@ mod state; #[cfg(test)] mod tests; pub mod util; -mod whitespace; pub(crate) type LexResult = Result; diff --git 
a/crates/swc_ecma_parser/src/lexer/util.rs b/crates/swc_ecma_parser/src/lexer/util.rs index 06a3b0c5976a..6114e86688e7 100644 --- a/crates/swc_ecma_parser/src/lexer/util.rs +++ b/crates/swc_ecma_parser/src/lexer/util.rs @@ -12,7 +12,7 @@ use swc_ecma_ast::Ident; use swc_ecma_raw_lexer::RawToken; use tracing::warn; -use super::{comments_buffer::BufferedComment, whitespace::SkipWhitespace, Char, LexResult, Lexer}; +use super::{comments_buffer::BufferedComment, Char, LexResult, Lexer}; use crate::{ error::{Error, SyntaxError}, lexer::comments_buffer::BufferedCommentKind, diff --git a/crates/swc_ecma_parser/src/lexer/whitespace.rs b/crates/swc_ecma_parser/src/lexer/whitespace.rs deleted file mode 100644 index cba91fa0158c..000000000000 --- a/crates/swc_ecma_parser/src/lexer/whitespace.rs +++ /dev/null @@ -1,100 +0,0 @@ -/// Returns true if it's done -pub(super) type ByteHandler = Option fn(&mut SkipWhitespace<'aa>) -> u32>; - -/// Lookup table for whitespace -static BYTE_HANDLERS: [ByteHandler; 256] = [ - // 0 1 2 3 4 5 6 7 8 9 A B C D E F // - ___, ___, ___, ___, ___, ___, ___, ___, ___, SPC, NLN, SPC, SPC, NLN, ___, ___, // 0 - ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1 - SPC, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 2 - ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 3 - ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 4 - ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 5 - ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 6 - ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 7 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9 - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, 
UNI, UNI, // A - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E - UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F -]; - -/// Stop -const ___: ByteHandler = None; - -/// Newline -const NLN: ByteHandler = Some(|skip| { - skip.newline = true; - - 1 -}); - -/// Space -const SPC: ByteHandler = Some(|_| 1); - -/// Unicode -const UNI: ByteHandler = Some(|skip| { - let s = unsafe { - // Safety: `skip.offset` is always valid - skip.input.get_unchecked(skip.offset as usize..) - }; - - let c = unsafe { - // Safety: Byte handlers are called only when `skip.input` is not empty - s.chars().next().unwrap_unchecked() - }; - - match c { - // white spaces - '\u{feff}' => {} - // line breaks - '\u{2028}' | '\u{2029}' => { - skip.newline = true; - } - - _ if c.is_whitespace() => {} - - _ => return 0, - } - - c.len_utf8() as u32 -}); - -/// API is taked from oxc by Boshen (https://github.com/Boshen/oxc/pull/26) -pub(super) struct SkipWhitespace<'a> { - pub input: &'a str, - - /// Total offset - pub offset: u32, - - /// Found newline - pub newline: bool, -} - -impl SkipWhitespace<'_> { - #[inline(always)] - pub fn scan(&mut self) { - let mut byte; - loop { - byte = match self.input.as_bytes().get(self.offset as usize).copied() { - Some(v) => v, - None => return, - }; - - let handler = unsafe { *(&BYTE_HANDLERS as *const ByteHandler).offset(byte as isize) }; - - if let Some(handler) = handler { - let delta = handler(self); - if delta == 0 { - return; - } - self.offset += delta; - } else { - return; - } - } - } -} From 372b787fb9bd4d620bab77f30148ed30da37ffdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Fri, 20 
Dec 2024 19:49:06 +0900 Subject: [PATCH 087/201] Remove `read_token_dot` --- crates/swc_ecma_parser/src/lexer/mod.rs | 44 +----------------------- crates/swc_ecma_parser/src/lexer/util.rs | 2 +- 2 files changed, 2 insertions(+), 44 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 57fd1da4e12e..a66483ffc9c7 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -180,7 +180,7 @@ impl<'a> Lexer<'a> { Ok(Some(match cur { RawToken::LegacyCommentOpen => { // XML style comment. `")] LegacyCommentClose, - #[token("<<<<<")] + #[token("<<<<<<<")] LConflictMarker, - #[token(">>>>>")] + #[token(">>>>>>>")] RConflictMarker, #[token("await")] From edb21729dab54f471a821d9715ecc929c075a01e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Wed, 1 Jan 2025 05:44:45 +0900 Subject: [PATCH 127/201] ConflictMarker --- crates/swc_ecma_parser/src/lexer/mod.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index d855885b25ca..ec57a9c72238 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -188,9 +188,7 @@ impl<'a> Lexer<'a> { return self.read_token(start); } - RawToken::LConflictMarker | RawToken::RConflictMarker - if self.had_line_break_before_last() => - { + RawToken::LConflictMarker | RawToken::RConflictMarker => { // All conflict markers consist of the same character repeated seven times. // If it is a <<<<<<< or >>>>>>> marker then it is also followed by a space. 
// <<<<<<< @@ -199,8 +197,8 @@ impl<'a> Lexer<'a> { // ^ self.emit_error_span(fixed_len_span(*start, 7), SyntaxError::TS1185); - // self.skip_line_comment(5); - // self.skip_space::(); + let _ = self.input.next(); + *start = self.input.cur_pos(); return self.read_token(start); } @@ -326,10 +324,7 @@ impl<'a> Lexer<'a> { *start = self.input.cur_pos(); return self.read_token(start); } - RawToken::LineComment - | RawToken::BlockComment - | RawToken::LConflictMarker - | RawToken::RConflictMarker => { + RawToken::LineComment | RawToken::BlockComment => { self.input.next().transpose()?; // self.skip_space::()?; From 8cd59e589aa7a4008f735a91af4f76b27876cf8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 13:37:38 +0900 Subject: [PATCH 128/201] Token::Str: Exclude quotes --- crates/swc_ecma_parser/src/lexer/mod.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index ec57a9c72238..8e2bcfcfe5e4 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -223,10 +223,15 @@ impl<'a> Lexer<'a> { RawToken::PlusPlus => Token::PlusPlus, RawToken::MinusMinus => Token::MinusMinus, RawToken::Tilde => Token::Tilde, - RawToken::Str => Token::Str { - value: self.atoms.atom(self.input.cur_slice()), - raw: self.atoms.atom(self.input.cur_slice()), - }, + RawToken::Str => { + let s = self.input.cur_slice(); + let value = &s[1..s.len() - 1]; + + Token::Str { + value: self.atoms.atom(value), + raw: self.atoms.atom(s), + } + } RawToken::Num => { let s = self.input.cur_slice(); let value = if let Some(s) = s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")) { From f7111d5a23d25aa40270050e2b80a0ad27ed7de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 13:41:34 +0900 Subject: [PATCH 129/201] git conflict marker --- 
crates/swc_ecma_parser/src/lexer/jsx.rs | 5 +---- crates/swc_ecma_parser/src/lexer/mod.rs | 2 +- crates/swc_ecma_raw_lexer/src/lib.rs | 6 +++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index b9dba1b2978f..769590fe2ad6 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -22,10 +22,7 @@ impl Lexer<'_> { let cur_pos = self.input.cur_pos(); match cur { - RawToken::LtOp - if self.had_line_break_before_last() - && self.input.peek()? == Some(RawToken::LConflictMarker) => - { + RawToken::ConflictMarker => { let span = Span::new(cur_pos, cur_pos + BytePos(7)); self.emit_error_span(span, SyntaxError::TS1185); diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 8e2bcfcfe5e4..a01446196ade 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -188,7 +188,7 @@ impl<'a> Lexer<'a> { return self.read_token(start); } - RawToken::LConflictMarker | RawToken::RConflictMarker => { + RawToken::ConflictMarker => { // All conflict markers consist of the same character repeated seven times. // If it is a <<<<<<< or >>>>>>> marker then it is also followed by a space. 
// <<<<<<< diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 1a463b6ecd6f..90c2103bb40f 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -420,10 +420,10 @@ pub enum RawToken { LegacyCommentClose, #[token("<<<<<<<")] - LConflictMarker, - #[token(">>>>>>>")] - RConflictMarker, + #[token("=======")] + #[token("|||||||")] + ConflictMarker, #[token("await")] Await, From 0ca456f844cfe69d10a6638f3d0c61b8571c97a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:21:31 +0900 Subject: [PATCH 130/201] jsx ident --- crates/swc_ecma_parser/src/lexer/state.rs | 61 +++++++++++------------ 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 0422d3a41572..8cb51f0e58a7 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -266,8 +266,8 @@ impl Lexer<'_> { // *start = self.input.cur_pos(); // }; - match self.input.cur()? { - Some(..) => {} + let c = match self.input.cur()? { + Some(v) => v, // End of input. 
None => { self.consume_pending_comments(); @@ -290,43 +290,40 @@ impl Lexer<'_> { return self.read_jsx_token(start); } - let c = self.input.cur()?; - if let Some(c) = c { - if self.state.context.current() == Some(TokenContext::JSXOpeningTag) - || self.state.context.current() == Some(TokenContext::JSXClosingTag) - { - if c == RawToken::Ident { - return Ok(Some(Token::JSXName { - name: self.atoms.atom(self.input.cur_slice()), - })); - } - - if c == RawToken::GtOp { - unsafe { - // Safety: cur() is Some('>') - self.input.bump(1); - } - return Ok(Some(Token::JSXTagEnd)); - } - - if (c == RawToken::Str) - && self.state.context.current() == Some(TokenContext::JSXOpeningTag) - { - return self.read_jsx_str().map(Some); - } + if self.state.context.current() == Some(TokenContext::JSXOpeningTag) + || self.state.context.current() == Some(TokenContext::JSXClosingTag) + { + if c == RawToken::Ident { + let name = self.atoms.atom(self.input.cur_slice()); + let _ = self.input.next(); + return Ok(Some(Token::JSXName { name })); } - if c == RawToken::LtOp - && self.state.is_expr_allowed - && self.input.peek()? != Some(RawToken::Bang) - { + if c == RawToken::GtOp { unsafe { - // Safety: cur() is Some('<') + // Safety: cur() is Some('>') self.input.bump(1); } + return Ok(Some(Token::JSXTagEnd)); + } + + if (c == RawToken::Str) + && self.state.context.current() == Some(TokenContext::JSXOpeningTag) + { + return self.read_jsx_str().map(Some); + } + } - return Ok(Some(Token::JSXTagStart)); + if c == RawToken::LtOp + && self.state.is_expr_allowed + && self.input.peek()? 
!= Some(RawToken::Bang) + { + unsafe { + // Safety: cur() is Some('<') + self.input.bump(1); } + + return Ok(Some(Token::JSXTagStart)); } } From efce79d512ed6f7ee4699c20d729d4d2834d878e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:26:39 +0900 Subject: [PATCH 131/201] self.input.next().transpose()?; --- crates/swc_ecma_parser/src/lexer/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index a01446196ade..c325d9bbc2b6 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -323,14 +323,14 @@ impl<'a> Lexer<'a> { self.atoms.atom(self.input.cur_slice()) }))), RawToken::NewLine | RawToken::Whitespace => { - self.input.next().transpose()?; + let _ = self.input.next(); // self.skip_space::(); *start = self.input.cur_pos(); return self.read_token(start); } RawToken::LineComment | RawToken::BlockComment => { - self.input.next().transpose()?; + let _ = self.input.next(); // self.skip_space::()?; *start = self.input.cur_pos(); From 0d7b87ac0fcf03509436b9138a8afe4eb8478b4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:26:53 +0900 Subject: [PATCH 132/201] self.input.next().transpose()?; --- crates/swc_ecma_parser/src/lexer/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index c325d9bbc2b6..0e2830adc542 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -430,7 +430,7 @@ impl<'a> Lexer<'a> { RawToken::Public => Token::Word(Word::Ident(IdentLike::Known(KnownIdent::Public))), }; - self.input.next().transpose()?; + let _ = self.input.next(); Ok(Some(token)) } From 1621e620b976bd5d4b5e510065b136006f3e3dc3 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:27:21 +0900 Subject: [PATCH 133/201] Use next instead of bump --- crates/swc_ecma_parser/src/lexer/state.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 8cb51f0e58a7..15e44cd7b529 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -300,10 +300,7 @@ impl Lexer<'_> { } if c == RawToken::GtOp { - unsafe { - // Safety: cur() is Some('>') - self.input.bump(1); - } + let _ = self.input.next(); return Ok(Some(Token::JSXTagEnd)); } From e76f8ebd0dc62008628263f6a209aaa36091a4f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:29:20 +0900 Subject: [PATCH 134/201] next() instead of bump() --- crates/swc_ecma_parser/src/lexer/jsx.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/jsx.rs b/crates/swc_ecma_parser/src/lexer/jsx.rs index 769590fe2ad6..e08276eea314 100644 --- a/crates/swc_ecma_parser/src/lexer/jsx.rs +++ b/crates/swc_ecma_parser/src/lexer/jsx.rs @@ -254,10 +254,7 @@ impl Lexer<'_> { // it might be at the end of the file when // the string literal is unterminated if self.input.peek_ahead()?.is_some() { - unsafe { - // Safety: We called peek_ahead() which means cur() was Some - self.input.bump(1); - } + let _ = self.input.next(); } let end = self.input.cur_pos(); From c76f81a1feeeb1e33ac51af7ead5ae128ce35763 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:42:18 +0900 Subject: [PATCH 135/201] proceed on error --- crates/swc_ecma_parser/src/lexer/state.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 
15e44cd7b529..75c3b8915b4a 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -10,7 +10,7 @@ use super::{ Context, Lexer, }; use crate::{ - error::Error, + error::{Error, SyntaxError}, input::Tokens, token::{BinOpToken, Keyword, Token, TokenAndSpan, TokenKind, WordKind}, EsVersion, Syntax, @@ -266,10 +266,14 @@ impl Lexer<'_> { // *start = self.input.cur_pos(); // }; - let c = match self.input.cur()? { - Some(v) => v, + let c = match self.input.cur() { + Err(..) => { + let _ = self.input.next(); + return Err(Error::new(self.span(*start), SyntaxError::UnexpectedToken)); + } + Ok(Some(v)) => v, // End of input. - None => { + Ok(None) => { self.consume_pending_comments(); return Ok(None); @@ -284,6 +288,8 @@ impl Lexer<'_> { self.state.start = *start; + dbg!(&c); + dbg!(&self.state.context.current()); if self.syntax.jsx() && !self.ctx.in_property_name && !self.ctx.in_type { //jsx if self.state.context.current() == Some(TokenContext::JSXExpr) { From 47c866268ecb085a2f7d2ed8f6374126199af416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:43:36 +0900 Subject: [PATCH 136/201] fix eat() --- crates/swc_ecma_parser/src/lexer/mod.rs | 4 ++-- crates/swc_ecma_raw_lexer/src/lib.rs | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 0e2830adc542..87a5cc44c183 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -467,7 +467,7 @@ impl<'a> Lexer<'a> { '\r' => { self.bump(); // remove '\r' - self.input.eat(RawToken::NewLine)?; + self.input.eat(RawToken::NewLine); return Ok(None); } @@ -739,7 +739,7 @@ impl Lexer<'_> { #[cold] fn read_shebang(&mut self) -> LexResult> { - if !self.input.eat(RawToken::Shebang)? 
{ + if !self.input.eat(RawToken::Shebang) { return Ok(None); } diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 90c2103bb40f..1bc1db806da4 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -120,14 +120,16 @@ impl<'a> RawBuffer<'a> { self.pos = self.pos + BytePos(n as u32); } - pub fn eat(&mut self, token: RawToken) -> Result { - let cur = self.cur()?; + pub fn eat(&mut self, token: RawToken) -> bool { + let Ok(Some(cur)) = self.cur() else { + return false; + }; - if cur == Some(token) { + if cur == token { self.next(); - Ok(true) + true } else { - Ok(false) + false } } From 95be7bd7f436a1efa2fddfb7106c0a29254ce796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:46:05 +0900 Subject: [PATCH 137/201] Rename: UnexpectedCharFromLexer --- crates/swc_ecma_parser/src/error.rs | 6 +++--- crates/swc_ecma_parser/src/lexer/mod.rs | 2 +- crates/swc_ecma_parser/src/lexer/state.rs | 5 ++++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/crates/swc_ecma_parser/src/error.rs b/crates/swc_ecma_parser/src/error.rs index dfa2f64ac3ca..38f8aec0ec2c 100644 --- a/crates/swc_ecma_parser/src/error.rs +++ b/crates/swc_ecma_parser/src/error.rs @@ -293,7 +293,7 @@ pub enum SyntaxError { ReservedTypeAssertion, ReservedArrowTypeParam, - UnexpectedToken, + UnexpectedCharFromLexer, } impl SyntaxError { @@ -760,7 +760,7 @@ impl SyntaxError { as in `() => ...`." 
.into(), SyntaxError::InvalidAssignTarget => "Invalid assignment target".into(), - SyntaxError::UnexpectedToken => "Unexpected token".into(), + SyntaxError::UnexpectedCharFromLexer => "Unexpected character".into(), } } } @@ -806,6 +806,6 @@ fn size_of_error() { impl From for Error { fn from(_: swc_ecma_raw_lexer::UnknownChar) -> Self { - Self::new(DUMMY_SP, SyntaxError::UnexpectedToken) + Self::new(DUMMY_SP, SyntaxError::UnexpectedCharFromLexer) } } diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 87a5cc44c183..b7fc65071072 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -271,7 +271,7 @@ impl<'a> Lexer<'a> { }, RawToken::Shebang => { - self.emit_error(*start, SyntaxError::UnexpectedToken); + self.emit_error(*start, SyntaxError::UnexpectedCharFromLexer); self.input.next().transpose()?; return self.read_token(start); diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 75c3b8915b4a..5c3882ef7936 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -269,7 +269,10 @@ impl Lexer<'_> { let c = match self.input.cur() { Err(..) => { let _ = self.input.next(); - return Err(Error::new(self.span(*start), SyntaxError::UnexpectedToken)); + return Err(Error::new( + self.span(*start), + SyntaxError::UnexpectedCharFromLexer, + )); } Ok(Some(v)) => v, // End of input. 
From f178a5c1fc9bba25c67d7a08c0c7229f26e37c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:49:24 +0900 Subject: [PATCH 138/201] str --- crates/swc_ecma_raw_lexer/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index 1bc1db806da4..e3bd91874c35 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -254,8 +254,8 @@ pub enum RawToken { #[token("~")] Tilde, - #[regex(r#""([^"\\]|\\["\\bnfrt]|u[a-fA-F0-9]{4}|[xX][a-fA-F0-9]+|[oO][0-7]+|[bB][01]+)*""#)] - #[regex(r#"'([^'\\]|\\['\\bnfrt]|u[a-fA-F0-9]{4}|[xX][a-fA-F0-9]+|[oO][0-7]+|[bB][01]+)*'"#)] + #[regex(r#""([^"\\]|\\["\\bnfrt\n]|u[a-fA-F0-9]{4}|[xX][a-fA-F0-9]+|[oO][0-7]+|[bB][01]+)*""#)] + #[regex(r#"'([^'\\]|\\['\\bnfrt\n]|u[a-fA-F0-9]{4}|[xX][a-fA-F0-9]+|[oO][0-7]+|[bB][01]+)*'"#)] Str, #[regex(r"(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?")] From 7d5c47d6e5f9264a84723f19f6e62bf959446adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:52:28 +0900 Subject: [PATCH 139/201] raw token --- crates/swc_ecma_raw_lexer/src/lib.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/swc_ecma_raw_lexer/src/lib.rs b/crates/swc_ecma_raw_lexer/src/lib.rs index e3bd91874c35..0ee00413bd5b 100644 --- a/crates/swc_ecma_raw_lexer/src/lib.rs +++ b/crates/swc_ecma_raw_lexer/src/lib.rs @@ -254,8 +254,12 @@ pub enum RawToken { #[token("~")] Tilde, - #[regex(r#""([^"\\]|\\["\\bnfrt\n]|u[a-fA-F0-9]{4}|[xX][a-fA-F0-9]+|[oO][0-7]+|[bB][01]+)*""#)] - #[regex(r#"'([^'\\]|\\['\\bnfrt\n]|u[a-fA-F0-9]{4}|[xX][a-fA-F0-9]+|[oO][0-7]+|[bB][01]+)*'"#)] + #[regex( + r#""([^"\\]|\\["\\bnfrt\n]|u[a-fA-F0-9]{4}|[xX][a-fA-F0-9]+|[oO0][0-7]*|[bB][01]+)*""# + )] + #[regex( + 
r#"'([^'\\]|\\['\\bnfrt\n]|u[a-fA-F0-9]{4}|[xX][a-fA-F0-9]+|[oO0][0-7]*||[bB][01]+)*'"# + )] Str, #[regex(r"(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?")] From 943dd784f42f52ea2b7e9241e66f31dd6f53fe1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 2 Jan 2025 14:53:29 +0900 Subject: [PATCH 140/201] move log --- crates/swc_ecma_parser/src/lexer/mod.rs | 2 -- crates/swc_ecma_parser/src/lexer/state.rs | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index b7fc65071072..e58e3e91126b 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -175,8 +175,6 @@ impl<'a> Lexer<'a> { None => return Ok(None), }; - dbg!(&cur, *start, self.input.cur_slice()); - let token = match cur { RawToken::LegacyCommentOpen | RawToken::LegacyCommentClose => { // XML style comment. `