From e5de2672efd9ce3c7993a15499b33fb2c23ad445 Mon Sep 17 00:00:00 2001 From: Marcelo Altmann Date: Tue, 16 Sep 2025 23:35:08 -0300 Subject: [PATCH 1/3] Add support for C-style comments This commit adds support for C-style comments supported by MySQL. It parses and consumes the optional version number after the `!` character and leading whitespace. --- src/dialect/generic.rs | 4 +++ src/dialect/mod.rs | 5 ++++ src/dialect/mysql.rs | 5 ++++ src/tokenizer.rs | 68 +++++++++++++++++++++++++++++++++++++++++- 4 files changed, 81 insertions(+), 1 deletion(-) diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs index dffc5b527..c09a9890b 100644 --- a/src/dialect/generic.rs +++ b/src/dialect/generic.rs @@ -156,6 +156,10 @@ impl Dialect for GenericDialect { true } + fn supports_c_style_comments(&self) -> bool { + true + } + fn supports_user_host_grantee(&self) -> bool { true } diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index ef4e1cdde..4a95ac59a 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -898,6 +898,11 @@ pub trait Dialect: Debug + Any { false } + /// Returns true if the dialect supports hint and C-style comments + fn supports_c_style_comments(&self) -> bool { + false + } + /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem` /// as an alias assignment operator, rather than a boolean expression. 
/// For example: the following statements are equivalent for such a dialect: diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs index 8d2a5ad4b..b9f5415b8 100644 --- a/src/dialect/mysql.rs +++ b/src/dialect/mysql.rs @@ -84,6 +84,11 @@ impl Dialect for MySqlDialect { true } + /// see + fn supports_c_style_comments(&self) -> bool { + true + } + fn parse_infix( &self, parser: &mut crate::parser::Parser, diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 8382a5344..979542272 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -2107,8 +2107,9 @@ impl<'a> Tokenizer<'a> { ) -> Result<Option<Token>, TokenizerError> { let mut s = String::new(); let mut nested = 1; + let mut c_style_comments = false; let supports_nested_comments = self.dialect.supports_nested_comments(); - + let supports_c_style_comments = self.dialect.supports_c_style_comments(); loop { match chars.next() { Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => { @@ -2117,10 +2118,40 @@ impl<'a> Tokenizer<'a> { s.push('*'); nested += 1; } + Some('!') if supports_c_style_comments => { + c_style_comments = true; + // consume the optional version digits and whitespace + while let Some(&c) = chars.peek() { + if c.is_ascii_digit() || c.is_whitespace() { + chars.next(); + } else { + break; + } + } + } + // consume all leading whitespaces until the '*/' character if in a C-style comment + Some(ch) if ch.is_whitespace() && c_style_comments => { + let mut tmp_s = String::new(); + while let Some(c) = chars.next() { + if c.is_whitespace() { + tmp_s.push(c); + } else if c == '*' && chars.peek() == Some(&'/') { + chars.next(); // consume the '/' + return Ok(Some(Token::make_word(&s, None))); + } else { + tmp_s.push(c); + s.push_str(&tmp_s); + break; + } + } + } Some('*') if matches!(chars.peek(), Some('/')) => { chars.next(); // consume the '/' nested -= 1; if nested == 0 { + if c_style_comments { + break Ok(Some(Token::make_word(&s, None))); + } break 
Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s)))); } s.push('*'); @@ -4070,4 +4101,39 @@ mod tests { panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); } } + #[test] + fn tokenize_multiline_comment_with_c_style_comment() { + let sql = String::from("0/*! word */1"); + + let dialect = MySqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + let expected = vec![ + Token::Number("0".to_string(), false), + Token::Word(Word { + value: "word".to_string(), + quote_style: None, + keyword: Keyword::NoKeyword, + }), + Token::Number("1".to_string(), false), + ]; + compare(expected, tokens); + } + + #[test] + fn tokenize_multiline_comment_with_c_style_comment_and_version() { + let sql = String::from("0/*!8000000 word */1"); + + let dialect = MySqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + let expected = vec![ + Token::Number("0".to_string(), false), + Token::Word(Word { + value: "word".to_string(), + quote_style: None, + keyword: Keyword::NoKeyword, + }), + Token::Number("1".to_string(), false), + ]; + compare(expected, tokens); + } } From 015aee89cee47a358a899498c8536c9e9b15ee75 Mon Sep 17 00:00:00 2001 From: Marcelo Altmann Date: Fri, 19 Sep 2025 08:36:19 -0300 Subject: [PATCH 2/3] Adjusted documentation Documented the C-style comments with an example. --- src/dialect/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 4a95ac59a..3f7545d34 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -899,6 +899,7 @@ pub trait Dialect: Debug + Any { } /// Returns true if the dialect supports hint and C-style comments + /// e.g. `/*! 
hint */` fn supports_c_style_comments(&self) -> bool { false } From f96249cc1d52d34edcfb0ca246bdd78f62552b91 Mon Sep 17 00:00:00 2001 From: Marcelo Altmann Date: Mon, 29 Sep 2025 11:25:24 -0300 Subject: [PATCH 3/3] Added pending tokens for c-style hints Added the pending tokens structure to properly return all tokens inside a c-style hint comment. --- src/dialect/generic.rs | 2 +- src/dialect/mod.rs | 2 +- src/dialect/mysql.rs | 2 +- src/tokenizer.rs | 72 ++++++++++++++++++++++++++---------------- 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs index c09a9890b..9f86b6f57 100644 --- a/src/dialect/generic.rs +++ b/src/dialect/generic.rs @@ -156,7 +156,7 @@ impl Dialect for GenericDialect { true } - fn supports_c_style_comments(&self) -> bool { + fn supports_c_style_hints(&self) -> bool { true } diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 3f7545d34..2481f42d9 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -900,7 +900,7 @@ pub trait Dialect: Debug + Any { /// Returns true if the dialect supports hint and C-style comments /// e.g. `/*! hint */` - fn supports_c_style_comments(&self) -> bool { + fn supports_c_style_hints(&self) -> bool { false } diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs index b9f5415b8..be8a9283a 100644 --- a/src/dialect/mysql.rs +++ b/src/dialect/mysql.rs @@ -85,7 +85,7 @@ impl Dialect for MySqlDialect { } /// see - fn supports_c_style_comments(&self) -> bool { + fn supports_c_style_hints(&self) -> bool { true } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 979542272..92282d7dd 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -810,6 +810,8 @@ pub struct Tokenizer<'a> { /// If true (the default), the tokenizer will un-escape literal /// SQL strings See [`Tokenizer::with_unescape`] for more details. unescape: bool, + /// Tokens injected back into the stream (e.g. 
from MySQL C-style hints) + pending_tokens: Vec<Token>, } impl<'a> Tokenizer<'a> { @@ -834,6 +836,7 @@ impl<'a> Tokenizer<'a> { dialect, query, unescape: true, + pending_tokens: Vec::new(), } } @@ -936,10 +939,16 @@ impl<'a> Tokenizer<'a> { /// Get the next token or return None fn next_token( - &self, + &mut self, chars: &mut State, prev_token: Option<&Token>, ) -> Result<Option<Token>, TokenizerError> { + // Return any previously injected tokens first + { + if let Some(tok) = self.pending_tokens.pop() { + return Ok(Some(tok)); + } + } match chars.peek() { Some(&ch) => match ch { ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), @@ -2102,14 +2111,14 @@ impl<'a> Tokenizer<'a> { } fn tokenize_multiline_comment( - &self, + &mut self, chars: &mut State, ) -> Result<Option<Token>, TokenizerError> { let mut s = String::new(); let mut nested = 1; let mut c_style_comments = false; let supports_nested_comments = self.dialect.supports_nested_comments(); - let supports_c_style_comments = self.dialect.supports_c_style_comments(); + let supports_c_style_comments = self.dialect.supports_c_style_hints(); loop { match chars.next() { Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => { @@ -2120,37 +2129,21 @@ impl<'a> Tokenizer<'a> { } Some('!') if supports_c_style_comments => { c_style_comments = true; - // consume the optional version digits and whitespace + // consume only version digits (leave following whitespace/content intact) while let Some(&c) = chars.peek() { - if c.is_ascii_digit() || c.is_whitespace() { + if c.is_ascii_digit() { chars.next(); } else { break; } } } - // consume all leading whitespaces until the '*/' character if in a C-style comment - Some(ch) if ch.is_whitespace() && c_style_comments => { - let mut tmp_s = String::new(); - while let Some(c) = chars.next() { - if c.is_whitespace() { - tmp_s.push(c); - } else if c == '*' && chars.peek() == Some(&'/') { - chars.next(); // consume the '/' - return Ok(Some(Token::make_word(&s, None))); - } 
else { - tmp_s.push(c); - s.push_str(&tmp_s); - break; - } - } - } Some('*') if matches!(chars.peek(), Some('/')) => { chars.next(); // consume the '/' nested -= 1; if nested == 0 { if c_style_comments { - break Ok(Some(Token::make_word(&s, None))); + break self.inject_tokens_from_c_style_hints_and_return_first(s); } break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s)))); } @@ -2170,6 +2163,26 @@ impl<'a> Tokenizer<'a> { } } + /// Tokenize the given string using the same dialect/unescape settings and inject + /// the resulting tokens back into this tokenizer so they are returned before + /// any further characters from the main stream. Returns the first injected token. + fn inject_tokens_from_c_style_hints_and_return_first( + &mut self, + inner_sql: String, + ) -> Result<Option<Token>, TokenizerError> { + let trimmed = inner_sql.trim(); + if trimmed.is_empty() { + return Ok(None); + } + let mut inner = Tokenizer::new(self.dialect, trimmed).with_unescape(self.unescape); + let tokens = inner.tokenize()?; + // push in reverse so we can pop from the end efficiently + for t in tokens.into_iter().rev() { + self.pending_tokens.push(t); + } + Ok(self.pending_tokens.pop()) + } + fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) { let mut last_char = None; let mut s = String::new(); @@ -4121,17 +4134,22 @@ mod tests { #[test] fn tokenize_multiline_comment_with_c_style_comment_and_version() { - let sql = String::from("0/*!8000000 word */1"); - + let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1"); let dialect = MySqlDialect {}; - let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap(); let expected = vec![ Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::Space), Token::Word(Word { - value: "word".to_string(), + value: "KEY_BLOCK_SIZE".to_string(), quote_style: None, - keyword: Keyword::NoKeyword, + keyword: 
Keyword::KEY_BLOCK_SIZE, }), + Token::Whitespace(Whitespace::Space), + Token::Eq, + Token::Whitespace(Whitespace::Space), + Token::Number("1024".to_string(), false), + Token::Whitespace(Whitespace::Space), Token::Number("1".to_string(), false), ]; compare(expected, tokens);