diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs index dffc5b527..9f86b6f57 100644 --- a/src/dialect/generic.rs +++ b/src/dialect/generic.rs @@ -156,6 +156,10 @@ impl Dialect for GenericDialect { true } + fn supports_c_style_hints(&self) -> bool { + true + } + fn supports_user_host_grantee(&self) -> bool { true } diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index ef4e1cdde..2481f42d9 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -898,6 +898,12 @@ pub trait Dialect: Debug + Any { false } + /// Returns true if the dialect supports hints in C-style comments, + /// e.g. `/*! hint */` + fn supports_c_style_hints(&self) -> bool { + false + } + /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem` /// as an alias assignment operator, rather than a boolean expression. /// For example: the following statements are equivalent for such a dialect: diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs index 8d2a5ad4b..be8a9283a 100644 --- a/src/dialect/mysql.rs +++ b/src/dialect/mysql.rs @@ -84,6 +84,11 @@ impl Dialect for MySqlDialect { true } + /// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html> + fn supports_c_style_hints(&self) -> bool { + true + } + fn parse_infix( &self, parser: &mut crate::parser::Parser, diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 8382a5344..92282d7dd 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -810,6 +810,8 @@ pub struct Tokenizer<'a> { /// If true (the default), the tokenizer will un-escape literal /// SQL strings See [`Tokenizer::with_unescape`] for more details. unescape: bool, + /// Tokens injected back into the stream (e.g.
from MySQL C-style hints) + pending_tokens: Vec<Token>, } impl<'a> Tokenizer<'a> { @@ -834,6 +836,7 @@ impl<'a> Tokenizer<'a> { dialect, query, unescape: true, + pending_tokens: Vec::new(), } } @@ -936,10 +939,16 @@ impl<'a> Tokenizer<'a> { /// Get the next token or return None fn next_token( - &self, + &mut self, chars: &mut State, prev_token: Option<&Token>, ) -> Result<Option<Token>, TokenizerError> { + // Return any previously injected tokens first + { + if let Some(tok) = self.pending_tokens.pop() { + return Ok(Some(tok)); + } + } match chars.peek() { Some(&ch) => match ch { ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), @@ -2102,13 +2111,14 @@ impl<'a> Tokenizer<'a> { } fn tokenize_multiline_comment( - &self, + &mut self, chars: &mut State, ) -> Result<Option<Token>, TokenizerError> { let mut s = String::new(); let mut nested = 1; + let mut c_style_comments = false; let supports_nested_comments = self.dialect.supports_nested_comments(); - + let supports_c_style_comments = self.dialect.supports_c_style_hints(); loop { match chars.next() { Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => { chars.next(); // consume the '*' s.push('/'); s.push('*'); nested += 1; } + Some('!') if supports_c_style_comments => { + c_style_comments = true; + // consume only version digits (leave following whitespace/content intact) + while let Some(&c) = chars.peek() { + if c.is_ascii_digit() { + chars.next(); + } else { + break; + } + } + } Some('*') if matches!(chars.peek(), Some('/')) => { chars.next(); // consume the '/' nested -= 1; if nested == 0 { + if c_style_comments { + break self.inject_tokens_from_c_style_hints_and_return_first(s); + } break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s)))); } s.push('*'); @@ -2139,6 +2163,26 @@ impl<'a> Tokenizer<'a> { } } + /// Tokenize the given string using the same dialect/unescape settings and inject + /// the resulting tokens back into this tokenizer so they are returned before + /// any
further characters from the main stream. Returns the first injected token. + fn inject_tokens_from_c_style_hints_and_return_first( + &mut self, + inner_sql: String, + ) -> Result<Option<Token>, TokenizerError> { + let trimmed = inner_sql.trim(); + if trimmed.is_empty() { + return Ok(None); + } + let mut inner = Tokenizer::new(self.dialect, trimmed).with_unescape(self.unescape); + let tokens = inner.tokenize()?; + // push in reverse so we can pop from the end efficiently + for t in tokens.into_iter().rev() { + self.pending_tokens.push(t); + } + Ok(self.pending_tokens.pop()) + } + fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) { let mut last_char = None; let mut s = String::new(); @@ -4070,4 +4114,44 @@ mod tests { panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); } } + #[test] + fn tokenize_multiline_comment_with_c_style_comment() { + let sql = String::from("0/*! word */1"); + + let dialect = MySqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + let expected = vec![ + Token::Number("0".to_string(), false), + Token::Word(Word { + value: "word".to_string(), + quote_style: None, + keyword: Keyword::NoKeyword, + }), + Token::Number("1".to_string(), false), + ]; + compare(expected, tokens); + } + + #[test] + fn tokenize_multiline_comment_with_c_style_comment_and_version() { + let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1"); + let dialect = MySqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap(); + let expected = vec![ + Token::Number("0".to_string(), false), + Token::Whitespace(Whitespace::Space), + Token::Word(Word { + value: "KEY_BLOCK_SIZE".to_string(), + quote_style: None, + keyword: Keyword::KEY_BLOCK_SIZE, + }), + Token::Whitespace(Whitespace::Space), + Token::Eq, + Token::Whitespace(Whitespace::Space), + Token::Number("1024".to_string(), false), + Token::Whitespace(Whitespace::Space), + Token::Number("1".to_string(),
false), + ]; + compare(expected, tokens); + } }