-
Notifications
You must be signed in to change notification settings - Fork 653
Add support for C-style comments #2034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -810,6 +810,8 @@ pub struct Tokenizer<'a> { | |
/// If true (the default), the tokenizer will un-escape literal | ||
/// SQL strings See [`Tokenizer::with_unescape`] for more details. | ||
unescape: bool, | ||
/// Tokens injected back into the stream (e.g. from MySQL C-style hints) | ||
pending_tokens: Vec<Token>, | ||
} | ||
|
||
impl<'a> Tokenizer<'a> { | ||
|
@@ -834,6 +836,7 @@ impl<'a> Tokenizer<'a> { | |
dialect, | ||
query, | ||
unescape: true, | ||
pending_tokens: Vec::new(), | ||
} | ||
} | ||
|
||
|
@@ -936,10 +939,16 @@ impl<'a> Tokenizer<'a> { | |
|
||
/// Get the next token or return None | ||
fn next_token( | ||
&self, | ||
&mut self, | ||
chars: &mut State, | ||
prev_token: Option<&Token>, | ||
) -> Result<Option<Token>, TokenizerError> { | ||
// Return any previously injected tokens first | ||
{ | ||
if let Some(tok) = self.pending_tokens.pop() { | ||
return Ok(Some(tok)); | ||
} | ||
} | ||
match chars.peek() { | ||
Some(&ch) => match ch { | ||
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), | ||
|
@@ -2102,13 +2111,14 @@ impl<'a> Tokenizer<'a> { | |
} | ||
|
||
fn tokenize_multiline_comment( | ||
&self, | ||
&mut self, | ||
chars: &mut State, | ||
) -> Result<Option<Token>, TokenizerError> { | ||
let mut s = String::new(); | ||
let mut nested = 1; | ||
let mut c_style_comments = false; | ||
let supports_nested_comments = self.dialect.supports_nested_comments(); | ||
|
||
let supports_c_style_comments = self.dialect.supports_c_style_hints(); | ||
loop { | ||
match chars.next() { | ||
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => { | ||
|
@@ -2117,10 +2127,24 @@ impl<'a> Tokenizer<'a> { | |
s.push('*'); | ||
nested += 1; | ||
} | ||
Some('!') if supports_c_style_comments => { | ||
c_style_comments = true; | ||
// consume only version digits (leave following whitespace/content intact) | ||
while let Some(&c) = chars.peek() { | ||
if c.is_ascii_digit() { | ||
chars.next(); | ||
} else { | ||
break; | ||
} | ||
} | ||
} | ||
Some('*') if matches!(chars.peek(), Some('/')) => { | ||
chars.next(); // consume the '/' | ||
nested -= 1; | ||
if nested == 0 { | ||
if c_style_comments { | ||
break self.inject_tokens_from_c_style_hints_and_return_first(s); | ||
} | ||
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s)))); | ||
} | ||
s.push('*'); | ||
|
@@ -2139,6 +2163,26 @@ impl<'a> Tokenizer<'a> { | |
} | ||
} | ||
|
||
/// Tokenize the given string using the same dialect/unescape settings and inject | ||
/// the resulting tokens back into this tokenizer so they are returned before | ||
/// any further characters from the main stream. Returns the first injected token. | ||
fn inject_tokens_from_c_style_hints_and_return_first( | ||
&mut self, | ||
inner_sql: String, | ||
) -> Result<Option<Token>, TokenizerError> { | ||
let trimmed = inner_sql.trim(); | ||
if trimmed.is_empty() { | ||
return Ok(None); | ||
} | ||
let mut inner = Tokenizer::new(self.dialect, trimmed).with_unescape(self.unescape); | ||
let tokens = inner.tokenize()?; | ||
// push in reverse so we can pop from the end efficiently | ||
for t in tokens.into_iter().rev() { | ||
self.pending_tokens.push(t); | ||
} | ||
Ok(self.pending_tokens.pop()) | ||
} | ||
|
||
fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) { | ||
let mut last_char = None; | ||
let mut s = String::new(); | ||
|
@@ -4070,4 +4114,44 @@ mod tests { | |
panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); | ||
} | ||
} | ||
#[test] | ||
fn tokenize_multiline_comment_with_c_style_comment() { | ||
let sql = String::from("0/*! word */1"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at their docs, I'm wondering if/how we support these examples? SELECT /*! STRAIGHT_JOIN */ col1 FROM table1,table2
/*!50110 KEY_BLOCK_SIZE=1024 */
SELECT /*! BKA(t1) */ FROM T There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @iffyio - Parsing the c_style comment unblocks sqlparser to not discard those as if they were a normal comment. Support for each hint will have to be added in a case by case bases. For example #2033 - MySQL adds a c-style comment if you run SHOW CREATE TABLE:
Without the current patch, the invisible keyword will be discarded. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah so to clarify I'm rather wondering regarding the parser behavior for hints that aren't singe words e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @iffyio thanks flagging this. I have fixed the issue and now we properly return individual tokens inside a C-style hint comment. |
||
|
||
let dialect = MySqlDialect {}; | ||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); | ||
let expected = vec![ | ||
Token::Number("0".to_string(), false), | ||
Token::Word(Word { | ||
value: "word".to_string(), | ||
quote_style: None, | ||
keyword: Keyword::NoKeyword, | ||
}), | ||
Token::Number("1".to_string(), false), | ||
]; | ||
compare(expected, tokens); | ||
} | ||
|
||
#[test] | ||
fn tokenize_multiline_comment_with_c_style_comment_and_version() { | ||
let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1"); | ||
let dialect = MySqlDialect {}; | ||
let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap(); | ||
let expected = vec![ | ||
Token::Number("0".to_string(), false), | ||
Token::Whitespace(Whitespace::Space), | ||
Token::Word(Word { | ||
value: "KEY_BLOCK_SIZE".to_string(), | ||
quote_style: None, | ||
keyword: Keyword::KEY_BLOCK_SIZE, | ||
}), | ||
Token::Whitespace(Whitespace::Space), | ||
Token::Eq, | ||
Token::Whitespace(Whitespace::Space), | ||
Token::Number("1024".to_string(), false), | ||
Token::Whitespace(Whitespace::Space), | ||
Token::Number("1".to_string(), false), | ||
]; | ||
compare(expected, tokens); | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.