4 changes: 4 additions & 0 deletions src/dialect/generic.rs
@@ -156,6 +156,10 @@ impl Dialect for GenericDialect {
        true
    }

    fn supports_c_style_hints(&self) -> bool {
        true
    }

    fn supports_user_host_grantee(&self) -> bool {
        true
    }
6 changes: 6 additions & 0 deletions src/dialect/mod.rs
@@ -898,6 +898,12 @@ pub trait Dialect: Debug + Any {
        false
    }

    /// Returns true if the dialect supports hints embedded in C-style comments,
    /// e.g. `/*! hint */` or the versioned form `/*!80023 hint */`.
    fn supports_c_style_hints(&self) -> bool {
        false
    }

    /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
    /// as an alias assignment operator, rather than a boolean expression.
    /// For example: the following statements are equivalent for such a dialect:
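For context, the new trait method defaults to `false`, so existing dialects are unaffected; a dialect opts in by overriding it, as GenericDialect above and MySqlDialect below do. A minimal sketch of a custom dialect doing the same (the `HintingDialect` type is hypothetical and not part of this change; `is_identifier_start`/`is_identifier_part` are the trait's only required methods):

use sqlparser::dialect::Dialect;

/// Hypothetical dialect that opts into `/*! ... */` hint handling.
#[derive(Debug)]
struct HintingDialect;

impl Dialect for HintingDialect {
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_'
    }

    fn is_identifier_part(&self, ch: char) -> bool {
        ch.is_ascii_alphanumeric() || ch == '_'
    }

    // New method introduced by this PR; the default implementation returns false.
    fn supports_c_style_hints(&self) -> bool {
        true
    }
}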
5 changes: 5 additions & 0 deletions src/dialect/mysql.rs
@@ -84,6 +84,11 @@ impl Dialect for MySqlDialect {
        true
    }

    /// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
    fn supports_c_style_hints(&self) -> bool {
        true
    }

    fn parse_infix(
        &self,
        parser: &mut crate::parser::Parser,
90 changes: 87 additions & 3 deletions src/tokenizer.rs
@@ -810,6 +810,8 @@ pub struct Tokenizer<'a> {
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
    /// Tokens injected back into the stream (e.g. from MySQL C-style hints)
    pending_tokens: Vec<Token>,
}

impl<'a> Tokenizer<'a> {
@@ -834,6 +836,7 @@ impl<'a> Tokenizer<'a> {
            dialect,
            query,
            unescape: true,
            pending_tokens: Vec::new(),
        }
    }

@@ -936,10 +939,16 @@ impl<'a> Tokenizer<'a> {

    /// Get the next token or return None
    fn next_token(
-       &self,
+       &mut self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        // Return any previously injected tokens first
        if let Some(tok) = self.pending_tokens.pop() {
            return Ok(Some(tok));
        }
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -2102,13 +2111,14 @@ impl<'a> Tokenizer<'a> {
    }

    fn tokenize_multiline_comment(
-       &self,
+       &mut self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let mut c_style_comments = false;
        let supports_nested_comments = self.dialect.supports_nested_comments();
        let supports_c_style_comments = self.dialect.supports_c_style_hints();
        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@@ -2117,10 +2127,24 @@
                    s.push('*');
                    nested += 1;
                }
                Some('!') if supports_c_style_comments => {
                    c_style_comments = true;
                    // consume only version digits (leave following whitespace/content intact)
                    while let Some(&c) = chars.peek() {
                        if c.is_ascii_digit() {
                            chars.next();
                        } else {
                            break;
                        }
                    }
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
                        if c_style_comments {
                            break self.inject_tokens_from_c_style_hints_and_return_first(s);
                        }
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
@@ -2139,6 +2163,26 @@
        }
    }

    /// Tokenize the given string using the same dialect/unescape settings and inject
    /// the resulting tokens back into this tokenizer so they are returned before
    /// any further characters from the main stream. Returns the first injected token.
    fn inject_tokens_from_c_style_hints_and_return_first(
        &mut self,
        inner_sql: String,
    ) -> Result<Option<Token>, TokenizerError> {
        let trimmed = inner_sql.trim();
        if trimmed.is_empty() {
            return Ok(None);
        }
        let mut inner = Tokenizer::new(self.dialect, trimmed).with_unescape(self.unescape);
        let tokens = inner.tokenize()?;
        // push in reverse so we can pop from the end efficiently
        for t in tokens.into_iter().rev() {
            self.pending_tokens.push(t);
        }
        Ok(self.pending_tokens.pop())
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
@@ -4070,4 +4114,44 @@ mod tests {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }

    #[test]
    fn tokenize_multiline_comment_with_c_style_comment() {
        let sql = String::from("0/*! word */1");

Review comments on this line:

Contributor (iffyio):
Looking at their docs, I'm wondering if/how we support these examples?

SELECT /*! STRAIGHT_JOIN */ col1 FROM table1,table2
/*!50110 KEY_BLOCK_SIZE=1024 */
SELECT /*! BKA(t1) */ FROM T

Contributor Author:
@iffyio - Parsing the C-style comment unblocks sqlparser from discarding these hints as if they were normal comments. Support for each individual hint will have to be added on a case-by-case basis. For example, #2033: MySQL adds a C-style comment if you run SHOW CREATE TABLE:

mysql> SHOW CREATE TABLE b;
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Table | Create Table                                                                                                                                                    |
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
| b     | CREATE TABLE `b` (
  `ID` int DEFAULT NULL,
  `b` char(1) DEFAULT NULL /*!80023 INVISIBLE */
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci |
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
1 row in set (0,008 sec)

Without the current patch, the INVISIBLE keyword would be discarded.
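For illustration, once the patch is applied, that column-definition fragment can be tokenized directly and the hint body comes back as ordinary tokens. A minimal sketch using the public `Tokenizer` API; the assertion is purely illustrative and mirrors the versioned-hint test added below:

use sqlparser::dialect::MySqlDialect;
use sqlparser::tokenizer::Tokenizer;

fn main() {
    // Column definition fragment as emitted by SHOW CREATE TABLE above.
    let sql = "`b` char(1) DEFAULT NULL /*!80023 INVISIBLE */";
    let tokens = Tokenizer::new(&MySqlDialect {}, sql).tokenize().unwrap();
    // The version digits (80023) are consumed and the hint body is re-tokenized,
    // so INVISIBLE now appears as a regular word token instead of disappearing
    // inside a multi-line comment.
    assert!(tokens.iter().any(|t| t.to_string() == "INVISIBLE"));
}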

Contributor (iffyio):
Ah, so to clarify: I was wondering about the parser behavior for hints that aren't single words, e.g. /*!50110 KEY_BLOCK_SIZE=1024 */ - can we demonstrate the behavior with test cases for such scenarios?

Contributor Author:
@iffyio thanks for flagging this. I have fixed the issue, and we now properly return the individual tokens inside a C-style hint comment.


        let dialect = MySqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Word(Word {
                value: "word".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_c_style_comment_and_version() {
        let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
        let dialect = MySqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "KEY_BLOCK_SIZE".to_string(),
                quote_style: None,
                keyword: Keyword::KEY_BLOCK_SIZE,
            }),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number("1024".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }
}
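As an end-to-end sketch of the flag's effect, not part of the diff itself: the `STRAIGHT_JOIN` example from the review thread above tokenizes differently under a dialect with the flag enabled (MySQL) and one without it (PostgreSqlDialect is used here only as a convenient dialect that keeps the default of `false`):

use sqlparser::dialect::{MySqlDialect, PostgreSqlDialect};
use sqlparser::tokenizer::Tokenizer;

fn main() {
    let sql = "SELECT /*! STRAIGHT_JOIN */ col1 FROM table1, table2";

    // MySQL enables supports_c_style_hints, so the hint body is re-tokenized
    // and STRAIGHT_JOIN comes back as an ordinary token.
    let mysql = Tokenizer::new(&MySqlDialect {}, sql).tokenize().unwrap();
    assert!(mysql.iter().any(|t| t.to_string() == "STRAIGHT_JOIN"));

    // A dialect without the flag keeps the whole `/*! ... */` as a multi-line
    // comment (whitespace), so no STRAIGHT_JOIN token appears.
    let pg = Tokenizer::new(&PostgreSqlDialect {}, sql).tokenize().unwrap();
    assert!(pg.iter().all(|t| t.to_string() != "STRAIGHT_JOIN"));
}

Actually interpreting each hint (e.g. optimizer hints or the INVISIBLE column attribute) still has to be added to the parser case by case, as noted in the review thread.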