From e5de2672efd9ce3c7993a15499b33fb2c23ad445 Mon Sep 17 00:00:00 2001
From: Marcelo Altmann <marcelo@readyset.io>
Date: Tue, 16 Sep 2025 23:35:08 -0300
Subject: [PATCH 1/3] Add support for C-style comments

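This commit adds support for MySQL's executable C-style comments
(`/*! ... */` and `/*!<version> ... */`). The tokenizer consumes the
optional version number after the `!` character together with any
leading whitespace, and emits the comment body as a word token
instead of discarding it as a multi-line comment.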
---
 src/dialect/generic.rs |  4 +++
 src/dialect/mod.rs     |  5 ++++
 src/dialect/mysql.rs   |  5 ++++
 src/tokenizer.rs       | 68 +++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 81 insertions(+), 1 deletion(-)
diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
index dffc5b527..c09a9890b 100644
--- a/src/dialect/generic.rs
+++ b/src/dialect/generic.rs
@@ -156,6 +156,10 @@ impl Dialect for GenericDialect {
         true
     }
 
+    fn supports_c_style_comments(&self) -> bool {
+        true
+    }
+
     fn supports_user_host_grantee(&self) -> bool {
         true
     }
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index ef4e1cdde..4a95ac59a 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -898,6 +898,11 @@ pub trait Dialect: Debug + Any {
         false
     }
 
+    /// Returns true if the dialect supports hint and C-style comments
+    fn supports_c_style_comments(&self) -> bool {
+        false
+    }
+
     /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
     /// as an alias assignment operator, rather than a boolean expression.
     /// For example: the following statements are equivalent for such a dialect:
diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs
index 8d2a5ad4b..b9f5415b8 100644
--- a/src/dialect/mysql.rs
+++ b/src/dialect/mysql.rs
@@ -84,6 +84,11 @@ impl Dialect for MySqlDialect {
         true
     }
 
+    /// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
+    fn supports_c_style_comments(&self) -> bool {
+        true
+    }
+
     fn parse_infix(
         &self,
         parser: &mut crate::parser::Parser,
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 8382a5344..979542272 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -2107,8 +2107,9 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
+        let mut c_style_comments = false;
         let supports_nested_comments = self.dialect.supports_nested_comments();
-
+        let supports_c_style_comments = self.dialect.supports_c_style_comments();
         loop {
             match chars.next() {
                 Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@@ -2117,10 +2118,40 @@ impl<'a> Tokenizer<'a> {
                     s.push('*');
                     nested += 1;
                 }
+                Some('!') if supports_c_style_comments => {
+                    c_style_comments = true;
+                    // consume the optional version digits and whitespace
+                    while let Some(&c) = chars.peek() {
+                        if c.is_ascii_digit() || c.is_whitespace() {
+                            chars.next();
+                        } else {
+                            break;
+                        }
+                    }
+                }
+                // consume all leading whitespaces until the '*/' character if in a C-style comment
+                Some(ch) if ch.is_whitespace() && c_style_comments => {
+                    let mut tmp_s = String::new();
+                    while let Some(c) = chars.next() {
+                        if c.is_whitespace() {
+                            tmp_s.push(c);
+                        } else if c == '*' && chars.peek() == Some(&'/') {
+                            chars.next(); // consume the '/'
+                            return Ok(Some(Token::make_word(&s, None)));
+                        } else {
+                            tmp_s.push(c);
+                            s.push_str(&tmp_s);
+                            break;
+                        }
+                    }
+                }
                 Some('*') if matches!(chars.peek(), Some('/')) => {
                     chars.next(); // consume the '/'
                     nested -= 1;
                     if nested == 0 {
+                        if c_style_comments {
+                            break Ok(Some(Token::make_word(&s, None)));
+                        }
                         break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                     }
                     s.push('*');
@@ -4070,4 +4101,39 @@ mod tests {
             panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
         }
     }
+    #[test]
+    fn tokenize_multiline_comment_with_c_style_comment() {
+        let sql = String::from("0/*! word */1");
+
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Word(Word {
+                value: "word".to_string(),
+                quote_style: None,
+                keyword: Keyword::NoKeyword,
+            }),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
+
+    #[test]
+    fn tokenize_multiline_comment_with_c_style_comment_and_version() {
+        let sql = String::from("0/*!8000000 word */1");
+
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Word(Word {
+                value: "word".to_string(),
+                quote_style: None,
+                keyword: Keyword::NoKeyword,
+            }),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
 }

From 015aee89cee47a358a899498c8536c9e9b15ee75 Mon Sep 17 00:00:00 2001
From: Marcelo Altmann <marcelo@readyset.io>
Date: Fri, 19 Sep 2025 08:36:19 -0300
Subject: [PATCH 2/3] Adjusted documentation

Document the `supports_c_style_comments` dialect method with an
example (`/*! hint */`).
---
 src/dialect/mod.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 4a95ac59a..3f7545d34 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -899,6 +899,7 @@ pub trait Dialect: Debug + Any {
     }
 
     /// Returns true if the dialect supports hint and C-style comments
+    /// e.g. `/*! hint */`
     fn supports_c_style_comments(&self) -> bool {
         false
     }

From f96249cc1d52d34edcfb0ca246bdd78f62552b91 Mon Sep 17 00:00:00 2001
From: Marcelo Altmann <marcelo@readyset.io>
Date: Mon, 29 Sep 2025 11:25:24 -0300
Subject: [PATCH 3/3] Added pending tokens for c-style hints

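Add a list of pending tokens to the tokenizer so that every token
inside a C-style hint comment is returned, instead of collapsing the
whole hint into a single word. Also rename the dialect method
`supports_c_style_comments` to `supports_c_style_hints`.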
---
 src/dialect/generic.rs |  2 +-
 src/dialect/mod.rs     |  2 +-
 src/dialect/mysql.rs   |  2 +-
 src/tokenizer.rs       | 72 ++++++++++++++++++++++++++----------------
 4 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
index c09a9890b..9f86b6f57 100644
--- a/src/dialect/generic.rs
+++ b/src/dialect/generic.rs
@@ -156,7 +156,7 @@ impl Dialect for GenericDialect {
         true
     }
 
-    fn supports_c_style_comments(&self) -> bool {
+    fn supports_c_style_hints(&self) -> bool {
         true
     }
 
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 3f7545d34..2481f42d9 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -900,7 +900,7 @@ pub trait Dialect: Debug + Any {
 
     /// Returns true if the dialect supports hint and C-style comments
     /// e.g. `/*! hint */`
-    fn supports_c_style_comments(&self) -> bool {
+    fn supports_c_style_hints(&self) -> bool {
         false
     }
 
diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs
index b9f5415b8..be8a9283a 100644
--- a/src/dialect/mysql.rs
+++ b/src/dialect/mysql.rs
@@ -85,7 +85,7 @@ impl Dialect for MySqlDialect {
     }
 
     /// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
-    fn supports_c_style_comments(&self) -> bool {
+    fn supports_c_style_hints(&self) -> bool {
         true
     }
 
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 979542272..92282d7dd 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -810,6 +810,8 @@ pub struct Tokenizer<'a> {
     /// If true (the default), the tokenizer will un-escape literal
     /// SQL strings See [`Tokenizer::with_unescape`] for more details.
     unescape: bool,
+    /// Tokens injected back into the stream (e.g. from MySQL C-style hints)
+    pending_tokens: Vec<Token>,
 }
 
 impl<'a> Tokenizer<'a> {
@@ -834,6 +836,7 @@ impl<'a> Tokenizer<'a> {
             dialect,
             query,
             unescape: true,
+            pending_tokens: Vec::new(),
         }
     }
 
@@ -936,10 +939,16 @@ impl<'a> Tokenizer<'a> {
 
     /// Get the next token or return None
     fn next_token(
-        &self,
+        &mut self,
         chars: &mut State,
         prev_token: Option<&Token>,
     ) -> Result<Option<Token>, TokenizerError> {
+        // Return any previously injected tokens first
+        {
+            if let Some(tok) = self.pending_tokens.pop() {
+                return Ok(Some(tok));
+            }
+        }
         match chars.peek() {
             Some(&ch) => match ch {
                 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -2102,14 +2111,14 @@ impl<'a> Tokenizer<'a> {
     }
 
     fn tokenize_multiline_comment(
-        &self,
+        &mut self,
         chars: &mut State,
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
         let mut c_style_comments = false;
         let supports_nested_comments = self.dialect.supports_nested_comments();
-        let supports_c_style_comments = self.dialect.supports_c_style_comments();
+        let supports_c_style_comments = self.dialect.supports_c_style_hints();
         loop {
             match chars.next() {
                 Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@@ -2120,37 +2129,21 @@ impl<'a> Tokenizer<'a> {
                 }
                 Some('!') if supports_c_style_comments => {
                     c_style_comments = true;
-                    // consume the optional version digits and whitespace
+                    // consume only version digits (leave following whitespace/content intact)
                     while let Some(&c) = chars.peek() {
-                        if c.is_ascii_digit() || c.is_whitespace() {
+                        if c.is_ascii_digit() {
                             chars.next();
                         } else {
                             break;
                         }
                     }
                 }
-                // consume all leading whitespaces until the '*/' character if in a C-style comment
-                Some(ch) if ch.is_whitespace() && c_style_comments => {
-                    let mut tmp_s = String::new();
-                    while let Some(c) = chars.next() {
-                        if c.is_whitespace() {
-                            tmp_s.push(c);
-                        } else if c == '*' && chars.peek() == Some(&'/') {
-                            chars.next(); // consume the '/'
-                            return Ok(Some(Token::make_word(&s, None)));
-                        } else {
-                            tmp_s.push(c);
-                            s.push_str(&tmp_s);
-                            break;
-                        }
-                    }
-                }
                 Some('*') if matches!(chars.peek(), Some('/')) => {
                     chars.next(); // consume the '/'
                     nested -= 1;
                     if nested == 0 {
                         if c_style_comments {
-                            break Ok(Some(Token::make_word(&s, None)));
+                            break self.inject_tokens_from_c_style_hints_and_return_first(s);
                         }
                         break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                     }
@@ -2170,6 +2163,26 @@ impl<'a> Tokenizer<'a> {
         }
     }
 
+    /// Tokenize the body of a C-style hint with the same dialect/unescape settings and
+    /// queue the tokens so they are returned before any further characters from the main
+    /// stream. Returns the first token; an empty hint is returned as an ordinary comment.
+    fn inject_tokens_from_c_style_hints_and_return_first(
+        &mut self,
+        inner_sql: String,
+    ) -> Result<Option<Token>, TokenizerError> {
+        let trimmed = inner_sql.trim();
+        if trimmed.is_empty() {
+            // `Ok(None)` is how `next_token` reports that no token is left, so returning it
+            // here would cut the statement short; hand an empty hint back as a comment.
+            return Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(inner_sql))));
+        }
+        let mut inner = Tokenizer::new(self.dialect, trimmed).with_unescape(self.unescape);
+        let tokens = inner.tokenize()?;
+        // store reversed so `pop()` hands the tokens back in source order
+        self.pending_tokens.extend(tokens.into_iter().rev());
+        Ok(self.pending_tokens.pop())
+    }
+
     fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
         let mut last_char = None;
         let mut s = String::new();
@@ -4121,17 +4134,22 @@ mod tests {
 
     #[test]
     fn tokenize_multiline_comment_with_c_style_comment_and_version() {
-        let sql = String::from("0/*!8000000 word */1");
-
+        let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
         let dialect = MySqlDialect {};
-        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
         let expected = vec![
             Token::Number("0".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
             Token::Word(Word {
-                value: "word".to_string(),
+                value: "KEY_BLOCK_SIZE".to_string(),
                 quote_style: None,
-                keyword: Keyword::NoKeyword,
+                keyword: Keyword::KEY_BLOCK_SIZE,
             }),
+            Token::Whitespace(Whitespace::Space),
+            Token::Eq,
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1024".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
             Token::Number("1".to_string(), false),
         ];
         compare(expected, tokens);