Improve parsing of numbers in SQL tokenizers (#359)

DataDog · Nov 19, 2024 · eb80490 · eb80490
1 parent 5dc5061
commit eb80490
Show file tree

Hide file tree

Showing 20 changed files with 273 additions and 83 deletions.
diff --git a/src/condition/sqli_detector.cpp b/src/condition/sqli_detector.cpp
@@ -484,7 +484,6 @@ sqli_result sqli_impl(std::string_view resource, std::vector<sql_token> &resourc
                 return sqli_error::invalid_sql;
             }
         }
-
         auto [param_tokens, param_tokens_begin] =
             get_consecutive_tokens(resource_tokens, param_index, param_index + value.size());
         if (param_tokens.empty()) {

diff --git a/src/condition/sqli_detector.hpp b/src/condition/sqli_detector.hpp
@@ -13,7 +13,7 @@ namespace ddwaf {
 
 class sqli_detector : public base_impl<sqli_detector> {
 public:
-    static constexpr unsigned version = 2;
+    static constexpr unsigned version = 3;
     static constexpr std::array<std::string_view, 3> param_names{"resource", "params", "db_type"};
 
     explicit sqli_detector(std::vector<condition_parameter> args, const object_limits &limits = {})

diff --git a/src/tokenizer/base.hpp b/src/tokenizer/base.hpp
@@ -91,6 +91,10 @@ template <typename T> class base_tokenizer {
         }
     }
 
+    // Returns the type of the most recently stored token (tokens_.back()).
+    // NOTE(review): presumably tokens_ is a standard sequence container, in
+    // which case calling this while it is empty is undefined behaviour, so
+    // callers should check tokens_.empty() first.
+    T current_token_type() const { return tokens_.back().type; }
+
+    // Returns a mutable reference to the most recently stored token
+    // (tokens_.back()).
+    // NOTE(review): presumably requires tokens_ to be non-empty, and the
+    // reference is only valid until tokens_ is next modified — confirm against
+    // the container type.
+    base_token<T> &current_token() { return tokens_.back(); }
+
     std::string_view buffer_;
     std::size_t idx_{0};
     std::unordered_set<T> skip_tokens_{};

diff --git a/src/tokenizer/generic_sql.cpp b/src/tokenizer/generic_sql.cpp
@@ -100,18 +100,28 @@ void generic_sql_tokenizer::tokenize_eol_comment()
     emplace_token(token);
 }
 
-void generic_sql_tokenizer::tokenize_eol_comment_or_operator()
+// Handles a token starting with '-': "--" starts an end-of-line comment;
+// anything else is either a numeric literal or a binary operator, which is
+// resolved by the shared sql_tokenizer helper.
+void generic_sql_tokenizer::tokenize_eol_comment_or_operator_or_number()
 {
     if (next() == '-') {
         tokenize_eol_comment();
         return;
     }
 
-    sql_token token;
-    token.index = index();
-    token.str = substr(token.index, 1);
-    token.type = sql_token_type::binary_operator;
-    emplace_token(token);
+    // Same number-vs-operator decision as the '+' path: reuse the shared
+    // helper instead of duplicating its body here.
+    tokenize_operator_or_number();
 }
 
 std::vector<sql_token> generic_sql_tokenizer::tokenize_impl()
@@ -149,7 +159,7 @@ std::vector<sql_token> generic_sql_tokenizer::tokenize_impl()
         } else if (c == '/') {
             tokenize_inline_comment_or_operator();
         } else if (c == '-') {
-            tokenize_eol_comment_or_operator();
+            tokenize_eol_comment_or_operator_or_number();
         } else if (c == '#') {
             tokenize_eol_comment();
         } else if (c == '!' || c == '>') {
@@ -161,8 +171,10 @@ std::vector<sql_token> generic_sql_tokenizer::tokenize_impl()
             } else {
                 add_token(sql_token_type::binary_operator);
             }
-        } else if (c == '=' || c == '%' || c == '+') {
+        } else if (c == '=' || c == '%') {
             add_token(sql_token_type::binary_operator);
+        } else if (c == '+') {
+            tokenize_operator_or_number();
         } else if (c == '|') {
             if (next() == '|') {
                 add_token(sql_token_type::binary_operator, 2);

diff --git a/src/tokenizer/generic_sql.hpp b/src/tokenizer/generic_sql.hpp
@@ -27,7 +27,7 @@ class generic_sql_tokenizer : public sql_tokenizer<generic_sql_tokenizer> {
     void tokenize_keyword_operator_or_identifier();
     void tokenize_inline_comment_or_operator();
     void tokenize_eol_comment();
-    void tokenize_eol_comment_or_operator();
+    void tokenize_eol_comment_or_operator_or_number();
 
     friend class sql_tokenizer<generic_sql_tokenizer>;
 };

diff --git a/src/tokenizer/mysql.cpp b/src/tokenizer/mysql.cpp
@@ -109,11 +109,27 @@ mysql_tokenizer::mysql_tokenizer(
     }
 }
 
-void mysql_tokenizer::tokenize_keyword_operator_or_identifier()
+void mysql_tokenizer::tokenize_string_keyword_operator_or_identifier()
 {
     sql_token token;
     token.index = index();
 
+    auto c = ddwaf::tolower(peek());
+    auto n = next();
+
+    // Bit-value (b'...') or hexadecimal (x'...') literal
+    if ((c == 'b' || c == 'x') && n == '\'') {
+        advance();
+        // The substring won't contain the prefix, but it's not required
+        // Also, bit-strings have a reduced character set not taken into
+        // consideration here, however it probably also doesn't make a
+        // difference to us since we're not using the value.
+        token.str = extract_unescaped_string('\'');
+        token.type = sql_token_type::number;
+        emplace_token(token);
+        return;
+    }
+
     auto remaining_str = substr(index());
 
     re2::StringPiece binary_op;
@@ -221,7 +237,7 @@ void mysql_tokenizer::tokenize_eol_comment()
     emplace_token(token);
 }
 
-void mysql_tokenizer::tokenize_eol_comment_or_operator()
+void mysql_tokenizer::tokenize_eol_comment_or_operator_or_number()
 {
     auto n = next();
     auto n2 = next(2);
@@ -237,6 +253,20 @@ void mysql_tokenizer::tokenize_eol_comment_or_operator()
         return;
     }
 
+    if (tokens_.empty() || current_token_type() != sql_token_type::number) {
+        auto number_str = extract_number();
+        if (!number_str.empty()) {
+            sql_token token;
+            token.index = index();
+            token.type = sql_token_type::number;
+            token.str = number_str;
+            advance(number_str.size() - 1);
+            emplace_token(token);
+            return;
+        }
+    }
+
+    // If it's not a number, it must be an operator
     add_token(sql_token_type::binary_operator);
 }
 
@@ -247,7 +277,7 @@ std::vector<sql_token> mysql_tokenizer::tokenize_impl()
         // TODO use an array of characters or a giant switch?
         if (ddwaf::isalpha(c) || c == '_' || static_cast<uint8_t>(c) > 0x7f) {
             // Command or identifier
-            tokenize_keyword_operator_or_identifier();
+            tokenize_string_keyword_operator_or_identifier();
         } else if (ddwaf::isdigit(c)) {
             tokenize_number_or_identifier();
         } else if (c == '"') {
@@ -284,7 +314,7 @@ std::vector<sql_token> mysql_tokenizer::tokenize_impl()
         } else if (c == '/') {
             tokenize_inline_comment_or_operator();
         } else if (c == '-') {
-            tokenize_eol_comment_or_operator();
+            tokenize_eol_comment_or_operator_or_number();
         } else if (c == '#') {
             tokenize_eol_comment();
         } else if (c == '@') {
@@ -311,8 +341,10 @@ std::vector<sql_token> mysql_tokenizer::tokenize_impl()
             } else {
                 add_token(sql_token_type::binary_operator);
             }
-        } else if (c == '=' || c == '%' || c == '+') {
+        } else if (c == '=' || c == '%') {
             add_token(sql_token_type::binary_operator);
+        } else if (c == '+') {
+            tokenize_operator_or_number();
         } else if (c == '|') {
             if (next() == '|') {
                 add_token(sql_token_type::binary_operator, 2);

diff --git a/src/tokenizer/mysql.hpp b/src/tokenizer/mysql.hpp
@@ -22,10 +22,10 @@ class mysql_tokenizer : public sql_tokenizer<mysql_tokenizer> {
 protected:
     std::vector<sql_token> tokenize_impl();
 
-    void tokenize_keyword_operator_or_identifier();
+    void tokenize_string_keyword_operator_or_identifier();
     void tokenize_inline_comment_or_operator();
     void tokenize_eol_comment();
-    void tokenize_eol_comment_or_operator();
+    void tokenize_eol_comment_or_operator_or_number();
     void tokenize_number_or_identifier();
     void tokenize_variable();
 

diff --git a/src/tokenizer/pgsql.cpp b/src/tokenizer/pgsql.cpp
@@ -71,15 +71,25 @@ void pgsql_tokenizer::tokenize_string_keyword_operator_or_identifier()
         return;
     }
 
-    // Escaped string or bit-string
-    if ((c == 'e' || c == 'b' || c == 'x') && n == '\'') {
+    // Escaped string
+    if (c == 'e' && n == '\'') {
+        advance();
+        // The substring won't contain the prefix, but it's not required
+        token.str = extract_escaped_string('\'');
+        token.type = sql_token_type::single_quoted_string;
+        emplace_token(token);
+        return;
+    }
+
+    // Bit-string
+    if ((c == 'b' || c == 'x') && n == '\'') {
         advance();
         // The substring won't contain the prefix, but it's not required
         // Also, bit-strings have a reduced character set not taken into
         // consideration here, however it probably also doesn't make a
         // difference to us since we're not using the value.
-        token.str = c == 'e' ? extract_escaped_string('\'') : extract_unescaped_string('\'');
-        token.type = sql_token_type::single_quoted_string;
+        token.str = extract_unescaped_string('\'');
+        token.type = sql_token_type::number;
         emplace_token(token);
         return;
     }
@@ -145,7 +155,7 @@ void pgsql_tokenizer::tokenize_eol_comment()
     emplace_token(token);
 }
 
-void pgsql_tokenizer::tokenize_eol_comment_or_operator()
+void pgsql_tokenizer::tokenize_eol_comment_or_operator_or_number()
 {
     auto n = next();
     if (n == '-') {
@@ -158,6 +168,20 @@ void pgsql_tokenizer::tokenize_eol_comment_or_operator()
         return;
     }
 
+    if (tokens_.empty() || current_token_type() != sql_token_type::number) {
+        auto number_str = extract_number();
+        if (!number_str.empty()) {
+            sql_token token;
+            token.index = index();
+            token.type = sql_token_type::number;
+            token.str = number_str;
+            advance(number_str.size() - 1);
+            emplace_token(token);
+            return;
+        }
+    }
+
+    // If it's not a number, it must be an operator
     add_token(sql_token_type::binary_operator);
 }
 
@@ -258,6 +282,8 @@ std::vector<sql_token> pgsql_tokenizer::tokenize_impl()
                 add_token(sql_token_type::binary_operator, next(2) == '>' ? 3 : 2);
             } else if (n == '-') {
                 add_token(sql_token_type::binary_operator, 2);
+            } else {
+                add_token(sql_token_type::bitwise_operator);
             }
         } else if (c == '*') {
             add_token(sql_token_type::asterisk);
@@ -266,7 +292,7 @@ std::vector<sql_token> pgsql_tokenizer::tokenize_impl()
         } else if (c == '/') {
             tokenize_inline_comment_or_operator();
         } else if (c == '-') {
-            tokenize_eol_comment_or_operator();
+            tokenize_eol_comment_or_operator_or_number();
         } else if (c == '@') {
             auto n = next();
             if (n == '@' || n == '>') {
@@ -290,8 +316,10 @@ std::vector<sql_token> pgsql_tokenizer::tokenize_impl()
             } else {
                 add_token(sql_token_type::binary_operator);
             }
-        } else if (c == '=' || c == '%' || c == '+') {
+        } else if (c == '=' || c == '%') {
             add_token(sql_token_type::binary_operator);
+        } else if (c == '+') {
+            tokenize_operator_or_number();
         } else if (c == '|') {
             if (next() == '|') {
                 add_token(sql_token_type::binary_operator, 2);

diff --git a/src/tokenizer/pgsql.hpp b/src/tokenizer/pgsql.hpp
@@ -25,7 +25,7 @@ class pgsql_tokenizer : public sql_tokenizer<pgsql_tokenizer> {
     void tokenize_string_keyword_operator_or_identifier();
     void tokenize_inline_comment_or_operator();
     void tokenize_eol_comment();
-    void tokenize_eol_comment_or_operator();
+    void tokenize_eol_comment_or_operator_or_number();
     void tokenize_dollar_quoted_string();
     void tokenize_dollar_string_or_identifier();
 

diff --git a/src/tokenizer/shell.hpp b/src/tokenizer/shell.hpp
@@ -99,10 +99,6 @@ class shell_tokenizer : protected base_tokenizer<shell_token_type> {
         current_shell_scope_ = shell_scope_stack_.back();
     }
 
-    shell_token_type current_token_type() const { return tokens_.back().type; }
-
-    shell_token &current_token() { return tokens_.back(); }
-
     template <typename T, typename... Rest>
     bool match_nth_nonws_token_descending(std::size_t n, T expected, Rest... args) const
     {

diff --git a/src/tokenizer/sql_base.cpp b/src/tokenizer/sql_base.cpp
@@ -23,7 +23,7 @@ namespace ddwaf {
 namespace {
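-// Hexadecimal, octal, decimal or floating point
+// Hexadecimal, binary, octal, decimal or floating point; only the decimal and
+// floating point forms accept a leading sign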
 re2::RE2 number_regex(
-    R"((?i)^(0[Xx][0-9a-fA-F](?:[0-9a-fA-F]*|_[0-9a-fA-F])*|0[Bb][01](?:[01]|_[01])*|0[Oo][0-7](?:[0-7]|_[0-7])*|(?:(?:[0-9](?:[0-9]|_[0-9])*)(?:\.[0-9](?:[0-9]|_[0-9])*)?(?:[eE][+-]?[0-9](?:[0-9]|_[0-9])*)?))(?:\b|\s|$))");
+    R"((?i)^(0[Xx][0-9a-fA-F](?:[0-9a-fA-F]*|_[0-9a-fA-F])*|0[Bb][01](?:[01]|_[01])*|0[Oo][0-7](?:[0-7]|_[0-7])*|(?:(?:[-+]?[0-9](?:[0-9]|_[0-9])*)(?:\.[0-9](?:[0-9]|_[0-9])*)?(?:[eE][+-]?[0-9](?:[0-9]|_[0-9])*)?))(?:\b|\s|$))");
 
 } // namespace
 
@@ -271,6 +271,25 @@ template <typename T> void sql_tokenizer<T>::tokenize_number()
     }
 }
 
+// Handles a token starting with '+' or '-'. Unless the previously emitted
+// token is itself a number (in which case the sign is treated as an operator),
+// try to read a numeric literal at the current position via extract_number();
+// if none is found, the character is emitted as a binary operator.
+template <typename T> void sql_tokenizer<T>::tokenize_operator_or_number()
+{
+    const bool sign_may_start_number =
+        tokens_.empty() || current_token_type() != sql_token_type::number;
+
+    if (sign_may_start_number) {
+        if (auto literal = extract_number(); !literal.empty()) {
+            sql_token number_token;
+            number_token.index = index();
+            number_token.str = literal;
+            number_token.type = sql_token_type::number;
+            // Advance by one less than the literal length.
+            // NOTE(review): presumably the final character is consumed by
+            // emplace_token or the main tokenizer loop — confirm.
+            advance(literal.size() - 1);
+            emplace_token(number_token);
+            return;
+        }
+    }
+
+    // No numeric literal here: treat the sign as an operator
+    add_token(sql_token_type::binary_operator);
+}
+
 template class sql_tokenizer<pgsql_tokenizer>;
 template class sql_tokenizer<mysql_tokenizer>;
 template class sql_tokenizer<sqlite_tokenizer>;

diff --git a/src/tokenizer/sql_base.hpp b/src/tokenizer/sql_base.hpp
@@ -81,6 +81,8 @@ template <typename T> class sql_tokenizer : protected base_tokenizer<sql_token_t
     void tokenize_conforming_string(char quote, sql_token_type type);
     void tokenize_escaped_string(char quote, sql_token_type type);
     void tokenize_number();
+    // Assumes the first character is + or -
+    void tokenize_operator_or_number();
 };
 
 } // namespace ddwaf