Skip to content

Commit

Permalink
Improve parsing of numbers in SQL tokenizers (#359)
Browse files Browse the repository at this point in the history
  • Loading branch information
Anilm3 authored Nov 19, 2024
1 parent 5dc5061 commit eb80490
Show file tree
Hide file tree
Showing 20 changed files with 273 additions and 83 deletions.
1 change: 0 additions & 1 deletion src/condition/sqli_detector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,6 @@ sqli_result sqli_impl(std::string_view resource, std::vector<sql_token> &resourc
return sqli_error::invalid_sql;
}
}

auto [param_tokens, param_tokens_begin] =
get_consecutive_tokens(resource_tokens, param_index, param_index + value.size());
if (param_tokens.empty()) {
Expand Down
2 changes: 1 addition & 1 deletion src/condition/sqli_detector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ namespace ddwaf {

class sqli_detector : public base_impl<sqli_detector> {
public:
static constexpr unsigned version = 2;
static constexpr unsigned version = 3;
static constexpr std::array<std::string_view, 3> param_names{"resource", "params", "db_type"};

explicit sqli_detector(std::vector<condition_parameter> args, const object_limits &limits = {})
Expand Down
4 changes: 4 additions & 0 deletions src/tokenizer/base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ template <typename T> class base_tokenizer {
}
}

// Returns the type of the most recently stored token, i.e. tokens_.back().type.
// Precondition: tokens_ must not be empty. No emptiness check is performed here;
// the SQL tokenizers test tokens_.empty() before calling this.
// NOTE(review): tokens_ is presumably a std::vector, for which back() on an
// empty container is undefined behaviour - confirm against its declaration.
T current_token_type() const { return tokens_.back().type; }

// Returns a mutable reference to the most recently stored token (tokens_.back()).
// Precondition: tokens_ must not be empty; no emptiness check is performed here.
// NOTE(review): the reference aliases an element of tokens_, so it presumably
// becomes invalid once further tokens are appended - confirm the container type.
base_token<T> &current_token() { return tokens_.back(); }

std::string_view buffer_;
std::size_t idx_{0};
std::unordered_set<T> skip_tokens_{};
Expand Down
28 changes: 20 additions & 8 deletions src/tokenizer/generic_sql.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,28 @@ void generic_sql_tokenizer::tokenize_eol_comment()
emplace_token(token);
}

void generic_sql_tokenizer::tokenize_eol_comment_or_operator()
void generic_sql_tokenizer::tokenize_eol_comment_or_operator_or_number()
{
if (next() == '-') {
tokenize_eol_comment();
return;
}

sql_token token;
token.index = index();
token.str = substr(token.index, 1);
token.type = sql_token_type::binary_operator;
emplace_token(token);
if (tokens_.empty() || current_token_type() != sql_token_type::number) {
auto number_str = extract_number();
if (!number_str.empty()) {
sql_token token;
token.index = index();
token.type = sql_token_type::number;
token.str = number_str;
advance(number_str.size() - 1);
emplace_token(token);
return;
}
}

// If it's not a number, it must be an operator
add_token(sql_token_type::binary_operator);
}

std::vector<sql_token> generic_sql_tokenizer::tokenize_impl()
Expand Down Expand Up @@ -149,7 +159,7 @@ std::vector<sql_token> generic_sql_tokenizer::tokenize_impl()
} else if (c == '/') {
tokenize_inline_comment_or_operator();
} else if (c == '-') {
tokenize_eol_comment_or_operator();
tokenize_eol_comment_or_operator_or_number();
} else if (c == '#') {
tokenize_eol_comment();
} else if (c == '!' || c == '>') {
Expand All @@ -161,8 +171,10 @@ std::vector<sql_token> generic_sql_tokenizer::tokenize_impl()
} else {
add_token(sql_token_type::binary_operator);
}
} else if (c == '=' || c == '%' || c == '+') {
} else if (c == '=' || c == '%') {
add_token(sql_token_type::binary_operator);
} else if (c == '+') {
tokenize_operator_or_number();
} else if (c == '|') {
if (next() == '|') {
add_token(sql_token_type::binary_operator, 2);
Expand Down
2 changes: 1 addition & 1 deletion src/tokenizer/generic_sql.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class generic_sql_tokenizer : public sql_tokenizer<generic_sql_tokenizer> {
void tokenize_keyword_operator_or_identifier();
void tokenize_inline_comment_or_operator();
void tokenize_eol_comment();
void tokenize_eol_comment_or_operator();
void tokenize_eol_comment_or_operator_or_number();

friend class sql_tokenizer<generic_sql_tokenizer>;
};
Expand Down
42 changes: 37 additions & 5 deletions src/tokenizer/mysql.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,27 @@ mysql_tokenizer::mysql_tokenizer(
}
}

void mysql_tokenizer::tokenize_keyword_operator_or_identifier()
void mysql_tokenizer::tokenize_string_keyword_operator_or_identifier()
{
sql_token token;
token.index = index();

auto c = ddwaf::tolower(peek());
auto n = next();

// Bit-string
if ((c == 'b' || c == 'x') && n == '\'') {
advance();
// The substring won't contain the prefix, but it's not required
// Also, bit-strings have a reduced character set not taken into
// consideration here, however it probably also doesn't make a
// difference to us since we're not using the value.
token.str = extract_unescaped_string('\'');
token.type = sql_token_type::number;
emplace_token(token);
return;
}

auto remaining_str = substr(index());

re2::StringPiece binary_op;
Expand Down Expand Up @@ -221,7 +237,7 @@ void mysql_tokenizer::tokenize_eol_comment()
emplace_token(token);
}

void mysql_tokenizer::tokenize_eol_comment_or_operator()
void mysql_tokenizer::tokenize_eol_comment_or_operator_or_number()
{
auto n = next();
auto n2 = next(2);
Expand All @@ -237,6 +253,20 @@ void mysql_tokenizer::tokenize_eol_comment_or_operator()
return;
}

if (tokens_.empty() || current_token_type() != sql_token_type::number) {
auto number_str = extract_number();
if (!number_str.empty()) {
sql_token token;
token.index = index();
token.type = sql_token_type::number;
token.str = number_str;
advance(number_str.size() - 1);
emplace_token(token);
return;
}
}

// If it's not a number, it must be an operator
add_token(sql_token_type::binary_operator);
}

Expand All @@ -247,7 +277,7 @@ std::vector<sql_token> mysql_tokenizer::tokenize_impl()
// TODO use an array of characters or a giant switch?
if (ddwaf::isalpha(c) || c == '_' || static_cast<uint8_t>(c) > 0x7f) {
// Command or identifier
tokenize_keyword_operator_or_identifier();
tokenize_string_keyword_operator_or_identifier();
} else if (ddwaf::isdigit(c)) {
tokenize_number_or_identifier();
} else if (c == '"') {
Expand Down Expand Up @@ -284,7 +314,7 @@ std::vector<sql_token> mysql_tokenizer::tokenize_impl()
} else if (c == '/') {
tokenize_inline_comment_or_operator();
} else if (c == '-') {
tokenize_eol_comment_or_operator();
tokenize_eol_comment_or_operator_or_number();
} else if (c == '#') {
tokenize_eol_comment();
} else if (c == '@') {
Expand All @@ -311,8 +341,10 @@ std::vector<sql_token> mysql_tokenizer::tokenize_impl()
} else {
add_token(sql_token_type::binary_operator);
}
} else if (c == '=' || c == '%' || c == '+') {
} else if (c == '=' || c == '%') {
add_token(sql_token_type::binary_operator);
} else if (c == '+') {
tokenize_operator_or_number();
} else if (c == '|') {
if (next() == '|') {
add_token(sql_token_type::binary_operator, 2);
Expand Down
4 changes: 2 additions & 2 deletions src/tokenizer/mysql.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ class mysql_tokenizer : public sql_tokenizer<mysql_tokenizer> {
protected:
std::vector<sql_token> tokenize_impl();

void tokenize_keyword_operator_or_identifier();
void tokenize_string_keyword_operator_or_identifier();
void tokenize_inline_comment_or_operator();
void tokenize_eol_comment();
void tokenize_eol_comment_or_operator();
void tokenize_eol_comment_or_operator_or_number();
void tokenize_number_or_identifier();
void tokenize_variable();

Expand Down
42 changes: 35 additions & 7 deletions src/tokenizer/pgsql.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,25 @@ void pgsql_tokenizer::tokenize_string_keyword_operator_or_identifier()
return;
}

// Escaped string or bit-string
if ((c == 'e' || c == 'b' || c == 'x') && n == '\'') {
// Escaped string
if (c == 'e' && n == '\'') {
advance();
// The substring won't contain the prefix, but it's not required
token.str = extract_escaped_string('\'');
token.type = sql_token_type::single_quoted_string;
emplace_token(token);
return;
}

// Bit-string
if ((c == 'b' || c == 'x') && n == '\'') {
advance();
// The substring won't contain the prefix, but it's not required
// Also, bit-strings have a reduced character set not taken into
// consideration here, however it probably also doesn't make a
// difference to us since we're not using the value.
token.str = c == 'e' ? extract_escaped_string('\'') : extract_unescaped_string('\'');
token.type = sql_token_type::single_quoted_string;
token.str = extract_unescaped_string('\'');
token.type = sql_token_type::number;
emplace_token(token);
return;
}
Expand Down Expand Up @@ -145,7 +155,7 @@ void pgsql_tokenizer::tokenize_eol_comment()
emplace_token(token);
}

void pgsql_tokenizer::tokenize_eol_comment_or_operator()
void pgsql_tokenizer::tokenize_eol_comment_or_operator_or_number()
{
auto n = next();
if (n == '-') {
Expand All @@ -158,6 +168,20 @@ void pgsql_tokenizer::tokenize_eol_comment_or_operator()
return;
}

if (tokens_.empty() || current_token_type() != sql_token_type::number) {
auto number_str = extract_number();
if (!number_str.empty()) {
sql_token token;
token.index = index();
token.type = sql_token_type::number;
token.str = number_str;
advance(number_str.size() - 1);
emplace_token(token);
return;
}
}

// If it's not a number, it must be an operator
add_token(sql_token_type::binary_operator);
}

Expand Down Expand Up @@ -258,6 +282,8 @@ std::vector<sql_token> pgsql_tokenizer::tokenize_impl()
add_token(sql_token_type::binary_operator, next(2) == '>' ? 3 : 2);
} else if (n == '-') {
add_token(sql_token_type::binary_operator, 2);
} else {
add_token(sql_token_type::bitwise_operator);
}
} else if (c == '*') {
add_token(sql_token_type::asterisk);
Expand All @@ -266,7 +292,7 @@ std::vector<sql_token> pgsql_tokenizer::tokenize_impl()
} else if (c == '/') {
tokenize_inline_comment_or_operator();
} else if (c == '-') {
tokenize_eol_comment_or_operator();
tokenize_eol_comment_or_operator_or_number();
} else if (c == '@') {
auto n = next();
if (n == '@' || n == '>') {
Expand All @@ -290,8 +316,10 @@ std::vector<sql_token> pgsql_tokenizer::tokenize_impl()
} else {
add_token(sql_token_type::binary_operator);
}
} else if (c == '=' || c == '%' || c == '+') {
} else if (c == '=' || c == '%') {
add_token(sql_token_type::binary_operator);
} else if (c == '+') {
tokenize_operator_or_number();
} else if (c == '|') {
if (next() == '|') {
add_token(sql_token_type::binary_operator, 2);
Expand Down
2 changes: 1 addition & 1 deletion src/tokenizer/pgsql.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class pgsql_tokenizer : public sql_tokenizer<pgsql_tokenizer> {
void tokenize_string_keyword_operator_or_identifier();
void tokenize_inline_comment_or_operator();
void tokenize_eol_comment();
void tokenize_eol_comment_or_operator();
void tokenize_eol_comment_or_operator_or_number();
void tokenize_dollar_quoted_string();
void tokenize_dollar_string_or_identifier();

Expand Down
4 changes: 0 additions & 4 deletions src/tokenizer/shell.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,6 @@ class shell_tokenizer : protected base_tokenizer<shell_token_type> {
current_shell_scope_ = shell_scope_stack_.back();
}

shell_token_type current_token_type() const { return tokens_.back().type; }

shell_token &current_token() { return tokens_.back(); }

template <typename T, typename... Rest>
bool match_nth_nonws_token_descending(std::size_t n, T expected, Rest... args) const
{
Expand Down
21 changes: 20 additions & 1 deletion src/tokenizer/sql_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ namespace ddwaf {
namespace {
// Hexadecimal, octal, decimal or floating point
re2::RE2 number_regex(
R"((?i)^(0[Xx][0-9a-fA-F](?:[0-9a-fA-F]*|_[0-9a-fA-F])*|0[Bb][01](?:[01]|_[01])*|0[Oo][0-7](?:[0-7]|_[0-7])*|(?:(?:[0-9](?:[0-9]|_[0-9])*)(?:\.[0-9](?:[0-9]|_[0-9])*)?(?:[eE][+-]?[0-9](?:[0-9]|_[0-9])*)?))(?:\b|\s|$))");
R"((?i)^(0[Xx][0-9a-fA-F](?:[0-9a-fA-F]*|_[0-9a-fA-F])*|0[Bb][01](?:[01]|_[01])*|0[Oo][0-7](?:[0-7]|_[0-7])*|(?:(?:[-+]?[0-9](?:[0-9]|_[0-9])*)(?:\.[0-9](?:[0-9]|_[0-9])*)?(?:[eE][+-]?[0-9](?:[0-9]|_[0-9])*)?))(?:\b|\s|$))");

} // namespace

Expand Down Expand Up @@ -271,6 +271,25 @@ template <typename T> void sql_tokenizer<T>::tokenize_number()
}
}

// Tokenizes a leading sign character either as the start of a signed numeric
// literal or as a binary operator.
//
// A sign is only considered part of a number when it does not directly follow
// another number token, so an expression such as "1 +2" still yields
// number, operator, number rather than two adjacent numbers.
template <typename T> void sql_tokenizer<T>::tokenize_operator_or_number()
{
    const bool follows_number =
        !tokens_.empty() && current_token_type() == sql_token_type::number;

    if (!follows_number) {
        // NOTE(review): extract_number() presumably matches number_regex at the
        // current position and returns an empty result on no match - confirm.
        if (auto literal = extract_number(); !literal.empty()) {
            sql_token number_token;
            // The token starts at the sign, so record the index before advancing.
            number_token.index = index();
            number_token.type = sql_token_type::number;
            number_token.str = literal;
            // Skip all but the last character of the literal.
            // NOTE(review): the final character is presumably consumed by
            // emplace_token() or by the caller's loop - confirm.
            advance(literal.size() - 1);
            emplace_token(number_token);
            return;
        }
    }

    // Not a signed number in this context: treat the sign as an operator.
    add_token(sql_token_type::binary_operator);
}

template class sql_tokenizer<pgsql_tokenizer>;
template class sql_tokenizer<mysql_tokenizer>;
template class sql_tokenizer<sqlite_tokenizer>;
Expand Down
2 changes: 2 additions & 0 deletions src/tokenizer/sql_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ template <typename T> class sql_tokenizer : protected base_tokenizer<sql_token_t
void tokenize_conforming_string(char quote, sql_token_type type);
void tokenize_escaped_string(char quote, sql_token_type type);
void tokenize_number();
// Assumes the first character is + or -
void tokenize_operator_or_number();
};

} // namespace ddwaf
Loading

0 comments on commit eb80490

Please sign in to comment.