diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 69987b0ef4..b4bd27b25a 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -774,4 +774,7 @@ if(CLP_BUILD_TESTING) target_compile_features(unitTest PRIVATE cxx_std_20 ) + target_compile_definitions(unitTest + PRIVATE CLP_BUILD_TESTING + ) endif() diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 1a4bf499e2..8c4e225955 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -1,35 +1,36 @@ #include "GrepCore.hpp" +#include #include #include #include #include +#include +#include +#include #include #include +#include "EncodedVariableInterpreter.hpp" #include "ir/parsing.hpp" -#include "LogSurgeonReader.hpp" -#include "QueryToken.hpp" #include "StringReader.hpp" using clp::ir::is_delim; using clp::string_utils::is_alphabet; using clp::string_utils::is_wildcard; +using log_surgeon::SymbolId::TokenFloat; +using log_surgeon::SymbolId::TokenInt; +using log_surgeon::wildcard_query_parser::QueryInterpretation; +using log_surgeon::wildcard_query_parser::StaticQueryToken; +using log_surgeon::wildcard_query_parser::VariableQueryToken; +using std::holds_alternative; +using std::set; using std::string; +using std::unordered_map; +using std::vector; namespace clp { -namespace { -/** - * Wraps the tokens returned from the log_surgeon lexer, and stores the variable ids of the tokens - * in a search query in a set. This allows for optimized search performance. 
- */ -class SearchToken : public log_surgeon::Token { -public: - std::set m_type_ids_set; -}; -} // namespace - bool GrepCore::get_bounds_of_next_potential_var( string const& value, size_t& begin_pos, @@ -150,124 +151,134 @@ bool GrepCore::get_bounds_of_next_potential_var( return (value_length != begin_pos); } -bool GrepCore::get_bounds_of_next_potential_var( - string const& value, - size_t& begin_pos, - size_t& end_pos, - bool& is_var, - log_surgeon::lexers::ByteLexer& lexer -) { - size_t const value_length = value.length(); - if (end_pos >= value_length) { - return false; +auto GrepCore::normalize_interpretations(set const& interpretations) + -> set { + set normalized_interpretations; + for (auto const& interpretation : interpretations) { + QueryInterpretation normalized_interpretation; + for (auto const& token : interpretation.get_logtype()) { + auto const& src_string{ + holds_alternative(token) + ? std::get(token).get_query_substring() + : std::get(token).get_query_substring() + }; + string normalized_string; + normalized_string.reserve(src_string.size()); + for (auto const c : src_string) { + if (c != '*' || normalized_string.empty() || normalized_string.back() != '*') { + normalized_string += c; + } + } + + if (holds_alternative(token)) { + auto const& variable_token{std::get(token)}; + normalized_interpretation.append_variable_token( + variable_token.get_variable_type(), + normalized_string, + variable_token.get_contains_wildcard() + ); + } else { + normalized_interpretation.append_static_token(normalized_string); + } + } + normalized_interpretations.insert(normalized_interpretation); } + return normalized_interpretations; +} - is_var = false; - bool contains_wildcard = false; - while (false == is_var && false == contains_wildcard && begin_pos < value_length) { - // Start search at end of last token - begin_pos = end_pos; +auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& interpretation) + -> vector { + auto const 
logtype{interpretation.get_logtype()}; + vector wildcard_encodable_positions; + wildcard_encodable_positions.reserve(logtype.size()); - // Find variable begin or wildcard - bool is_escaped = false; - for (; begin_pos < value_length; ++begin_pos) { - char c = value[begin_pos]; + for (size_t i{0}; i < logtype.size(); ++i) { + auto const& token{logtype[i]}; + if (holds_alternative(token)) { + auto const& var_token{std::get(token)}; + auto const var_type{static_cast(var_token.get_variable_type())}; + bool const is_int{TokenInt == var_type}; + bool const is_float{TokenFloat == var_type}; + if (var_token.get_contains_wildcard() && (is_int || is_float)) { + wildcard_encodable_positions.push_back(i); + } + } + } + return wildcard_encodable_positions; +} - if (is_escaped) { - is_escaped = false; +auto GrepCore::generate_logtype_string( + QueryInterpretation const& interpretation, + vector const& wildcard_encodable_positions, + vector const& mask_encoded_flags +) -> string { + string logtype_string; - if (false == lexer.is_delimiter(c)) { - // Found escaped non-delimiter, so reverse the index to retain the escape - // character - --begin_pos; - break; - } - } else if ('\\' == c) { - // Escape character - is_escaped = true; - } else { - if (is_wildcard(c)) { - contains_wildcard = true; - break; - } - if (false == lexer.is_delimiter(c)) { - break; - } - } + // Reserve size for `logtype_string`. + size_t logtype_string_size{0}; + auto const logtype{interpretation.get_logtype()}; + for (auto const& token : logtype) { + if (holds_alternative(token)) { + auto const& static_token{std::get(token)}; + logtype_string_size += static_token.get_query_substring().size(); + } else { + logtype_string_size++; } + } + logtype_string.reserve(logtype_string_size); - // Find next delimiter - is_escaped = false; - end_pos = begin_pos; - for (; end_pos < value_length; ++end_pos) { - char c = value[end_pos]; + // Generate `logtype_string`. 
+ for (size_t i{0}; i < logtype.size(); ++i) { + auto const& token{logtype[i]}; + if (holds_alternative(token)) { + logtype_string += std::get(token).get_query_substring(); + continue; + } - if (is_escaped) { - is_escaped = false; + auto const& var_token{std::get(token)}; + auto const& raw_string{var_token.get_query_substring()}; + auto const var_type{static_cast(var_token.get_variable_type())}; + bool const is_int{TokenInt == var_type}; + bool const is_float{TokenFloat == var_type}; - if (lexer.is_delimiter(c)) { - // Found escaped delimiter, so reverse the index to retain the escape character - --end_pos; - break; + if (wildcard_encodable_positions.end() + != std::ranges::find( + wildcard_encodable_positions.begin(), + wildcard_encodable_positions.end(), + i + )) + { + if (mask_encoded_flags[i]) { + if (is_int) { + EncodedVariableInterpreter::add_int_var(logtype_string); + } else { + EncodedVariableInterpreter::add_float_var(logtype_string); } - } else if ('\\' == c) { - // Escape character - is_escaped = true; } else { - if (is_wildcard(c)) { - contains_wildcard = true; - } else if (lexer.is_delimiter(c)) { - // Found delimiter that's not also a wildcard - break; - } + EncodedVariableInterpreter::add_dict_var(logtype_string); } + continue; } - if (end_pos > begin_pos) { - bool has_prefix_wildcard = ('*' == value[begin_pos]) || ('?' == value[begin_pos]); - bool has_suffix_wildcard = ('*' == value[end_pos - 1]) || ('?' == value[end_pos - 1]); - bool has_wildcard_in_middle = false; - for (size_t i = begin_pos + 1; i < end_pos - 1; ++i) { - if (('*' == value[i] || '?' 
== value[i]) && value[i - 1] != '\\') { - has_wildcard_in_middle = true; - break; - } - } - SearchToken search_token; - if (has_wildcard_in_middle || has_prefix_wildcard) { - // DO NOTHING - } else { - StringReader string_reader; - LogSurgeonReader reader_wrapper(string_reader); - log_surgeon::ParserInputBuffer parser_input_buffer; - if (has_suffix_wildcard) { // text* - // TODO: creating a string reader, setting it equal to a string, to read it into - // the ParserInputBuffer, seems like a convoluted way to set a string equal to a - // string, should be improved when adding a SearchParser to log_surgeon - string_reader.open(value.substr(begin_pos, end_pos - begin_pos - 1)); - parser_input_buffer.read_if_safe(reader_wrapper); - lexer.reset(); - lexer.scan_with_wildcard(parser_input_buffer, value[end_pos - 1], search_token); - } else { // no wildcards - string_reader.open(value.substr(begin_pos, end_pos - begin_pos)); - parser_input_buffer.read_if_safe(reader_wrapper); - lexer.reset(); - auto [err, token] = lexer.scan(parser_input_buffer); - if (log_surgeon::ErrorCode::Success != err) { - return false; - } - search_token = SearchToken{token.value()}; - search_token.m_type_ids_set.insert(search_token.m_type_ids_ptr->at(0)); - } - auto const& type = search_token.m_type_ids_ptr->at(0); - if (type != static_cast(log_surgeon::SymbolId::TokenUncaughtString) - && type != static_cast(log_surgeon::SymbolId::TokenEnd)) - { - is_var = true; - } - } + encoded_variable_t encoded_var{0}; + if (is_int + && EncodedVariableInterpreter::convert_string_to_representable_integer_var( + raw_string, + encoded_var + )) + { + EncodedVariableInterpreter::add_int_var(logtype_string); + } else if (is_float + && EncodedVariableInterpreter::convert_string_to_representable_float_var( + raw_string, + encoded_var + )) + { + EncodedVariableInterpreter::add_float_var(logtype_string); + } else { + EncodedVariableInterpreter::add_dict_var(logtype_string); } } - return (value_length != begin_pos); + 
return logtype_string; } } // namespace clp diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 1e477dda86..b85b172e88 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -1,15 +1,19 @@ #ifndef CLP_GREPCORE_HPP #define CLP_GREPCORE_HPP +#include #include #include #include +#include #include #include #include +#include #include #include +#include #include #include @@ -23,7 +27,15 @@ #include "VariableDictionaryReaderReq.hpp" namespace clp { +#ifdef CLP_BUILD_TESTING +class GrepCoreTest; +#endif + class GrepCore { +#ifdef CLP_BUILD_TESTING + friend class GrepCoreTest; +#endif + public: // Methods /** @@ -75,24 +87,6 @@ class GrepCore { bool& is_var ); - /** - * Returns bounds of next potential variable (either a definite variable or a token with - * wildcards) - * @param value String containing token - * @param begin_pos Begin position of last token, changes to begin position of next token - * @param end_pos End position of last token, changes to end position of next token - * @param is_var Whether the token is definitely a variable - * @param lexer DFA for determining if input is in the schema - * @return true if another potential variable was found, false otherwise - */ - static bool get_bounds_of_next_potential_var( - std::string const& value, - size_t& begin_pos, - size_t& end_pos, - bool& is_var, - log_surgeon::lexers::ByteLexer& lexer - ); - private: // Types enum class SubQueryMatchabilityResult : uint8_t { @@ -147,6 +141,129 @@ class GrepCore { bool ignore_case, SubQuery& sub_query ); + + /** + * Normalizes a set of interpretations by collapsing consecutive greedy wildcards ('*') within + * each token. + * + * Consecutive wildcards that span across the boundary of tokens are preserved. + * + * @param interpretations The original set of `QueryInterpretation`s to normalize. + * @return The normalized set of `QueryInterpretation`s. 
+ */ + static auto normalize_interpretations( + std::set const& interpretations + ) -> std::set; + + /** + * Compare all log-surgeon interpretations against the dictionaries to determine the sub queries + * to search for within the archive. + * + * A. For each interpretation we must consider encodable wildcard variables (e.g. (*1)). + * Each such variable introduces a binary choice: + * - 0: treat as a dictionary variable (\d) + * - 1: treat as an encoded variable (\i for integers, \f for floats) + * + * If there are k encodable wildcard variables, then 2^k logtype strings are possible. As a + * result we limit k <= 16. We represent these alternatives using a bitmask. + * + * Example: + * Search query: "a *1 *2 b", + * Interpretation (one of many): "a (*1) (*2) b" + * Possible logtypes (for the above interpretation): + * mask 00 -> "a \d \d b" + * mask 01 -> "a \d \f b" + * mask 10 -> "a \i \d b" + * mask 11 -> "a \i \f b" + * + * B. Each candidate combination becomes a useful subquery only if: + * 1. The logtype exists in the logtype dictionary, and + * 2. Each variable is either: + * a) resolvable in the variable dictionary (for dictionary vars), or + * b) encoded (always assumed valid). + * + * Note: Encoded variables are always assumed to exist in the segment. This is a performance + * trade-off: checking the archive would be slower than decompressing. + * + * @tparam LogTypeDictionaryReaderType Logtype dictionary reader type. + * @tparam VariableDictionaryReaderType Variable dictionary reader type. + * @param interpretations Log-surgeon's interpretations of the search query. + * @param logtype_dict The logtype dictionary. + * @param var_dict The variable dictionary. + * @param ignore_case If true, perform a case-insensitive search. + * @param sub_queries Returns the subqueries to compare against CLP's archives. + * @throw std::runtime_error If there are too many candidate combinations. 
+ */ + template < + LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType + > + static auto generate_schema_sub_queries( + std::set const& + interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + bool ignore_case, + std::vector& sub_queries + ) -> void; + + /** + * Scans the interpretation and returns the indices of all encodable wildcard variables. + * + * An encodable variable is a variable token that: + * - Contains a wildcard (e.g. *1). + * - Is of an encodable type (integer or float). + * + * @param interpretation The `QueryInterpretation` to scan. + * @return A vector of positions of encodable wildcard variables. + */ + static auto get_wildcard_encodable_positions( + log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation + ) -> std::vector; + + /** + * Generates a logtype string from an interpretation, applying a mask to determine which + * encodable wildcard positions are treated as encoded vs dictionary variables. + * - 0: Treat as dictionary variable. + * - 1: Treat as an encoded variable. + * + * @param interpretation The interpretation to convert to a logtype string. + * @param wildcard_encodable_positions A vector of positions of encodable wildcard variables. + * @param mask_encoded_flags A vector indicating whether each variable is mask encoded. + * @return The logtype string corresponding to this combination of encoded variables. + */ + static auto generate_logtype_string( + log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation, + std::vector const& wildcard_encodable_positions, + std::vector const& mask_encoded_flags + ) -> std::string; + + /** + * Process a single variable token for schema subquery generation. + * + * Determines if the variable can be treated as: + * - an encoded variable, + * - a dictionary variable, + * - or requires wildcard dictionary search. 
+ * + * Updates `sub_query` with the appropriate variable encodings. + * + * @tparam VariableDictionaryReaderType Variable dictionary reader type. + * @param variable_token The variable token to process. + * @param var_dict The variable dictionary. + * @param ignore_case If true, perform a case-insensitive search. + * @param is_mask_encoded If the token is an encodable wildcard and is to be encoded. + * @param sub_query Returns the updated sub query object. + * @return True if the variable is encoded or is in the variable dictionary, false otherwise. + */ + template + static auto process_schema_var_token( + log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, + VariableDictionaryReaderType const& var_dict, + bool ignore_case, + bool is_mask_encoded, + SubQuery& sub_query + ) -> bool; }; template < @@ -163,13 +280,15 @@ std::optional GrepCore::process_raw_query( log_surgeon::lexers::ByteLexer& lexer, bool use_heuristic ) { - // Split search_string into tokens with wildcards - std::vector query_tokens; - size_t begin_pos = 0; - size_t end_pos = 0; - bool is_var; - std::string search_string_for_sub_queries{search_string}; + std::vector sub_queries; if (use_heuristic) { + // Split search_string into tokens with wildcards + std::vector query_tokens; + size_t begin_pos = 0; + size_t end_pos = 0; + bool is_var; + std::string search_string_for_sub_queries{search_string}; + // Replace unescaped '?' wildcards with '*' wildcards since we currently have no support for // generating sub-queries with '?' wildcards. The final wildcard match on the decompressed // message uses the original wildcards, so correctness will be maintained. 
@@ -192,72 +311,73 @@ std::optional GrepCore::process_raw_query( { query_tokens.emplace_back(search_string_for_sub_queries, begin_pos, end_pos, is_var); } - } else { - while (get_bounds_of_next_potential_var( - search_string_for_sub_queries, - begin_pos, - end_pos, - is_var, - lexer - )) - { - query_tokens.emplace_back(search_string_for_sub_queries, begin_pos, end_pos, is_var); + // Get pointers to all ambiguous tokens. Exclude tokens with wildcards in the middle since + // we fall-back to decompression + wildcard matching for those. + std::vector ambiguous_tokens; + for (auto& query_token : query_tokens) { + if (false == query_token.has_greedy_wildcard_in_middle() + && query_token.is_ambiguous_token()) + { + ambiguous_tokens.push_back(&query_token); + } } - } - // Get pointers to all ambiguous tokens. Exclude tokens with wildcards in the middle since we - // fall-back to decompression + wildcard matching for those. - std::vector ambiguous_tokens; - for (auto& query_token : query_tokens) { - if (!query_token.has_greedy_wildcard_in_middle() && query_token.is_ambiguous_token()) { - ambiguous_tokens.push_back(&query_token); - } - } + // Generate a sub-query for each combination of ambiguous tokens + // E.g., if there are two ambiguous tokens each of which could be a logtype or variable, we + // need to create: + // - (token1 as logtype) (token2 as logtype) + // - (token1 as logtype) (token2 as var) + // - (token1 as var) (token2 as logtype) + // - (token1 as var) (token2 as var) + std::string logtype; + bool type_of_one_token_changed = true; + while (type_of_one_token_changed) { + SubQuery sub_query; - // Generate a sub-query for each combination of ambiguous tokens - // E.g., if there are two ambiguous tokens each of which could be a logtype or variable, we need - // to create: - // - (token1 as logtype) (token2 as logtype) - // - (token1 as logtype) (token2 as var) - // - (token1 as var) (token2 as logtype) - // - (token1 as var) (token2 as var) - std::vector 
sub_queries; - std::string logtype; - bool type_of_one_token_changed = true; - while (type_of_one_token_changed) { - SubQuery sub_query; + // Compute logtypes and variables for query + auto matchability = generate_logtypes_and_vars_for_subquery( + logtype_dict, + var_dict, + search_string_for_sub_queries, + query_tokens, + ignore_case, + sub_query + ); + switch (matchability) { + case SubQueryMatchabilityResult::SupercedesAllSubQueries: + // Since other sub-queries will be superceded by this one, we can stop + // processing now + return Query{search_begin_ts, search_end_ts, ignore_case, search_string, {}}; + case SubQueryMatchabilityResult::MayMatch: + sub_queries.push_back(std::move(sub_query)); + break; + case SubQueryMatchabilityResult::WontMatch: + default: + // Do nothing + break; + } - // Compute logtypes and variables for query - auto matchability = generate_logtypes_and_vars_for_subquery( + // Update combination of ambiguous tokens + type_of_one_token_changed = false; + for (auto* ambiguous_token : ambiguous_tokens) { + if (ambiguous_token->change_to_next_possible_type()) { + type_of_one_token_changed = true; + break; + } + } + } + } else { + // TODO: Optimize such that interpretations are only generated once per schema. 
+ log_surgeon::wildcard_query_parser::Query const query{search_string}; + auto const interpretations{query.get_all_multi_token_interpretations(lexer)}; + auto const normalized_interpretations{normalize_interpretations(interpretations)}; + generate_schema_sub_queries( + normalized_interpretations, logtype_dict, var_dict, - search_string_for_sub_queries, - query_tokens, ignore_case, - sub_query + sub_queries ); - switch (matchability) { - case SubQueryMatchabilityResult::SupercedesAllSubQueries: - // Since other sub-queries will be superceded by this one, we can stop processing - // now - return Query{search_begin_ts, search_end_ts, ignore_case, search_string, {}}; - case SubQueryMatchabilityResult::MayMatch: - sub_queries.push_back(std::move(sub_query)); - break; - case SubQueryMatchabilityResult::WontMatch: - default: - // Do nothing - break; - } - - // Update combination of ambiguous tokens - type_of_one_token_changed = false; - for (auto* ambiguous_token : ambiguous_tokens) { - if (ambiguous_token->change_to_next_possible_type()) { - type_of_one_token_changed = true; - break; - } - } } if (sub_queries.empty()) { @@ -429,6 +549,160 @@ GrepCore::SubQueryMatchabilityResult GrepCore::generate_logtypes_and_vars_for_su return SubQueryMatchabilityResult::MayMatch; } + +template < + LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType +> +void GrepCore::generate_schema_sub_queries( + std::set const& interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + bool const ignore_case, + std::vector& sub_queries +) { + constexpr size_t cMaxEncodableWildcardVariables{16}; + for (auto const& interpretation : interpretations) { + auto const logtype{interpretation.get_logtype()}; + auto wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; + if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { + throw 
std::runtime_error("Too many encodable variables."); + } + uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; + for (uint64_t mask{0}; mask < num_combos; ++mask) { + std::vector mask_encoded_flags(logtype.size(), false); + for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { + mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; + } + + auto logtype_string{generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + )}; + + std::unordered_set logtype_entries; + logtype_dict.get_entries_matching_wildcard_string( + logtype_string, + ignore_case, + logtype_entries + ); + if (logtype_entries.empty()) { + continue; + } + + SubQuery sub_query; + bool has_vars{true}; + for (size_t i{0}; i < logtype.size(); ++i) { + auto const& token{logtype[i]}; + if (std::holds_alternative( + token + )) + { + bool is_mask_encoded{false}; + if (wildcard_encodable_positions.end() + != std::ranges::find( + wildcard_encodable_positions.begin(), + wildcard_encodable_positions.end(), + i + )) + { + is_mask_encoded = mask_encoded_flags[i]; + } + + has_vars = process_schema_var_token( + std::get(token), + var_dict, + ignore_case, + is_mask_encoded, + sub_query + ); + } + if (false == has_vars) { + break; + } + } + if (false == has_vars) { + continue; + } + + std::unordered_set possible_logtype_ids; + possible_logtype_ids.reserve(logtype_entries.size()); + for (auto const* entry : logtype_entries) { + possible_logtype_ids.emplace(entry->get_id()); + } + sub_query.set_possible_logtypes(possible_logtype_ids); + if (sub_queries.end() == std::ranges::find(sub_queries, sub_query)) { + sub_queries.push_back(std::move(sub_query)); + } + } + } +} + +template +auto GrepCore::process_schema_var_token( + log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, + VariableDictionaryReaderType const& var_dict, + bool const ignore_case, + bool const is_mask_encoded, + SubQuery& sub_query +) -> bool { 
+ auto const& raw_string{variable_token.get_query_substring()}; + auto const var_has_wildcard{variable_token.get_contains_wildcard()}; + auto const var_type{static_cast(variable_token.get_variable_type())}; + bool const is_int{log_surgeon::SymbolId::TokenInt == var_type}; + bool const is_float{log_surgeon::SymbolId::TokenFloat == var_type}; + + if (is_mask_encoded) { + sub_query.mark_wildcard_match_required(); + return true; + } + + if (var_has_wildcard) { + return EncodedVariableInterpreter::wildcard_search_dictionary_and_get_encoded_matches( + raw_string, + var_dict, + ignore_case, + sub_query + ); + } + + encoded_variable_t encoded_var{}; + if ((is_int + && EncodedVariableInterpreter::convert_string_to_representable_integer_var( + raw_string, + encoded_var + )) + || (is_float + && EncodedVariableInterpreter::convert_string_to_representable_float_var( + raw_string, + encoded_var + ))) + { + sub_query.add_non_dict_var(encoded_var); + return true; + } + + auto entries = var_dict.get_entry_matching_value(raw_string, ignore_case); + if (entries.empty()) { + return false; + } + if (1 == entries.size()) { + auto const entry_id{entries[0]->get_id()}; + sub_query.add_dict_var(EncodedVariableInterpreter::encode_var_dict_id(entry_id), entry_id); + return true; + } + std::unordered_set encoded_vars; + std::unordered_set var_dict_ids; + encoded_vars.reserve(entries.size()); + var_dict_ids.reserve(entries.size()); + for (auto const* entry : entries) { + encoded_vars.emplace(EncodedVariableInterpreter::encode_var_dict_id(entry->get_id())); + var_dict_ids.emplace(entry->get_id()); + } + sub_query.add_imprecise_dict_var(encoded_vars, var_dict_ids); + return true; +} } // namespace clp #endif // CLP_GREPCORE_HPP diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index f28989096b..51457b0508 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -25,6 +25,8 @@ class QueryVar { ); // Methods + auto 
operator==(QueryVar const& rhs) const -> bool = default; + /** * Checks if the given encoded variable matches this QueryVar * @param var @@ -75,6 +77,8 @@ class QueryVar { class SubQuery { public: // Methods + auto operator==(SubQuery const& rhs) const -> bool = default; + /** * Adds a precise non-dictionary variable to the subquery * @param precise_non_dict_var @@ -155,7 +159,7 @@ class SubQuery { std::unordered_set m_possible_logtypes; std::set m_ids_of_matching_segments; std::vector m_vars; - bool m_wildcard_match_required; + bool m_wildcard_match_required{false}; }; /** diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index fe9c443668..3b4733cb69 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -1,101 +1,1022 @@ +#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include #include +#include +#include "../src/clp/Defs.h" +#include "../src/clp/EncodedVariableInterpreter.hpp" #include "../src/clp/GrepCore.hpp" -#include "../src/clp/Utils.hpp" +#include "../src/clp/LogTypeDictionaryReaderReq.hpp" +#include "../src/clp/Query.hpp" +#include "../src/clp/string_utils/string_utils.hpp" +#include "../src/clp/VariableDictionaryReaderReq.hpp" +using clp::EncodedVariableInterpreter; using clp::GrepCore; -using clp::load_lexer_from_file; -using log_surgeon::DelimiterStringAST; +using clp::logtype_dictionary_id_t; +using clp::LogTypeDictionaryReaderReq; +using clp::string_utils::wildcard_match_unsafe_case_sensitive; +using clp::SubQuery; +using clp::variable_dictionary_id_t; +using clp::VariableDictionaryReaderReq; using log_surgeon::lexers::ByteLexer; -using log_surgeon::ParserAST; -using log_surgeon::SchemaAST; -using log_surgeon::SchemaParser; +using log_surgeon::Schema; using log_surgeon::SchemaVarAST; +using log_surgeon::SymbolId::TokenFloat; +using 
log_surgeon::SymbolId::TokenInt; +using log_surgeon::wildcard_query_parser::QueryInterpretation; +using log_surgeon::wildcard_query_parser::VariableQueryToken; +using std::pair; +using std::set; using std::string; +using std::string_view; +using std::tuple; +using std::unordered_map; +using std::unordered_set; +using std::variant; +using std::vector; + +using VarInfo = tuple>; + +constexpr uint32_t cIntId{static_cast(TokenInt)}; +constexpr uint32_t cFloatId{static_cast(TokenFloat)}; +constexpr uint32_t cHasNumId{111}; + +/** + * Helper to expose `GrepCore` functionality for unit-testing. + * + * This class provides static wrappers around `GrepCore` methods, allowing test + * code to access internal logic such as: + * - Finding wildcard encodable positions in a `QueryInterpretation`; + * - Generating logtype strings with wildcard masks; + * - Processing variable tokens with or without encoding; + * - Generating schema-based sub-queries. + * + * All methods forward directly to `GrepCore` and are intended for testing only. 
+ */ +class clp::GrepCoreTest { +public: + static auto normalize_interpretations(set const& interpretations) + -> set { + return GrepCore::normalize_interpretations(interpretations); + } + + template < + LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType + > + static auto generate_schema_sub_queries( + set const& interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + vector& sub_queries + ) -> void { + GrepCore::generate_schema_sub_queries( + interpretations, + logtype_dict, + var_dict, + false, + sub_queries + ); + } + + static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) + -> vector { + return GrepCore::get_wildcard_encodable_positions(interpretation); + } + + static auto generate_logtype_string( + QueryInterpretation const& interpretation, + vector const& wildcard_encodable_positions, + vector const& mask_encoded_flags + ) -> string { + return GrepCore::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + ); + } + + template + static auto process_token( + VariableQueryToken const& var_token, + VariableDictionaryReaderType const& var_dict, + SubQuery& sub_query + ) -> bool { + return GrepCore::process_schema_var_token(var_token, var_dict, false, false, sub_query); + } + + template + static auto process_encoded_token( + VariableQueryToken const& var_token, + VariableDictionaryReaderType const& var_dict, + SubQuery& sub_query + ) -> bool { + return GrepCore::process_schema_var_token(var_token, var_dict, false, true, sub_query); + } +}; namespace { -[[nodiscard]] auto get_tests_dir() -> std::filesystem::path; +/** + * Simple helper class representing a fake variable dictionary entry for unit tests. + * + * Adheres to `VariableDictionaryEntryReq`. 
+ */ +class FakeVarEntry { +public: + explicit FakeVarEntry(variable_dictionary_id_t const id, string value) + : m_id{id}, + m_value{std::move(value)} {} + + [[nodiscard]] auto get_id() const -> variable_dictionary_id_t { return m_id; } + + [[nodiscard]] auto get_value() const -> string const& { return m_value; } + +private: + variable_dictionary_id_t m_id; + string m_value; +}; + +/** + * Simple helper class representing a fake variable dictionary for unit tests. + * + * Provides a method for adding entries and adheres to `VariableDictionaryReaderReq`. + */ +class FakeVarDict { +public: + using Entry = FakeVarEntry; + using dictionary_id_t = variable_dictionary_id_t; + + auto add_entry(dictionary_id_t const id, string value) -> void { + m_storage.emplace(id, Entry{id, std::move(value)}); + } + + [[nodiscard]] auto get_value(dictionary_id_t const id) const -> string const& { + static string const cEmpty{}; + auto const it{m_storage.find(id)}; + if (m_storage.end() != it) { + return it->second.get_value(); + } + return cEmpty; + } + + auto get_entry_matching_value(string_view const val, [[maybe_unused]] bool ignore_case) const + -> vector { + vector results; + for (auto const& [id, entry] : m_storage) { + if (val == entry.get_value()) { + results.push_back(&entry); + } + } + return results; + } + + auto get_entries_matching_wildcard_string( + string_view const val, + [[maybe_unused]] bool ignore_case, + unordered_set& results + ) const -> void { + for (auto const& [id, entry] : m_storage) { + if (wildcard_match_unsafe_case_sensitive(entry.get_value(), val)) { + results.insert(&entry); + } + } + } + +private: + unordered_map m_storage; +}; + +/** + * Simple helper class representing a fake logtype dictionary entry for unit tests. + * + * Adheres to `LogtypeDictionaryEntryReq`. 
+ */ +class FakeLogTypeEntry { +public: + FakeLogTypeEntry(string value, logtype_dictionary_id_t const id) + : m_value(std::move(value)), + m_id(id) {} + + auto clear() -> void { m_value.clear(); } + + auto reserve_constant_length(size_t const length) -> void { m_value.reserve(length); } + + auto parse_next_var( + [[maybe_unused]] string_view msg, + [[maybe_unused]] size_t begin, + [[maybe_unused]] size_t end, + [[maybe_unused]] string_view& parsed + ) -> bool { + return false; + } + + auto add_constant(string_view const msg, size_t const begin_pos, size_t const length) -> void { + m_value.append(msg.substr(begin_pos, length)); + } + + auto add_int_var() -> void { EncodedVariableInterpreter::add_int_var(m_value); } + + auto add_float_var() -> void { EncodedVariableInterpreter::add_float_var(m_value); } + + auto add_dictionary_var() -> void { EncodedVariableInterpreter::add_dict_var(m_value); } + + [[nodiscard]] auto get_value() const -> string const& { return m_value; } + + [[nodiscard]] auto get_num_variables() const -> size_t { return 0; } + + [[nodiscard]] auto get_num_placeholders() const -> size_t { return 0; } + + [[nodiscard]] auto + get_placeholder_info([[maybe_unused]] size_t idx, [[maybe_unused]] auto& ref) const -> size_t { + return SIZE_MAX; + } + + [[nodiscard]] auto get_id() const -> logtype_dictionary_id_t { return m_id; } + +private: + string m_value; + logtype_dictionary_id_t m_id{0}; +}; + +/** + * Simple helper class representing a fake logtype dictionary for unit tests. + * + * Provides a method for adding entries and adheres to `LogtypeDictionaryReaderReq`. 
+ */ +class FakeLogTypeDict { +public: + using Entry = FakeLogTypeEntry; + using dictionary_id_t = logtype_dictionary_id_t; + + auto add_entry(string const& value, dictionary_id_t id) -> void { + m_storage.emplace_back(value, id); + } + + auto + get_entry_matching_value(string_view const logtype, [[maybe_unused]] bool ignore_case) const + -> vector { + vector results; + for (auto const& entry : m_storage) { + if (logtype == entry.get_value()) { + results.push_back(&entry); + } + } + return results; + } + + auto get_entries_matching_wildcard_string( + string_view const logtype, + [[maybe_unused]] bool ignore_case, + unordered_set& results + ) const -> void { + for (auto const& entry : m_storage) { + if (wildcard_match_unsafe_case_sensitive(entry.get_value(), logtype)) { + results.insert(&entry); + } + } + } -auto get_tests_dir() -> std::filesystem::path { - std::filesystem::path const current_file_path{__FILE__}; - return std::filesystem::canonical(current_file_path.parent_path()); +private: + vector m_storage; +}; + +/** + * @param entries Vector of (id, value) pairs to populate the variable + * dictionary. + * @return A `FakeVarDict` initialized with the given entries. + */ +auto make_var_dict(vector> const& entries) -> FakeVarDict; + +/** + * @param entries Vector of logtypes, where each logtype is represented by a vector of tokens. Each + * token is either a literal substring (`string_view`) or a variable placeholder (`char`). + * @return A `FakeLogtypeDict` initialized with the given entries. + */ +auto make_logtype_dict(vector>> const& entries) + -> FakeLogTypeDict; + +/** + * Constructs a `QueryInterpretation` from a vector of tokens. + * + * Each token is either: + * - a `string` representing a static substring, or + * - a `pair`, representing a variable placeholder and its value. + * + * This method automatically detects whether a variable token contains a + * wildcard (`*` or `?`). 
+ * + * @param tokens Vector of tokens to populate the `QueryInterpretation`. + * @return A `QueryInterpretation` populated with the given tokens. + */ +auto make_query_interpretation(vector>> const& tokens) + -> QueryInterpretation; + +/** + * Generates a logtype string from a vector of tokens. + * + * Each token is either: + * - a literal substring (`string_view`) to append directly, or + * - a variable placeholder (`char`) indicating the type of variable: + * - `i` -> integer variable; + * - `f` -> float variable; + * - `d` -> dictionary variable. + * + * The function forwards variable tokens to `EncodedVariableInterpreter` to + * append their encoded representations to the resulting string. + * + * @param tokens Vector of tokens to convert into a logtype string. + * @return A `string` representing the expected encoded logtype. + */ +auto generate_expected_logtype_string(vector> const& tokens) -> string; + +/** + * Checks that a `SubQuery` at a given index matches the expected properties. + * + * This method verifies: + * - Whether wildcard matching is required; + * - The number and type of variables; + * - For dictionary variables, the precise or possible dictionary IDs; + * - The set of possible logtype IDs. + * + * @param id Index of the sub-query to check in `sub_queries`. + * @param sub_queries Vector of `SubQuery` objects. + * @param wildcard_match_required Expected wildcard match requirement. + * @param vars_info Vector of tuples describing expected variable properties: (`is_dict_var`, + * `is_precise_var`, `var_dict_ids`). + * @param logtype_ids Expected set of possible logtype IDs. + */ +auto check_sub_query( + size_t id, + vector const& sub_queries, + bool wildcard_match_required, + vector const& vars_info, + unordered_set const& logtype_ids +) -> void; + +/** + * Initializes a `ByteLexer` with space as a delimiter and the given `schema_rules`. + * + * @param schema_rules A vector of strings, each string representing a schema rule. 
+ * @return The initialized `ByteLexer`. + */ +auto make_test_lexer(vector const& schema_rules) -> ByteLexer; + +auto make_var_dict(vector> const& entries) -> FakeVarDict { + FakeVarDict dict; + for (auto const& [id, val] : entries) { + dict.add_entry(id, val); + } + return dict; } -} // namespace -TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { - auto const test_schema_files_dir = get_tests_dir() / "test_schema_files"; - auto const search_schema_path = test_schema_files_dir / "search_schema.txt"; +auto make_logtype_dict(vector>> const& entries) + -> FakeLogTypeDict { + FakeLogTypeDict dict; + logtype_dictionary_id_t id{0}; + for (auto const& entry : entries) { + dict.add_entry(generate_expected_logtype_string(entry), id++); + } + return dict; +} + +auto make_query_interpretation(vector>> const& tokens) + -> QueryInterpretation { + QueryInterpretation interp; + for (auto const& token : tokens) { + if (holds_alternative(token)) { + interp.append_static_token(get(token)); + } else { + auto const& [symbol, value]{get>(token)}; + auto const contains_wildcard{value.find_first_of("*?") != string::npos}; + interp.append_variable_token(symbol, value, contains_wildcard); + } + } + return interp; +} + +auto generate_expected_logtype_string(vector> const& tokens) -> string { + string result; + for (auto const& token : tokens) { + if (holds_alternative(token)) { + result.append(get(token)); + } else { + switch (get(token)) { + case 'i': + EncodedVariableInterpreter::add_int_var(result); + break; + case 'f': + EncodedVariableInterpreter::add_float_var(result); + break; + case 'd': + EncodedVariableInterpreter::add_dict_var(result); + break; + default: + break; + } + } + } + return result; +} + +auto check_sub_query( + size_t id, + vector const& sub_queries, + bool const wildcard_match_required, + vector const& vars_info, + unordered_set const& logtype_ids +) -> void { + CAPTURE(id); + auto const& sub_query{sub_queries[id]}; + + 
REQUIRE(wildcard_match_required == sub_query.wildcard_match_required()); + REQUIRE(vars_info.size() == sub_query.get_num_possible_vars()); + + for (size_t i{0}; i < vars_info.size(); ++i) { + auto const& [is_dict_var, is_precise_var, var_dict_ids]{vars_info[i]}; + auto const& var{sub_query.get_vars()[i]}; + REQUIRE(is_dict_var == var.is_dict_var()); + REQUIRE(is_precise_var == var.is_precise_var()); + if (is_dict_var) { + if (is_precise_var) { + REQUIRE(1 == var_dict_ids.size()); + REQUIRE(var_dict_ids.contains(var.get_var_dict_id())); + } else { + REQUIRE(var_dict_ids == var.get_possible_var_dict_ids()); + } + } + } + + REQUIRE(logtype_ids == sub_query.get_possible_logtypes()); +} + +auto make_test_lexer(vector const& schema_rules) -> ByteLexer { ByteLexer lexer; - load_lexer_from_file(search_schema_path.string(), lexer); + lexer.m_symbol_id["int"] = cIntId; + lexer.m_symbol_id["float"] = cFloatId; + lexer.m_symbol_id["hasNumber"] = cHasNumId; + lexer.m_id_symbol[cIntId] = "int"; + lexer.m_id_symbol[cFloatId] = "float"; + lexer.m_id_symbol[cHasNumId] = "hasNumber"; + lexer.set_delimiters({' '}); + + Schema schema; + for (auto const& schema_rule : schema_rules) { + schema.add_variable(schema_rule, -1); + } + + auto const schema_ast = schema.release_schema_ast_ptr(); + REQUIRE(nullptr != schema_ast); + REQUIRE(schema_rules.size() == schema_ast->m_schema_vars.size()); + for (size_t i{0}; i < schema_ast->m_schema_vars.size(); ++i) { + REQUIRE(nullptr != schema_ast->m_schema_vars[i]); + auto* capture_rule_ast{dynamic_cast(schema_ast->m_schema_vars[i].get())}; + REQUIRE(nullptr != capture_rule_ast); + lexer.add_rule( + lexer.m_symbol_id[capture_rule_ast->m_name], + std::move(capture_rule_ast->m_regex_ptr) + ); + } + + lexer.generate(); + return lexer; +} +} // namespace + +// Tests: `get_wildcard_encodable_positions` +TEST_CASE("get_wildcard_encodable_positions_for_empty_interpretation", "[dfa_search]") { + QueryInterpretation const interpretation{}; + + auto const 
positions{clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation)}; + REQUIRE(positions.empty()); +} + +TEST_CASE("get_wildcard_encodable_positions_for_multi_variable_interpretation", "[dfa_search]") { + auto const interpretation{make_query_interpretation( + {"text", + pair{cIntId, "100"}, + pair{cFloatId, "32.2"}, + pair{cIntId, "10?"}, + pair{cFloatId, "3.14*"}, + pair{cHasNumId, "3.14*"}} + )}; + + auto const positions{clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation)}; + REQUIRE(2 == positions.size()); + REQUIRE(3 == positions[0]); + REQUIRE(4 == positions[1]); +} + +// Tests: `generate_logtype_string` +TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { + QueryInterpretation const interpretation{}; + + auto const wildcard_encodable_positions{ + clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) + }; + + REQUIRE(wildcard_encodable_positions.empty()); + auto const logtype_string{clp::GrepCoreTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + {false} + )}; + REQUIRE(logtype_string.empty()); +} + +TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_search]") { + string expected_logtype_string; + EncodedVariableInterpreter::add_int_var(expected_logtype_string); + QueryInterpretation interpretation{}; + interpretation.append_variable_token(static_cast(TokenInt), "100", false); + + auto const wildcard_encodable_positions{ + clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) + }; + + REQUIRE(wildcard_encodable_positions.empty()); + auto const logtype_string{clp::GrepCoreTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + {false} + )}; + REQUIRE(expected_logtype_string == logtype_string); +} + +TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_search]") { + unordered_set const expected_logtype_strings{ + generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 
'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'}) + }; + + auto const interpretation{make_query_interpretation( + {"text", + pair{cIntId, "100"}, + pair{cFloatId, "32.2"}, + pair{cIntId, "10?"}, + pair{cFloatId, "3.14*"}, + pair{cHasNumId, "3.14*"}} + )}; + + auto const wildcard_encodable_positions{ + clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) + }; + + uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; + REQUIRE(num_combos == 4); + unordered_set logtype_strings; + for (uint64_t mask{0}; mask < num_combos; ++mask) { + vector mask_encoded_flags(interpretation.get_logtype().size(), false); + for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { + mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; + } + logtype_strings.insert( + clp::GrepCoreTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + ) + ); + } + REQUIRE(expected_logtype_strings == logtype_strings); +} + +// Tests: `process_schema_var_token` +TEST_CASE("process_schema_empty_token ", "[dfa_search]") { + FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; + + SubQuery sub_query; + VariableQueryToken const empty_int_token{cIntId, "", false}; + REQUIRE(false == clp::GrepCoreTest::process_token(empty_int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); +} + +TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { + FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; + + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "200", false}; + REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == 
sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(var.get_possible_var_dict_ids().empty()); +} + +TEST_CASE("process_schema_int_token ", "[dfa_search]") { + FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; + + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "100", false}; + REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(var.get_possible_var_dict_ids().empty()); +} + +TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { + FakeVarDict const var_dict{make_var_dict({pair{0, "10a0"}, pair{1, "10b0"}})}; + + SECTION("interpret_as_int") { + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "10?0", true}; + REQUIRE(clp::GrepCoreTest::process_encoded_token(int_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } + + SECTION("interpret_as_float") { + SubQuery sub_query; + VariableQueryToken const float_token{cFloatId, "10?0", true}; + REQUIRE(clp::GrepCoreTest::process_encoded_token(float_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } + + SECTION("interpret_as_precise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "10a?", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + 
REQUIRE(0 == var.get_var_dict_id()); + REQUIRE(var.get_possible_var_dict_ids().empty()); + } + + SECTION("interpret_as_imprecise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "10?0", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(2 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } +} + +// NOTE: CLP currently treats all non-encoded variables as the same, so the below test demonstrates +// this. In the future if CLP is more sophisticated, the two sections behave differently. +TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search]") { + size_t id{0}; + FakeVarDict const var_dict{make_var_dict( + {pair{id++, "100000000000000000000000010"}, + pair{id++, "100000000000000000000000020"}, + pair{id++, "100000000000000000000000030"}, + pair{id++, "1000000000000000000000000.0"}, + pair{id++, "1000000000000000000000000a0"}} + )}; + + SECTION("interpret_as_int") { + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "1000000000000000000000000?0", true}; + REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(5 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_float") { + SubQuery sub_query; 
+ VariableQueryToken const float_token{cFloatId, "1000000000000000000000000?0", true}; + REQUIRE(clp::GrepCoreTest::process_token(float_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(5 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "1000000000000000000000000?0", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(5 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } +} + +TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { + size_t id{0}; + FakeVarDict const var_dict{make_var_dict( + {pair{id++, "10a0"}, + pair{id++, "10b0"}, + pair{id++, "100000000000000000000000010"}, + pair{id++, "100000000000000000000000020"}, + pair{id++, "100000000000000000000000030"}, + pair{id++, "1000000000000000000000000.0"}, + pair{id++, "1000000000000000000000000a0"}} + )}; + + SECTION("interpret_as_non_encoded_int") { + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& 
var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(7 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_non_encoded_float") { + SubQuery sub_query; + VariableQueryToken const float_token{cFloatId, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_token(float_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(7 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_non_encoded_imprecise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(7 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_non_encoded_precise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "10b*", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + 
REQUIRE(var.is_precise_var()); + REQUIRE(1 == var.get_var_dict_id()); + REQUIRE(var.get_possible_var_dict_ids().empty()); + } + + SECTION("interpret_as_encoded_int") { + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_encoded_token(int_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } + + SECTION("interpret_as_encoded_float") { + SubQuery sub_query; + VariableQueryToken const float_token{cFloatId, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_encoded_token(float_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } +} + +// Tests: `generate_schema_sub_queries` +TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { + FakeVarDict const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}, pair{2, "10b"}})}; + FakeLogTypeDict const logtype_dict{make_logtype_dict( + {{"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'}} + )}; + + using V = pair; + vector>> raw_interpretations{ + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId, " 3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId, " 3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14*"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId, " 3.14*"}}, + {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}}, + {"text ", V{cIntId, "100"}, " 10? 
3.14*"} + }; + set interpretations; + for (auto const& raw_interpretation : raw_interpretations) { + interpretations.insert(make_query_interpretation(raw_interpretation)); + } + + vector sub_queries; + clp::GrepCoreTest::generate_schema_sub_queries( + interpretations, + logtype_dict, + var_dict, + sub_queries + ); + + VarInfo const wild_int{false, true, {}}; + VarInfo const wild_has_num{true, false, {1LL, 2LL}}; + REQUIRE(4 == sub_queries.size()); + size_t i{0}; + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); +} + +TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search]") { + FakeVarDict const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; + FakeLogTypeDict const logtype_dict{make_logtype_dict( + {{"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'}} + )}; + + using V = pair; + vector>> raw_interpretations{ + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId, " 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14**"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId, " 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14**"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId, " 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " 10? 
3.14**"} + }; + set interpretations; + for (auto const& raw_interpretation : raw_interpretations) { + interpretations.insert(make_query_interpretation(raw_interpretation)); + } + auto const normalized_interpretations{ + clp::GrepCoreTest::normalize_interpretations(interpretations) + }; + + vector sub_queries; + clp::GrepCoreTest::generate_schema_sub_queries( + normalized_interpretations, + logtype_dict, + var_dict, + sub_queries + ); + + VarInfo const wild_int{false, true, {}}; + VarInfo const wild_has_num{true, true, {1LL}}; + REQUIRE(4 == sub_queries.size()); + size_t i{0}; + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); +} + +// Tests: `process_raw_query` +TEST_CASE("process_raw_query", "[dfa_search]") { + auto lexer{make_test_lexer( + {{R"(int:(\d+))"}, {R"(float:(\d+\.\d+))"}, {R"(hasNumber:[^ $]*\d+[^ $]*)"}} + )}; + + FakeVarDict const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; + FakeLogTypeDict const logtype_dict{make_logtype_dict( + {{"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'}} + )}; + + string const raw_query{"text 100 10? 
3.14*"}; + + auto const query{ + GrepCore::process_raw_query(logtype_dict, var_dict, raw_query, 0, 0, true, lexer, false) + }; + + REQUIRE(query.has_value()); + auto const& sub_queries{query.value().get_sub_queries()}; + + VarInfo const wild_int{false, true, {}}; + VarInfo const wild_has_num{true, true, {1LL}}; + REQUIRE(4 == sub_queries.size()); + size_t i{0}; + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); +} + +// Tests: `get_bounds_of_next_potential_var` +TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { string str; - size_t begin_pos; - size_t end_pos; - bool is_var; + size_t begin_pos{}; + size_t end_pos{}; + bool is_var{}; // m_end_pos past the end of the string str = ""; begin_pos = string::npos; end_pos = string::npos; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); // Empty string str = ""; begin_pos = 0; end_pos = 0; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); // No tokens str = "="; begin_pos = 0; end_pos = 0; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); // No wildcards str = " MAC address 95: ad ff 95 24 0d ff =-abc- "; begin_pos = 0; end_pos = 0; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, 
end_pos, is_var) == true); REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("ad" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("24" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("0d" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); @@ -103,8 +1024,7 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var REQUIRE("-abc-" == str.substr(begin_pos, end_pos - 
begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); REQUIRE(str.length() == begin_pos); // With wildcards @@ -112,33 +1032,25 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var begin_pos = 0; end_pos = 0; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); - REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1\\*x"); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); + REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1"); REQUIRE(is_var == true); - // REQUIRE(is_var == true); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "abc*123"); - REQUIRE(is_var == false); - // REQUIRE(is_var == true); + REQUIRE(is_var == true); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1.2"); REQUIRE(is_var == true); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); - REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "+394/-"); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); + REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "+394"); REQUIRE(is_var == true); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) 
== true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "-*abc-"); REQUIRE(is_var == false); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); }