diff --git a/components/core/config/schemas.txt b/components/core/config/schemas.txt
index e0b777859d..9c09fb5d45 100644
--- a/components/core/config/schemas.txt
+++ b/components/core/config/schemas.txt
@@ -16,4 +16,4 @@ float:\-{0,1}[0-9]+\.[0-9]+
 // Dictionary variables
 hex:[a-fA-F]+
 hasNumber:.*\d.*
-equals:.*=.*[a-zA-Z0-9].*
+equals:.*=(?<val>.*[a-zA-Z0-9].*)
diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp
index f52cb87cdf..f8449bc4ca 100644
--- a/components/core/src/clp/Utils.cpp
+++ b/components/core/src/clp/Utils.cpp
@@ -8,6 +8,7 @@
 #include <iostream>
 #include <memory>
 #include <set>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
@@ -188,6 +189,18 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::ByteLexer
     for (std::unique_ptr<ParserAST> const& parser_ast : schema_ast->m_schema_vars) {
         auto* rule = dynamic_cast<SchemaVarAST*>(parser_ast.get());
 
+        // Currently, we only support at most a single capture group in each variable. If a capture
+        // group is present, its match will be treated as the variable rather than the full match.
+        auto const num_captures = rule->m_regex_ptr->get_subtree_positive_captures().size();
+        if (1 < num_captures) {
+            throw std::runtime_error(
+                    schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
+                    + ": error: the schema rule '" + rule->m_name
+                    + "' has a regex pattern containing > 1 capture groups (found "
+                    + std::to_string(num_captures) + ").\n"
+            );
+        }
+
         if ("timestamp" == rule->m_name) {
             continue;
         }
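The guard added to `load_lexer_from_file` above only needs the capture count that log-surgeon reports for each rule's regex AST. Below is a minimal, self-contained sketch of the same validation; the `RuleInfo` struct and `validate_capture_count` name are hypothetical stand-ins for the fields the diff reads off `SchemaVarAST`, not real log-surgeon types:

```cpp
#include <cstddef>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for the data the guard reads off a SchemaVarAST.
struct RuleInfo {
    std::string m_name;
    std::size_t m_line_num;      // zero-based, hence the +1 when printing
    std::size_t m_num_captures;  // i.e., get_subtree_positive_captures().size()
};

// Mirrors the new check: schema rules may contain at most one capture group.
void validate_capture_count(std::string const& schema_file_path, RuleInfo const& rule) {
    if (1 < rule.m_num_captures) {
        throw std::runtime_error(
                schema_file_path + ":" + std::to_string(rule.m_line_num + 1)
                + ": error: the schema rule '" + rule.m_name
                + "' has a regex pattern containing > 1 capture groups (found "
                + std::to_string(rule.m_num_captures) + ").\n"
        );
    }
}
```

The zero-based `m_line_num` plus one is what produces the `:3:` position asserted by the new multiple-capture-groups test further down.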
diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp
index a5fe8a29f9..7d125847ea 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp
@@ -2,14 +2,17 @@
 
 #include <sys/stat.h>
 
+#include <cstddef>
 #include <cstdint>
-#include <list>
-#include <memory>
+#include <stdexcept>
+#include <string>
+#include <string_view>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 
 #include <log_surgeon/Constants.hpp>
+#include <log_surgeon/Lexer.hpp>
 #include <log_surgeon/LogEvent.hpp>
 #include <log_surgeon/LogParser.hpp>
 #include <log_surgeon/SchemaParser.hpp>
@@ -23,11 +26,7 @@
 using clp::ir::eight_byte_encoded_variable_t;
 using clp::ir::four_byte_encoded_variable_t;
-using log_surgeon::LogEventView;
-using std::list;
-using std::make_unique;
 using std::string;
-using std::unordered_set;
 using std::vector;
 
 namespace clp::streaming_archive::writer {
 
@@ -315,13 +314,13 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_uncompressed_bytes
     update_segment_indices(logtype_id, var_ids);
 }
 
-void Archive::write_msg_using_schema(LogEventView const& log_view) {
+void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) {
     epochtime_t timestamp = 0;
     TimestampPattern* timestamp_pattern = nullptr;
     auto const& log_output_buffer = log_view.get_log_output_buffer();
     if (log_output_buffer->has_timestamp()) {
-        size_t start;
-        size_t end;
+        size_t start{};
+        size_t end{};
         timestamp_pattern = (TimestampPattern*)TimestampPattern::search_known_ts_patterns(
                 log_output_buffer->get_mutable_token(0).to_string(),
                 timestamp,
@@ -360,7 +359,7 @@
     if (timestamp_pattern == nullptr) {
         start_pos = log_output_buffer->get_token(1).m_start_pos;
     }
-    uint32_t end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
+    uint32_t const end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
     if (start_pos <= end_pos) {
         num_uncompressed_bytes = end_pos - start_pos;
     } else {
@@ -369,7 +368,7 @@
     for (uint32_t i = 1; i < log_output_buffer->pos(); i++) {
         log_surgeon::Token& token = log_output_buffer->get_mutable_token(i);
-        int token_type = token.m_type_ids_ptr->at(0);
+        auto const token_type{token.m_type_ids_ptr->at(0)};
         if (log_output_buffer->has_delimiters() && (timestamp_pattern != nullptr || i > 1)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenNewline))
         {
@@ -388,13 +387,13 @@
                 break;
             }
             case static_cast<int>(log_surgeon::SymbolId::TokenInt): {
-                encoded_variable_t encoded_var;
+                encoded_variable_t encoded_var{};
                 if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
                             token.to_string(),
                             encoded_var
                     ))
                 {
-                    variable_dictionary_id_t id;
+                    variable_dictionary_id_t id{};
                     m_var_dict.add_entry(token.to_string(), id);
                     encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                     m_logtype_dict_entry.add_dictionary_var();
@@ -405,13 +404,13 @@
                 break;
             }
             case static_cast<int>(log_surgeon::SymbolId::TokenFloat): {
-                encoded_variable_t encoded_var;
+                encoded_variable_t encoded_var{};
                 if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
                             token.to_string(),
                             encoded_var
                     ))
                 {
-                    variable_dictionary_id_t id;
+                    variable_dictionary_id_t id{};
                     m_var_dict.add_entry(token.to_string(), id);
                     encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                     m_logtype_dict_entry.add_dictionary_var();
@@ -422,21 +421,63 @@
                 break;
             }
             default: {
-                // Variable string looks like a dictionary variable, so encode it as so
-                encoded_variable_t encoded_var;
-                variable_dictionary_id_t id;
-                m_var_dict.add_entry(token.to_string(), id);
-                encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
-                m_var_ids.push_back(id);
+                // If there are no capture groups, the entire variable token is stored as a variable.
+                // If the variable token contains capture groups, we break the token up by storing
+                // each capture as a variable and any substrings surrounding the capture as part of
+                // the logtype. If a capture has repetition, we store all instances as a single
+                // variable.
+
+                auto const& lexer{log_view.get_log_parser().m_lexer};
+                auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
+                if (false == capture_ids.has_value()) {
+                    variable_dictionary_id_t id{};
+                    m_var_dict.add_entry(token.to_string(), id);
+                    m_var_ids.push_back(id);
+                    m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
+                    m_logtype_dict_entry.add_dictionary_var();
+
+                    break;
+                }
+
+                auto const register_ids{
+                        lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))
+                };
+                if (false == register_ids.has_value()) {
+                    throw(std::runtime_error(
+                            "No register IDs found for variable's capture group. Full token: "
+                            + token.to_string()
+                    ));
+                }
+
+                auto const [start_reg_id, end_reg_id]{register_ids.value()};
+                auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()};
+                auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()};
+                auto token_view{token};
+                auto const token_end_pos{token_view.m_end_pos};
+
+                token_view.m_end_pos = capture_start;
+                auto const text_before_capture{token_view.to_string_view()};
+                m_logtype_dict_entry
+                        .add_constant(text_before_capture, 0, text_before_capture.size());
+
+                token_view.m_start_pos = capture_start;
+                token_view.m_end_pos = capture_end;
+                variable_dictionary_id_t id{};
+                m_var_dict.add_entry(token_view.to_string_view(), id);
+                m_var_ids.push_back(id);
+                m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
                 m_logtype_dict_entry.add_dictionary_var();
-                m_encoded_vars.push_back(encoded_var);
+
+                token_view.m_start_pos = capture_end;
+                token_view.m_end_pos = token_end_pos;
+                auto const text_after_capture{token_view.to_string_view()};
+                m_logtype_dict_entry.add_constant(text_after_capture, 0, text_after_capture.size());
+
                 break;
             }
         }
     }
 
-    if (!m_logtype_dict_entry.get_value().empty()) {
-        logtype_dictionary_id_t logtype_id;
+    if (false == m_logtype_dict_entry.get_value().empty()) {
+        logtype_dictionary_id_t logtype_id{};
         m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id);
         m_file->write_encoded_msg(
                 timestamp,
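The default case above is the heart of the change, so a distilled sketch may help. Assuming exactly one capture per rule (which the new Utils.cpp guard enforces), the start and end registers give the capture's span inside the token; the text before and after the span stays in the logtype as constants, and only the span itself becomes a dictionary variable. Taking `.back()` of the reversed start positions and `.front()` of the reversed end positions is what collapses a repeated capture into one span. The `Splits` type and function below are illustrative only, and use token-relative offsets rather than the absolute buffer positions Archive.cpp works with:

```cpp
#include <cstddef>
#include <string_view>

// Illustrative: where each piece of a token with one capture group ends up.
struct Splits {
    std::string_view logtype_prefix;  // -> m_logtype_dict_entry.add_constant()
    std::string_view variable;        // -> m_var_dict.add_entry()
    std::string_view logtype_suffix;  // -> m_logtype_dict_entry.add_constant()
};

// capture_start/capture_end play the role of the positions read back from the
// capture's start and end registers in the diff.
constexpr auto split_token_on_capture(
        std::string_view token,
        std::size_t capture_start,
        std::size_t capture_end
) -> Splits {
    return Splits{
            token.substr(0, capture_start),
            token.substr(capture_start, capture_end - capture_start),
            token.substr(capture_end)
    };
}

// For the schema rule `capture:text(?<group>var)text`, the token "textvartext"
// splits into logtype "text" + <var placeholder> + "text" and variable "var".
static_assert(split_token_on_capture("textvartext", 4, 7).variable == "var");
```

If a rule has no capture group, the code keeps the old behavior and the whole token goes into the variable dictionary.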
diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp
index 8232fd16f7..a12c596c13 100644
--- a/components/core/tests/test-ParserWithUserSchema.cpp
+++ b/components/core/tests/test-ParserWithUserSchema.cpp
@@ -202,3 +202,26 @@ TEST_CASE("Test lexer", "[Search]") {
         token = opt_token.value();
     }
 }
+
+TEST_CASE("Test schema with single capture group", "[load_lexer]") {
+    std::string const schema_path{"../tests/test_schema_files/single_capture_group.txt"};
+    ByteLexer lexer;
+    load_lexer_from_file(schema_path, lexer);
+
+    auto const rule_id{lexer.m_symbol_id.at("capture")};
+    auto const capture_ids{lexer.get_capture_ids_from_rule_id(rule_id)};
+    REQUIRE(capture_ids.has_value());
+    REQUIRE(1 == capture_ids->size());
+    REQUIRE("group" == lexer.m_id_symbol.at(capture_ids->at(0)));
+}
+
+TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") {
+    std::string const schema_path{"../tests/test_schema_files/multiple_capture_groups.txt"};
+    ByteLexer lexer;
+    REQUIRE_THROWS_WITH(
+            load_lexer_from_file(schema_path, lexer),
+            schema_path
+                    + ":3: error: the schema rule 'multicapture' has a regex pattern containing > "
+                      "1 capture groups (found 2).\n"
+    );
+}
diff --git a/components/core/tests/test_schema_files/multiple_capture_groups.txt b/components/core/tests/test_schema_files/multiple_capture_groups.txt
new file mode 100644
index 0000000000..09998bc19d
--- /dev/null
+++ b/components/core/tests/test_schema_files/multiple_capture_groups.txt
@@ -0,0 +1,3 @@
+delimiters: \r\n
+
+multicapture:text(?<group0>var0)text(?<group1>var1)text
diff --git a/components/core/tests/test_schema_files/single_capture_group.txt b/components/core/tests/test_schema_files/single_capture_group.txt
new file mode 100644
index 0000000000..7c4bf17688
--- /dev/null
+++ b/components/core/tests/test_schema_files/single_capture_group.txt
@@ -0,0 +1,3 @@
+delimiters: \r\n
+
+capture:text(?<group>var)text
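End to end, the schemas.txt change is the user-visible payoff. Previously, `equals:.*=.*[a-zA-Z0-9].*` stored the whole matching token (e.g., `user=alice`) as one dictionary variable; with the capture (named `val` here), only the captured value becomes a variable while the `user=` prefix stays in the logtype, so messages that differ only in the value can share a logtype. A hypothetical walkthrough of that decomposition, with the split point hard-coded for this example rather than taken from the lexer's registers:

```cpp
#include <cassert>
#include <string_view>

// Illustrative only: how a token matching the updated `equals` rule is
// expected to be decomposed during compression.
int main() {
    std::string_view constexpr token{"user=alice"};

    // With `equals:.*=(?<val>.*[a-zA-Z0-9].*)`, the capture begins right
    // after the '='; everything before it remains in the logtype.
    auto const capture_start{token.find('=') + 1};

    std::string_view const logtype_prefix{token.substr(0, capture_start)};
    std::string_view const variable{token.substr(capture_start)};

    assert(logtype_prefix == "user=");  // kept as a logtype constant
    assert(variable == "alice");        // stored in the variable dictionary
    return 0;
}
```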