Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
f8d06b2
Add dynamic programming to CLP.
SharafMohamed Sep 17, 2025
add11a2
Format.
SharafMohamed Sep 19, 2025
c82dbab
Merge branch 'main' into dfa-search-using-ls-port
SharafMohamed Sep 22, 2025
70f357b
Add unit-tests.
SharafMohamed Sep 26, 2025
9d3e152
Format.
SharafMohamed Sep 26, 2025
b587085
Add missing header; Flip cast for comparison to SymbolID over uint32_t.
SharafMohamed Sep 26, 2025
9d5dd94
Fix typos.
SharafMohamed Sep 26, 2025
c5bd43b
Remove magic number.
SharafMohamed Sep 26, 2025
d7cfb0d
Add missing header; Fix typo; Fix if statement.
SharafMohamed Sep 26, 2025
27ebc0f
Remove thread-unsafe statics.
SharafMohamed Sep 26, 2025
7a2f74e
Use uint64_t with bit shift operator.
SharafMohamed Sep 26, 2025
9848096
Switch to default initialization for all false map.
SharafMohamed Sep 26, 2025
3b4572b
Fix bug in string_utils and add to test-string_utils; Refactor unit-t…
SharafMohamed Oct 1, 2025
b633980
Format.
SharafMohamed Oct 1, 2025
62f23b0
Add flags to hide test helper in production.
SharafMohamed Oct 1, 2025
8b093a0
Cast to SymbolId instead of uint32_t.
SharafMohamed Oct 1, 2025
49dc925
Add missing include.
SharafMohamed Oct 1, 2025
7b73754
Avoid multiple get_logtype() calls; Reserve size.
SharafMohamed Oct 1, 2025
c8f9046
Remove more get_logtype() calls and make tokens be references into th…
SharafMohamed Oct 1, 2025
9eba2d5
Switch from contains to find to avoid double lookup.
SharafMohamed Oct 1, 2025
4115786
Fix typos.
SharafMohamed Oct 1, 2025
8aa356a
Reduce mask size to 16 bits; Update documentation to match.
SharafMohamed Oct 1, 2025
e5afe04
Use proper type for bit mask.
SharafMohamed Oct 1, 2025
7352fd2
Define cross-test constants in a single place; Make sure constants ar…
SharafMohamed Oct 1, 2025
80354f3
Reserve var dict size.
SharafMohamed Oct 1, 2025
1680f33
Cache logtype and use references into it.
SharafMohamed Oct 1, 2025
5bc40ff
Replace contain with find to avoid double lookup.
SharafMohamed Oct 1, 2025
decca7e
Reserve possible_logtype_ids.
SharafMohamed Oct 1, 2025
26c8e49
Use correct types for bit mask.
SharafMohamed Oct 1, 2025
0908106
Use constants in place of magic numbers.
SharafMohamed Oct 1, 2025
42d7a1c
Add parentheses for bit operation clarity.
SharafMohamed Oct 1, 2025
fae5913
Add docstrings for unit-test helpers.
SharafMohamed Oct 1, 2025
2bdf381
Fix some unit-tests.
SharafMohamed Oct 1, 2025
764973c
Merge branch 'main' into dfa-search-using-ls-port
SharafMohamed Oct 1, 2025
8996cfb
Format.
SharafMohamed Oct 1, 2025
7608cab
Remove consecutive wildcards from interpretation tokens; Undo changes…
SharafMohamed Oct 3, 2025
a130b96
Format.
SharafMohamed Oct 3, 2025
ae93a4e
Merge branch 'main' into dfa-search-using-ls-port
SharafMohamed Oct 3, 2025
f2f01d6
Add missing headers.
SharafMohamed Oct 3, 2025
0eed0db
Improve docstring.
SharafMohamed Oct 3, 2025
e2c55a7
Use find over contains to avoid double lookup.
SharafMohamed Oct 3, 2025
1cbbefd
Fix type in test to use variable_dictionary_id_t.
SharafMohamed Oct 3, 2025
f3ab0a5
Update docstring for consistency.
SharafMohamed Oct 3, 2025
ddf3e23
Fix macos ULL error.
SharafMohamed Oct 3, 2025
d629ae9
Replace map with bool vector.
SharafMohamed Oct 3, 2025
9e3e9d9
Fix type mismatch.
SharafMohamed Oct 3, 2025
06ab6a6
Add missing header; Remove unused header.
SharafMohamed Oct 3, 2025
ad951ae
Add missing header.
SharafMohamed Oct 3, 2025
f3d27b7
Format.
SharafMohamed Oct 3, 2025
76ecb1d
Merge branch 'main' into dfa-search-using-ls-port
SharafMohamed Oct 3, 2025
9920bc9
Remove duplicate sub queries.
SharafMohamed Oct 4, 2025
e7dcc43
Format.
SharafMohamed Oct 4, 2025
ed13886
Rename to rhs.
SharafMohamed Oct 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -774,4 +774,7 @@ if(CLP_BUILD_TESTING)
target_compile_features(unitTest
PRIVATE cxx_std_20
)
target_compile_definitions(unitTest
PRIVATE CLP_BUILD_TESTING
)
endif()
243 changes: 127 additions & 116 deletions components/core/src/clp/GrepCore.cpp
Original file line number Diff line number Diff line change
@@ -1,35 +1,36 @@
#include "GrepCore.hpp"

#include <algorithm>
#include <cstddef>
#include <set>
#include <string>
#include <string_view>
#include <unordered_map>
#include <variant>
#include <vector>

#include <log_surgeon/Constants.hpp>
#include <string_utils/string_utils.hpp>

#include "EncodedVariableInterpreter.hpp"
#include "ir/parsing.hpp"
#include "LogSurgeonReader.hpp"
#include "QueryToken.hpp"
#include "StringReader.hpp"

using clp::ir::is_delim;
using clp::string_utils::is_alphabet;
using clp::string_utils::is_wildcard;
using log_surgeon::SymbolId::TokenFloat;
using log_surgeon::SymbolId::TokenInt;
using log_surgeon::wildcard_query_parser::QueryInterpretation;
using log_surgeon::wildcard_query_parser::StaticQueryToken;
using log_surgeon::wildcard_query_parser::VariableQueryToken;
using std::holds_alternative;
using std::set;
using std::string;
using std::unordered_map;
using std::vector;

namespace clp {
namespace {
/**
* Wraps the tokens returned from the log_surgeon lexer, and stores the variable ids of the tokens
* in a search query in a set. This allows for optimized search performance.
*
* The class adds no behaviour of its own: it derives publicly from `log_surgeon::Token` and only
* contributes the extra set below.
*/
class SearchToken : public log_surgeon::Token {
public:
// Type ids collected for this token. Public so that code holding a `SearchToken` can fill it in
// directly.
std::set<int> m_type_ids_set;
};
} // namespace

bool GrepCore::get_bounds_of_next_potential_var(
string const& value,
size_t& begin_pos,
Expand Down Expand Up @@ -150,124 +151,134 @@ bool GrepCore::get_bounds_of_next_potential_var(
return (value_length != begin_pos);
}

bool GrepCore::get_bounds_of_next_potential_var(
string const& value,
size_t& begin_pos,
size_t& end_pos,
bool& is_var,
log_surgeon::lexers::ByteLexer& lexer
) {
size_t const value_length = value.length();
if (end_pos >= value_length) {
return false;
/**
 * Rebuilds every interpretation with each run of consecutive `*` characters inside a token's query
 * substring collapsed into a single `*`. Token order, variable types, and the contains-wildcard
 * flags are carried over unchanged.
 *
 * The result is a set, so interpretations that compare equal after normalization are stored once.
 *
 * NOTE(review): a `*` is collapsed based only on the previously kept character; a preceding `\` is
 * not considered. Confirm that query substrings cannot contain an escaped `\*` followed by `*`.
 *
 * @param interpretations The interpretations to normalize.
 * @return The set of normalized interpretations.
 */
auto GrepCore::normalize_interpretations(set<QueryInterpretation> const& interpretations)
        -> set<QueryInterpretation> {
    // Copies `raw`, keeping only the first `*` of every run of consecutive `*` characters.
    auto const collapse_star_runs = [](auto const& raw) -> string {
        string collapsed;
        collapsed.reserve(raw.size());
        for (auto const ch : raw) {
            bool const extends_star_run{
                    '*' == ch && false == collapsed.empty() && '*' == collapsed.back()
            };
            if (false == extends_star_run) {
                collapsed += ch;
            }
        }
        return collapsed;
    };

    set<QueryInterpretation> result;
    for (auto const& original : interpretations) {
        QueryInterpretation rebuilt;
        for (auto const& token : original.get_logtype()) {
            // Static text: only the substring needs to be carried over.
            if (false == holds_alternative<VariableQueryToken>(token)) {
                string static_text{
                        collapse_star_runs(std::get<StaticQueryToken>(token).get_query_substring())
                };
                rebuilt.append_static_token(static_text);
                continue;
            }

            // Variable: keep its type and wildcard flag alongside the collapsed substring.
            auto const& variable_token{std::get<VariableQueryToken>(token)};
            string variable_text{collapse_star_runs(variable_token.get_query_substring())};
            rebuilt.append_variable_token(
                    variable_token.get_variable_type(),
                    variable_text,
                    variable_token.get_contains_wildcard()
            );
        }
        result.insert(rebuilt);
    }
    return result;
}

is_var = false;
bool contains_wildcard = false;
while (false == is_var && false == contains_wildcard && begin_pos < value_length) {
// Start search at end of last token
begin_pos = end_pos;
/**
* Collects the indices of the logtype tokens that are variable tokens of type `TokenInt` or
* `TokenFloat` and that report containing a wildcard.
* @param interpretation Interpretation whose logtype is scanned.
* @return The matching token indices, in increasing order.
*/
auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& interpretation)
-> vector<size_t> {
auto const logtype{interpretation.get_logtype()};
vector<size_t> wildcard_encodable_positions;
// Upper bound on the result size: at most every token qualifies.
wildcard_encodable_positions.reserve(logtype.size());

// Find variable begin or wildcard
bool is_escaped = false;
for (; begin_pos < value_length; ++begin_pos) {
char c = value[begin_pos];
for (size_t i{0}; i < logtype.size(); ++i) {
auto const& token{logtype[i]};
if (holds_alternative<VariableQueryToken>(token)) {
auto const& var_token{std::get<VariableQueryToken>(token)};
auto const var_type{static_cast<log_surgeon::SymbolId>(var_token.get_variable_type())};
bool const is_int{TokenInt == var_type};
bool const is_float{TokenFloat == var_type};
// Only wildcard-containing int/float variables are recorded.
if (var_token.get_contains_wildcard() && (is_int || is_float)) {
wildcard_encodable_positions.push_back(i);
}
}
}
return wildcard_encodable_positions;
}

if (is_escaped) {
is_escaped = false;
/**
* Builds the logtype string for an interpretation. Static tokens contribute their query substring
* verbatim; every variable token contributes one placeholder appended through
* `EncodedVariableInterpreter::add_int_var`, `add_float_var`, or `add_dict_var`.
* @param interpretation Interpretation whose logtype tokens are walked in order.
* @param wildcard_encodable_positions Token indices whose placeholder kind is chosen by
* `mask_encoded_flags` instead of by converting the token's text.
* @param mask_encoded_flags Indexed by token position. For a position listed in
* `wildcard_encodable_positions`, `true` selects the int/float placeholder and `false` selects the
* dictionary placeholder.
* @return The generated logtype string.
*/
auto GrepCore::generate_logtype_string(
QueryInterpretation const& interpretation,
vector<size_t> const& wildcard_encodable_positions,
vector<bool> const& mask_encoded_flags
) -> string {
string logtype_string;

if (false == lexer.is_delimiter(c)) {
// Found escaped non-delimiter, so reverse the index to retain the escape
// character
--begin_pos;
break;
}
} else if ('\\' == c) {
// Escape character
is_escaped = true;
} else {
if (is_wildcard(c)) {
contains_wildcard = true;
break;
}
if (false == lexer.is_delimiter(c)) {
break;
}
}
// Reserve size for `logtype_string`.
size_t logtype_string_size{0};
auto const logtype{interpretation.get_logtype()};
for (auto const& token : logtype) {
if (holds_alternative<StaticQueryToken>(token)) {
auto const& static_token{std::get<StaticQueryToken>(token)};
logtype_string_size += static_token.get_query_substring().size();
} else {
// Each variable is counted as one character; presumably every placeholder appended below is a
// single character -- TODO confirm against `EncodedVariableInterpreter`.
logtype_string_size++;
}
}
logtype_string.reserve(logtype_string_size);

// Find next delimiter
is_escaped = false;
end_pos = begin_pos;
for (; end_pos < value_length; ++end_pos) {
char c = value[end_pos];
// Generate `logtype_string`.
for (size_t i{0}; i < logtype.size(); ++i) {
auto const& token{logtype[i]};
if (holds_alternative<StaticQueryToken>(token)) {
logtype_string += std::get<StaticQueryToken>(token).get_query_substring();
continue;
}

if (is_escaped) {
is_escaped = false;
auto const& var_token{std::get<VariableQueryToken>(token)};
auto const& raw_string{var_token.get_query_substring()};
auto const var_type{static_cast<log_surgeon::SymbolId>(var_token.get_variable_type())};
bool const is_int{TokenInt == var_type};
bool const is_float{TokenFloat == var_type};

if (lexer.is_delimiter(c)) {
// Found escaped delimiter, so reverse the index to retain the escape character
--end_pos;
break;
// Token positions listed in `wildcard_encodable_positions` take their placeholder kind from
// `mask_encoded_flags[i]`; the variable's text is not converted on this path.
if (wildcard_encodable_positions.end()
!= std::ranges::find(
wildcard_encodable_positions.begin(),
wildcard_encodable_positions.end(),
i
))
{
if (mask_encoded_flags[i]) {
if (is_int) {
EncodedVariableInterpreter::add_int_var(logtype_string);
} else {
EncodedVariableInterpreter::add_float_var(logtype_string);
}
} else if ('\\' == c) {
// Escape character
is_escaped = true;
} else {
if (is_wildcard(c)) {
contains_wildcard = true;
} else if (lexer.is_delimiter(c)) {
// Found delimiter that's not also a wildcard
break;
}
EncodedVariableInterpreter::add_dict_var(logtype_string);
}
continue;
}

if (end_pos > begin_pos) {
bool has_prefix_wildcard = ('*' == value[begin_pos]) || ('?' == value[begin_pos]);
bool has_suffix_wildcard = ('*' == value[end_pos - 1]) || ('?' == value[end_pos - 1]);
bool has_wildcard_in_middle = false;
for (size_t i = begin_pos + 1; i < end_pos - 1; ++i) {
if (('*' == value[i] || '?' == value[i]) && value[i - 1] != '\\') {
has_wildcard_in_middle = true;
break;
}
}
SearchToken search_token;
if (has_wildcard_in_middle || has_prefix_wildcard) {
// DO NOTHING
} else {
StringReader string_reader;
LogSurgeonReader reader_wrapper(string_reader);
log_surgeon::ParserInputBuffer parser_input_buffer;
if (has_suffix_wildcard) { // text*
// TODO: creating a string reader, setting it equal to a string, to read it into
// the ParserInputBuffer, seems like a convoluted way to set a string equal to a
// string, should be improved when adding a SearchParser to log_surgeon
string_reader.open(value.substr(begin_pos, end_pos - begin_pos - 1));
parser_input_buffer.read_if_safe(reader_wrapper);
lexer.reset();
lexer.scan_with_wildcard(parser_input_buffer, value[end_pos - 1], search_token);
} else { // no wildcards
string_reader.open(value.substr(begin_pos, end_pos - begin_pos));
parser_input_buffer.read_if_safe(reader_wrapper);
lexer.reset();
auto [err, token] = lexer.scan(parser_input_buffer);
if (log_surgeon::ErrorCode::Success != err) {
return false;
}
search_token = SearchToken{token.value()};
search_token.m_type_ids_set.insert(search_token.m_type_ids_ptr->at(0));
}
auto const& type = search_token.m_type_ids_ptr->at(0);
if (type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
&& type != static_cast<int>(log_surgeon::SymbolId::TokenEnd))
{
is_var = true;
}
}
// Remaining variables: an int/float placeholder is used only when the conversion call on
// `raw_string` succeeds; otherwise the dictionary placeholder is used. `encoded_var` only
// receives the conversion output and is not read afterwards.
encoded_variable_t encoded_var{0};
if (is_int
&& EncodedVariableInterpreter::convert_string_to_representable_integer_var(
raw_string,
encoded_var
))
{
EncodedVariableInterpreter::add_int_var(logtype_string);
} else if (is_float
&& EncodedVariableInterpreter::convert_string_to_representable_float_var(
raw_string,
encoded_var
))
{
EncodedVariableInterpreter::add_float_var(logtype_string);
} else {
EncodedVariableInterpreter::add_dict_var(logtype_string);
}
}
return (value_length != begin_pos);
return logtype_string;
}
} // namespace clp
Loading
Loading