Skip to content

Commit 0ed19dc

Browse files
authored
ingest: Dissect processor and lib for 6x (elastic#33422)
* Introduce the dissect library (elastic#32297) The dissect library will be used for the ingest node as an alternative to Grok to split a string based on a pattern. Dissect differs from Grok in that regular expressions are not used to split the string. Note - Regular expressions are used during construction of the objects, but not in the hot path. A dissect pattern takes the form of: '%{a} %{b},%{c}' which is composed of 3 keys (a,b,c) and two delimiters (space and comma). This dissect pattern will match a string of the form: 'foo bar,baz' and will result in a key/value pairing of 'a=foo, b=bar, and c=baz'. See the comments in DissectParser for a full explanation. This commit does not include the ingest node processor that will consume it. However, the consumption should be a trivial mapping between the key/value pairing returned by the parser and the key/value pairing needed for the IngestDocument. * ingest: Introduce the dissect processor (elastic#32884) The ingest node dissect processor is an alternative to Grok to split a string based on a pattern. Dissect differs from Grok in that regular expressions are not used to split the string. Dissect can be used to parse a source text field with a simpler pattern, and is often faster than Grok for basic string parsing. This processor uses the dissect library which does most of the work. * ingest: minor - update test to include dissect (elastic#33211) This change also includes placing the bytes processor in the correct order (helps to avoid merge conflict when back patching processors)
1 parent 6357a57 commit 0ed19dc

File tree

19 files changed

+2541
-137
lines changed

19 files changed

+2541
-137
lines changed

docs/reference/ingest/ingest-node.asciidoc

+312-119
Large diffs are not rendered by default.

libs/dissect/build.gradle

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import org.elasticsearch.gradle.precommit.PrecommitTasks
2+
3+
/*
4+
* Licensed to Elasticsearch under one or more contributor
5+
* license agreements. See the NOTICE file distributed with
6+
* this work for additional information regarding copyright
7+
* ownership. Elasticsearch licenses this file to you under
8+
* the Apache License, Version 2.0 (the "License"); you may
9+
* not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing,
15+
* software distributed under the License is distributed on an
16+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17+
* KIND, either express or implied. See the License for the
18+
* specific language governing permissions and limitations
19+
* under the License.
20+
*/
21+
22+
// Base name used for the archives this project produces.
archivesBaseName = 'elasticsearch-dissect'
23+
24+
// Test-only dependencies for the dissect library.
dependencies {
25+
// The test framework is wired in for every non-Eclipse build, and in Eclipse only for the
// separate ":libs:dissect-tests" project.
if (isEclipse == false || project.path == ":libs:dissect-tests") {
26+
testCompile("org.elasticsearch.test:framework:${version}") {
27+
// NOTE(review): presumably excluded so the test framework does not pull in this library again - confirm
exclude group: 'org.elasticsearch', module: 'dissect'
28+
}
29+
}
30+
// Jackson (core, annotations, databind) is available to tests only; the version comes from versions.jackson.
testCompile "com.fasterxml.jackson.core:jackson-core:${versions.jackson}"
31+
testCompile("com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}")
32+
testCompile("com.fasterxml.jackson.core:jackson-databind:${versions.jackson}")
33+
}
34+
35+
// The forbidden-APIs check for main sources uses only the 'jdk-signatures' signature file set.
forbiddenApisMain {
36+
replaceSignatureFiles 'jdk-signatures'
37+
}
38+
39+
if (isEclipse) {
40+
// in eclipse the project is under a fake root, we need to change around the source sets
41+
sourceSets {
42+
// ":libs:dissect" carries the main sources; any other path (the tests project) carries the test sources.
if (project.path == ":libs:dissect") {
43+
main.java.srcDirs = ['java']
44+
main.resources.srcDirs = ['resources']
45+
} else {
46+
test.java.srcDirs = ['java']
47+
test.resources.srcDirs = ['resources']
48+
}
49+
}
50+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// This is just a shell Gradle file that lets Eclipse have separate projects for the dissect sources and tests;
// all of the actual build logic is applied from the shared build file two directories up.
3+
apply from: '../../build.gradle'
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.dissect;
21+
22+
/**
 * Base type of every exception raised by the dissect library. Callers can catch this type to handle any
 * dissect failure, or one of the nested, more specific subtypes.
 */
public abstract class DissectException extends RuntimeException {

    DissectException(String message) {
        super(message);
    }

    /** Builds the shared {@code <prefix><subject> Reason: <reason>} message used by the parse failures. */
    private static String parseFailure(String prefix, String subject, String reason) {
        StringBuilder text = new StringBuilder(prefix);
        text.append(subject);
        text.append(" Reason: ");
        text.append(reason);
        return text.toString();
    }

    /** Builds the message reported when a pattern cannot be matched against a source string. */
    private static String matchFailure(String pattern, String source) {
        StringBuilder text = new StringBuilder("Unable to find match for dissect pattern: ");
        text.append(pattern);
        text.append(" against source: ");
        text.append(source);
        return text.toString();
    }

    /**
     * Raised when a dissect pattern cannot be parsed.
     */
    static class PatternParse extends DissectException {
        PatternParse(String pattern, String reason) {
            super(parseFailure("Unable to parse pattern: ", pattern, reason));
        }
    }

    /**
     * Raised when a single dissect key cannot be parsed.
     */
    static class KeyParse extends DissectException {
        KeyParse(String key, String reason) {
            super(parseFailure("Unable to parse key: ", key, reason));
        }
    }

    /**
     * Raised when the source string does not match the dissect pattern.
     */
    static class FindMatch extends DissectException {
        FindMatch(String pattern, String source) {
            super(matchFailure(pattern, source));
        }
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.dissect;
21+
22+
import java.util.EnumSet;
23+
import java.util.regex.Matcher;
24+
import java.util.regex.Pattern;
25+
26+
/**
27+
* <p>A Key of a dissect pattern. This class models the name and modifiers and provides some validation.</p>
28+
* <p>For dissect pattern of {@code %{a} %{+a} %{b}} the dissect keys are:
29+
* <ul>
30+
* <li>{@code a}</li>
31+
* <li>{@code +a}</li>
32+
* <li>{@code b}</li>
33+
* </ul>
34+
* This class represents a single key.
35+
* <p>A single key is composed of a name and it's modifiers. For the key {@code +a}, {@code a} is the name and {@code +} is the modifier.
36+
* @see DissectParser
37+
*/
38+
public final class DissectKey {
39+
private static final Pattern LEFT_MODIFIER_PATTERN = Pattern.compile("([+*&?])(.*?)(->)?$", Pattern.DOTALL);
40+
private static final Pattern RIGHT_PADDING_PATTERN = Pattern.compile("^(.*?)(->)?$", Pattern.DOTALL);
41+
private static final Pattern APPEND_WITH_ORDER_PATTERN = Pattern.compile("[+](.*?)(/)([0-9]+)(->)?$", Pattern.DOTALL);
42+
private final Modifier modifier;
43+
private boolean skip;
44+
private boolean skipRightPadding;
45+
private int appendPosition;
46+
private String name;
47+
48+
/**
49+
* Constructor - parses the String key into it's name and modifier(s)
50+
*
51+
* @param key The key without the leading <code>%{</code> or trailing <code>}</code>, for example {@code a->}
52+
*/
53+
DissectKey(String key) {
54+
skip = key == null || key.isEmpty();
55+
modifier = Modifier.findModifier(key);
56+
switch (modifier) {
57+
case NONE:
58+
Matcher matcher = RIGHT_PADDING_PATTERN.matcher(key);
59+
while (matcher.find()) {
60+
name = matcher.group(1);
61+
skipRightPadding = matcher.group(2) != null;
62+
}
63+
skip = name.isEmpty();
64+
break;
65+
case NAMED_SKIP:
66+
matcher = LEFT_MODIFIER_PATTERN.matcher(key);
67+
while (matcher.find()) {
68+
name = matcher.group(2);
69+
skipRightPadding = matcher.group(3) != null;
70+
}
71+
skip = true;
72+
break;
73+
case APPEND:
74+
matcher = LEFT_MODIFIER_PATTERN.matcher(key);
75+
while (matcher.find()) {
76+
name = matcher.group(2);
77+
skipRightPadding = matcher.group(3) != null;
78+
}
79+
break;
80+
case FIELD_NAME:
81+
matcher = LEFT_MODIFIER_PATTERN.matcher(key);
82+
while (matcher.find()) {
83+
name = matcher.group(2);
84+
skipRightPadding = matcher.group(3) != null;
85+
}
86+
break;
87+
case FIELD_VALUE:
88+
matcher = LEFT_MODIFIER_PATTERN.matcher(key);
89+
while (matcher.find()) {
90+
name = matcher.group(2);
91+
skipRightPadding = matcher.group(3) != null;
92+
}
93+
break;
94+
case APPEND_WITH_ORDER:
95+
matcher = APPEND_WITH_ORDER_PATTERN.matcher(key);
96+
while (matcher.find()) {
97+
name = matcher.group(1);
98+
appendPosition = Short.valueOf(matcher.group(3));
99+
skipRightPadding = matcher.group(4) != null;
100+
}
101+
break;
102+
}
103+
104+
if (name == null || (name.isEmpty() && !skip)) {
105+
throw new DissectException.KeyParse(key, "The key name could be determined");
106+
}
107+
}
108+
109+
/**
110+
* Copy constructor to explicitly override the modifier.
111+
* @param key The key to copy (except for the modifier)
112+
* @param modifier the modifer to use for this copy
113+
*/
114+
DissectKey(DissectKey key, DissectKey.Modifier modifier){
115+
this.modifier = modifier;
116+
this.skipRightPadding = key.skipRightPadding;
117+
this.skip = key.skip;
118+
this.name = key.name;
119+
this.appendPosition = key.appendPosition;
120+
}
121+
122+
Modifier getModifier() {
123+
return modifier;
124+
}
125+
126+
boolean skip() {
127+
return skip;
128+
}
129+
130+
boolean skipRightPadding() {
131+
return skipRightPadding;
132+
}
133+
134+
int getAppendPosition() {
135+
return appendPosition;
136+
}
137+
138+
String getName() {
139+
return name;
140+
}
141+
142+
//generated
143+
@Override
144+
public String toString() {
145+
return "DissectKey{" +
146+
"modifier=" + modifier +
147+
", skip=" + skip +
148+
", appendPosition=" + appendPosition +
149+
", name='" + name + '\'' +
150+
'}';
151+
}
152+
153+
public enum Modifier {
154+
NONE(""), APPEND_WITH_ORDER("/"), APPEND("+"), FIELD_NAME("*"), FIELD_VALUE("&"), NAMED_SKIP("?");
155+
156+
private static final Pattern MODIFIER_PATTERN = Pattern.compile("[/+*&?]");
157+
158+
private final String modifier;
159+
160+
@Override
161+
public String toString() {
162+
return modifier;
163+
}
164+
165+
Modifier(final String modifier) {
166+
this.modifier = modifier;
167+
}
168+
169+
//package private for testing
170+
static Modifier fromString(String modifier) {
171+
return EnumSet.allOf(Modifier.class).stream().filter(km -> km.modifier.equals(modifier))
172+
.findFirst().orElseThrow(() -> new IllegalArgumentException("Found invalid modifier.")); //throw should never happen
173+
}
174+
175+
private static Modifier findModifier(String key) {
176+
Modifier modifier = Modifier.NONE;
177+
if (key != null && !key.isEmpty()) {
178+
Matcher matcher = MODIFIER_PATTERN.matcher(key);
179+
int matches = 0;
180+
while (matcher.find()) {
181+
Modifier priorModifier = modifier;
182+
modifier = Modifier.fromString(matcher.group());
183+
if (++matches > 1 && !(APPEND.equals(priorModifier) && APPEND_WITH_ORDER.equals(modifier))) {
184+
throw new DissectException.KeyParse(key, "multiple modifiers are not allowed.");
185+
}
186+
}
187+
}
188+
return modifier;
189+
}
190+
}
191+
}

0 commit comments

Comments
 (0)