Skip to content

Commit 4dbdc9f

Browse files
authored
Fix matching of $ when there are trailing newlines (#1201)
1 parent 9527f4a commit 4dbdc9f

File tree

7 files changed

+489
-5
lines changed

7 files changed

+489
-5
lines changed

src/main/java/com/networknt/schema/regex/JDKRegularExpression.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ class JDKRegularExpression implements RegularExpression {
99
private final Pattern pattern;
1010

1111
JDKRegularExpression(String regex) {
12-
this.pattern = Pattern.compile(regex);
12+
this.pattern = Pattern.compile(RegularExpressions
13+
.replaceLongformCharacterProperties(RegularExpressions.replaceDollarAnchors(regex)));
1314
}
1415

1516
@Override

src/main/java/com/networknt/schema/regex/JoniRegularExpression.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class JoniRegularExpression implements RegularExpression {
5050

5151
JoniRegularExpression(String regex, Syntax syntax) {
5252
validate(regex);
53+
regex = RegularExpressions.replaceDollarAnchors(regex);
5354
byte[] bytes = regex.getBytes(StandardCharsets.UTF_8);
5455
this.pattern = new Regex(bytes, 0, bytes.length, Option.SINGLELINE, ECMAScriptUTF8Encoding.INSTANCE, syntax);
5556
}
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
/*
2+
* Copyright (c) 2025 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.networknt.schema.regex;
17+
18+
import java.util.HashMap;
19+
import java.util.Map;
20+
21+
/**
 * Utility methods for Regular Expressions.
 * <p>
 * The methods here rewrite regular expressions written with ECMAScript semantics
 * in mind so that they behave the same way on engines with different defaults.
 */
public class RegularExpressions {
    private RegularExpressions() {
    }

    /**
     * Replaces every {@code $} anchor in the regex with {@code \z}.
     * <p>
     * The meaning of $ in ecmascript does not allow newlines while for other
     * languages it is typically allowed. The closest to the meaning in ecmascript
     * is \z.
     * <p>
     * A {@code $} is left untouched when it is escaped ({@code \$}), when it
     * appears inside a character class ({@code [a$]}) or when it appears inside a
     * {@code \Q...\E} literal section.
     *
     * @param regex the regex, must not be {@code null}
     * @return the regex with the anchors replaced, or the same instance if it
     *         contains no {@code $} at all
     */
    public static String replaceDollarAnchors(String regex) {
        if (regex.indexOf('$') == -1) {
            return regex;
        }
        /*
         * Note that for joni there's no option for this and this occurs in the Lexer
         * when the regex is compiled. If single line $ is AnchorType.SEMI_END_BUF and
         * if multiline is AnchorType.END_LINE. However what is required is
         * AnchorType.END_BUF.
         */
        final int length = regex.length();
        StringBuilder result = new StringBuilder(length + 8);
        boolean inCharacterClass = false;
        boolean inLiteralSection = false; // This isn't supported by ECMA but by Java
        for (int i = 0; i < length; i++) {
            char ch = regex.charAt(i);
            // Literal Section (not supported by ECMA)
            if (inLiteralSection) {
                if (ch == '\\' && i + 1 < length && regex.charAt(i + 1) == 'E') {
                    result.append("\\E");
                    inLiteralSection = false;
                    i++;
                } else {
                    // Everything else is treated as a literal character
                    result.append(ch);
                }
                continue;
            }
            // Escaped: copy the backslash and the escaped character verbatim
            if (ch == '\\') {
                result.append(ch);
                if (i + 1 < length) {
                    char escapedChar = regex.charAt(i + 1);
                    result.append(escapedChar);
                    if (escapedChar == 'Q') {
                        inLiteralSection = true;
                    }
                    i++;
                }
                continue;
            }
            // Character Class
            if (ch == '[') {
                inCharacterClass = true;
                result.append(ch);
                continue;
            }
            if (ch == ']') {
                inCharacterClass = false;
                result.append(ch);
                continue;
            }
            // Only a $ outside of a character class is an anchor
            if (ch == '$' && !inCharacterClass) {
                result.append("\\z");
            } else {
                result.append(ch);
            }
        }
        return result.toString();
    }

    /**
     * Long form general category names mapped to the short form names.
     */
    private static final Map<String, String> LONGFORM_CHARACTER_PROPERTIES;

    static {
        LONGFORM_CHARACTER_PROPERTIES = new HashMap<>();
        LONGFORM_CHARACTER_PROPERTIES.put("Letter", "L");
        LONGFORM_CHARACTER_PROPERTIES.put("Lowercase_Letter", "Ll");
        LONGFORM_CHARACTER_PROPERTIES.put("Uppercase_Letter", "Lu");
        LONGFORM_CHARACTER_PROPERTIES.put("Titlecase_Letter", "Lt");
        // java.util.regex names the Lu|Ll|Lt category "LC"; the "L&" alias is
        // not a property name it accepts and fails to compile.
        LONGFORM_CHARACTER_PROPERTIES.put("Cased_Letter", "LC");
        LONGFORM_CHARACTER_PROPERTIES.put("Modifier_Letter", "Lm");
        LONGFORM_CHARACTER_PROPERTIES.put("Other_Letter", "Lo");
        LONGFORM_CHARACTER_PROPERTIES.put("Mark", "M");
        LONGFORM_CHARACTER_PROPERTIES.put("Non_Spacing_Mark", "Mn");
        LONGFORM_CHARACTER_PROPERTIES.put("Spacing_Combining_Mark", "Mc");
        LONGFORM_CHARACTER_PROPERTIES.put("Enclosing_Mark", "Me");
        LONGFORM_CHARACTER_PROPERTIES.put("Separator", "Z");
        LONGFORM_CHARACTER_PROPERTIES.put("Space_Separator", "Zs");
        LONGFORM_CHARACTER_PROPERTIES.put("Line_Separator", "Zl");
        LONGFORM_CHARACTER_PROPERTIES.put("Paragraph_Separator", "Zp");
        LONGFORM_CHARACTER_PROPERTIES.put("Symbol", "S");
        LONGFORM_CHARACTER_PROPERTIES.put("Math_Symbol", "Sm");
        LONGFORM_CHARACTER_PROPERTIES.put("Currency_Symbol", "Sc");
        LONGFORM_CHARACTER_PROPERTIES.put("Modifier_Symbol", "Sk");
        LONGFORM_CHARACTER_PROPERTIES.put("Other_Symbol", "So");
        LONGFORM_CHARACTER_PROPERTIES.put("Number", "N");
        LONGFORM_CHARACTER_PROPERTIES.put("Decimal_Digit_Number", "Nd");
        LONGFORM_CHARACTER_PROPERTIES.put("Letter_Number", "Nl");
        LONGFORM_CHARACTER_PROPERTIES.put("Other_Number", "No");
        LONGFORM_CHARACTER_PROPERTIES.put("Punctuation", "P");
        LONGFORM_CHARACTER_PROPERTIES.put("Dash_Punctuation", "Pd");
        LONGFORM_CHARACTER_PROPERTIES.put("Open_Punctuation", "Ps");
        LONGFORM_CHARACTER_PROPERTIES.put("Close_Punctuation", "Pe");
        LONGFORM_CHARACTER_PROPERTIES.put("Initial_Punctuation", "Pi");
        LONGFORM_CHARACTER_PROPERTIES.put("Final_Punctuation", "Pf");
        LONGFORM_CHARACTER_PROPERTIES.put("Connector_Punctuation", "Pc");
        LONGFORM_CHARACTER_PROPERTIES.put("Other_Punctuation", "Po");
        LONGFORM_CHARACTER_PROPERTIES.put("Other", "C");
        LONGFORM_CHARACTER_PROPERTIES.put("Control", "Cc");
        LONGFORM_CHARACTER_PROPERTIES.put("Format", "Cf");
        LONGFORM_CHARACTER_PROPERTIES.put("Private_Use", "Co");
        LONGFORM_CHARACTER_PROPERTIES.put("Surrogate", "Cs");
        LONGFORM_CHARACTER_PROPERTIES.put("Unassigned", "Cn");
        LONGFORM_CHARACTER_PROPERTIES.put("digit", "Nd");
    }

    /**
     * Replaces the longform character properties with the shortform character
     * properties, for instance {@code \p{Letter}} becomes {@code \p{L}}.
     *
     * @param regex the regex, must not be {@code null}
     * @return the replacement
     */
    public static String replaceLongformCharacterProperties(String regex) {
        return replaceCharacterProperties(regex, LONGFORM_CHARACTER_PROPERTIES);
    }

    /**
     * Replaces the names used in {@code \p{name}} and {@code \P{name}} with the
     * names given by the replacements.
     * <p>
     * The character properties in JDK is different from ECMA. Names without an
     * entry in the replacements are kept as they are. Properties that are
     * escaped, that are inside a character class or that are inside a
     * {@code \Q...\E} literal section are not touched.
     *
     * @param regex        the regex, must not be {@code null}
     * @param replacements the property names to replace mapped to the names to
     *                     use instead
     * @return the replacement, or the same instance if the regex contains no
     *         character property
     */
    public static String replaceCharacterProperties(String regex, Map<String, String> replacements) {
        if (regex.indexOf("\\p{") == -1 && regex.indexOf("\\P{") == -1) {
            return regex;
        }
        final int length = regex.length();
        StringBuilder result = new StringBuilder(length);
        boolean inCharacterClass = false;
        boolean inLiteralSection = false; // This isn't supported by ECMA but by Java
        for (int i = 0; i < length; i++) {
            char ch = regex.charAt(i);
            // Literal Section (not supported by ECMA)
            if (inLiteralSection) {
                if (ch == '\\' && i + 1 < length && regex.charAt(i + 1) == 'E') {
                    result.append("\\E");
                    inLiteralSection = false;
                    i++;
                } else {
                    // Everything else is treated as a literal character
                    result.append(ch);
                }
                continue;
            }
            if (!inCharacterClass && isCharacterPropertyStart(regex, i)) {
                // Find the matching closing brace '}'
                int end = findClosingBrace(regex, i + 3);
                if (end != -1) {
                    // Found valid \p{...} or \P{...} outside character class and literal block
                    result.append(regex, i, i + 3); // keeps the original p / P
                    String characterClass = regex.substring(i + 3, end);
                    String replacement = replacements.get(characterClass);
                    result.append(replacement == null ? characterClass : replacement);
                    result.append('}');
                    i = end; // Skip the entire sequence
                    continue;
                }
                // If the closing brace isn't found, fall through and treat as literals
            }
            // Escaped: copy the backslash and the escaped character verbatim
            if (ch == '\\') {
                result.append(ch);
                if (i + 1 < length) {
                    char escapedChar = regex.charAt(i + 1);
                    result.append(escapedChar);
                    if (escapedChar == 'Q') {
                        inLiteralSection = true;
                    }
                    i++;
                }
                continue;
            }
            // Character Class
            if (ch == '[') {
                inCharacterClass = true;
            } else if (ch == ']') {
                inCharacterClass = false;
            }
            result.append(ch);
        }
        return result.toString();
    }

    /**
     * Determines if a character property, either {@code \p{} or the negated
     * {@code \P{}, starts at the index.
     */
    private static boolean isCharacterPropertyStart(String regex, int index) {
        return regex.startsWith("\\p{", index) || regex.startsWith("\\P{", index);
    }

    /**
     * Finds the index of the first unescaped closing brace at or after the start
     * index.
     *
     * @param regex the regex
     * @param start the index to start searching from
     * @return the index of the closing brace or -1 if there is none
     */
    private static int findClosingBrace(String regex, int start) {
        int i = start;
        while (i < regex.length()) {
            char ch = regex.charAt(i);
            if (ch == '}') {
                return i;
            }
            if (ch == '\\' && i + 1 < regex.length()) {
                i++; // skip the escaped character
            }
            i++;
        }
        return -1;
    }
}

src/test/java/com/networknt/schema/regex/GraalJSRegularExpressionTest.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,30 @@ void anchorShouldNotMatchMultilineInput() {
163163
assertFalse(regex.matches("abc\n"));
164164
}
165165

166+
@Test
167+
void anchorStartShouldNotMatchMultilineInput() {
168+
RegularExpression regex = new GraalJSRegularExpression("^[a-z]{1,10}$", CONTEXT);
169+
assertFalse(regex.matches("\nabc"));
170+
}
171+
172+
@Test
173+
void dollarInCharacterClassShouldNotBeInterpretedAsAnchor() {
174+
RegularExpression regex = new GraalJSRegularExpression("^[a$]{1,10}$", CONTEXT);
175+
assertTrue(regex.matches("a$a$a$a$aa"));
176+
}
177+
178+
@Test
179+
void escapedDollarShouldNotBeInterpretedAsAnchor() {
180+
RegularExpression regex = new GraalJSRegularExpression("\\$", CONTEXT);
181+
assertTrue(regex.matches("$"));
182+
}
183+
184+
@Test
185+
void escapedDollarInCharacterClassShouldNotBeInterpretedAsAnchor() {
186+
RegularExpression regex = new GraalJSRegularExpression("[\\$]", CONTEXT);
187+
assertTrue(regex.matches("$"));
188+
}
189+
166190
/**
167191
* This test is because the JDK regex matches function implicitly adds anchors
168192
* which isn't expected.
@@ -211,4 +235,32 @@ public void run() {
211235
throw instance[0];
212236
}
213237
}
238+
239+
/**
 * Test cases for long form character properties: the regex to compile, the
 * input to match against it and whether the match is expected to succeed.
 */
enum CharacterClassInput {
    LETTER("\\p{Letter}", "hello", true),
    NUMBER("\\p{Number}", "1", true),
    LOWERCASE_LETTER("\\p{Lowercase_Letter}", "A", false),
    ;

    // Enum constants are shared, so their state is fixed at construction.
    final String regex;
    final String input;
    final boolean result;

    /**
     * @param regex  the regular expression under test
     * @param input  the value to match
     * @param result the expected outcome of the match
     */
    CharacterClassInput(String regex, String input, boolean result) {
        this.regex = regex;
        this.input = input;
        this.result = result;
    }
}
255+
256+
@ParameterizedTest
257+
@EnumSource(CharacterClassInput.class)
258+
void characterClass(CharacterClassInput input) {
259+
RegularExpression regex = new GraalJSRegularExpression(input.regex, CONTEXT);
260+
if(input.result) {
261+
assertTrue(regex.matches(input.input));
262+
} else {
263+
assertFalse(regex.matches(input.input));
264+
}
265+
}
214266
}

0 commit comments

Comments
 (0)