Skip to content

Commit

Permalink
Move new posixToJavaRegex into Parser as fixRegexes.
Browse files Browse the repository at this point in the history
Pull functionality (`\<`/`\>` and posix class translation) of previous `fixRegexes` into the new one.

Added a "test" (it just parses and prints) to process all locally installed `.nanorc` files.

Adopted existing tests using the previous `fixRegexes` function.

Fixed the wrong handling in the previous commit of Posix classes in bracket expressions.
  • Loading branch information
snazy committed Jan 27, 2025
1 parent d7be34b commit 6c26cf7
Show file tree
Hide file tree
Showing 2 changed files with 280 additions and 182 deletions.
326 changes: 173 additions & 153 deletions builtins/src/main/java/org/jline/builtins/SyntaxHighlighter.java
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ private AttributedStringBuilder _highlight(
return asb;
}

private static class HighlightRule {
static class HighlightRule {
public enum RuleType {
PATTERN,
START_END,
Expand Down Expand Up @@ -465,7 +465,7 @@ public String toString() {
}
}

private static class NanorcParser {
static class NanorcParser {
private static final String DEFAULT_SYNTAX = "default";
private final String name;
private final String target;
Expand Down Expand Up @@ -574,20 +574,7 @@ public void parse() throws IOException {
}

private String fixRegexes(String line) {
return line.replaceAll("\\\\<", "\\\\b")
.replaceAll("\\\\>", "\\\\b")
.replaceAll("\\[:alnum:]", "\\\\p{Alnum}")
.replaceAll("\\[:alpha:]", "\\\\p{Alpha}")
.replaceAll("\\[:blank:]", "\\\\p{Blank}")
.replaceAll("\\[:cntrl:]", "\\\\p{Cntrl}")
.replaceAll("\\[:digit:]", "\\\\p{Digit}")
.replaceAll("\\[:graph:]", "\\\\p{Graph}")
.replaceAll("\\[:lower:]", "\\\\p{Lower}")
.replaceAll("\\[:print:]", "\\\\p{Print}")
.replaceAll("\\[:punct:]", "\\\\p{Punct}")
.replaceAll("\\[:space:]", "\\\\s")
.replaceAll("\\[:upper:]", "\\\\p{Upper}")
.replaceAll("\\[:xdigit:]", "\\\\p{XDigit}");
return line;
}

private boolean addHighlightRule(List<String> parts, int idx, String tokenName) {
Expand Down Expand Up @@ -699,146 +686,11 @@ private void addHighlightRule(String reference, List<String> parts, boolean case
}

private Pattern doPattern(String regex, boolean caseInsensitive) {
regex = posixToJavaRegex(regex);
regex = Parser.fixRegexes(regex);
return caseInsensitive ? Pattern.compile(regex, Pattern.CASE_INSENSITIVE) : Pattern.compile(regex);
}
}

/**
* Posix regex is different from Java regex. This function parses the given Posix regex and escapes according to these rules:
*
* <p>The first {@code ]} in a bracket expression does not need to be escaped in Posix,translate to {@code \]}.
*
* <p>Same as above for a negating bracket expression like {@code [^][]}, translate to {@code [^\]\[]}.
*
* <p>Any {@code [} in a bracket expression does not need to be escaped in Posix, translate to {@code \[}.
*
* <p>Any {@code ]} not in a bracket expression is valid in both Posix and Java, no translation.
*
* <p>A backslash before the closing bracket like {@code [.f\]} is not an escape of the closing bracket,
* the backslash needs to be escaped for Java, translate to {@code [.f\\]}.
*
* <p>Do not perform the above translations within an escape via {@code \}.
*
* <p>Do not perform the above translations for Posix "classes" like {@code [[:word:]]} or {@code [[:digit:]]}
* and their negation {@code [-[:word]]}.
*
* <p>Do not perform the above translations for single-bracket Posix classes like {@code [:digit:]},
* and handle the case of single-bracket Posix classes inside bracket expressions, like
* @code {[[:digit:]-.]}.
*
* @param posix Posix regex
* @return Java regex
*/
static String posixToJavaRegex(String posix) {
int len = posix.length();
StringBuilder java = new StringBuilder();

boolean inBracketExpression = false;

int i = 0;
char next;
try {
for (; i < len; i++) {
char c = posix.charAt(i);

switch (c) {
case '\\':
next = posix.charAt(++i);
// Don't translate anything after the \ character escape
if (inBracketExpression && next == ']') {
inBracketExpression = false;
java.append("\\\\").append(next);
} else {
java.append(c).append(next);
}
break;
case '[':
if (i == len - 1) {
throw new IllegalArgumentException("Lone [ at the end of (index " + i + "): " + posix);
}
// Handle "double bracket" Posix "classes" like [[:word:]] or [[:digit:]] and their negations
// starting with [-[:
if (posix.regionMatches(i, "[[:", 0, 3) || posix.regionMatches(i, "[-[:", 0, 4)) {
int afterClass = nextAfterClass(posix, i + 3);
if (posix.regionMatches(afterClass, ":]]", 0, 3)) {
java.append(posix, i, afterClass + 3);
i = afterClass + 2;
break;
} else if (posix.regionMatches(afterClass, ":]", 0, 2)) {
if (inBracketExpression) {
throw new IllegalArgumentException("Unclear bracket expression");
}
// Handles character patterns like [[:alpha:]_-]
java.append(posix, i, afterClass + 2);
i = afterClass + 1;
inBracketExpression = true;
break;
} else {
throw new IllegalArgumentException("Invalid character class");
}
}
// Handle "single bracket" Posix "classes" like [:word:]
else if (posix.charAt(i + 1) == ':') {
int afterClass = nextAfterClass(posix, i + 2);
if (!posix.regionMatches(afterClass, ":]", 0, 2)) {
java.append("[:");
i++;
inBracketExpression = true;
} else {
java.append(posix, i, afterClass + 2);
i = afterClass + 1;
}
break;
}
if (inBracketExpression) {
// Translate lone [ to \[
java.append('\\').append(c);
} else {
inBracketExpression = true;
java.append(c);
next = posix.charAt(i + 1);
if (next == ']') {
i++;
java.append("\\]");
} else if (next == '^' && posix.charAt(i + 2) == ']') {
i += 2;
java.append("^\\]");
}
}
break;
case ']':
if (inBracketExpression) {
inBracketExpression = false;
}
java.append(c);
break;
default:
java.append(c);
break;
}
}
} catch (Exception e) {
throw new IllegalArgumentException(
"Posix-to-Java regex translation failed around index " + i + " of: " + posix, e);
}
return java.toString();
}

private static int nextAfterClass(String s, int idx) {
if (s.charAt(idx) == ':') {
idx++;
}
while (true) {
char c = s.charAt(idx);
if (!Character.isLetterOrDigit(c)) {
break;
}
idx++;
}
return idx;
}

protected static class RuleSplitter {
protected static List<String> split(String s) {
List<String> out = new ArrayList<>();
Expand Down Expand Up @@ -938,7 +790,7 @@ public int getEnd() {
}
}

private static class Parser {
static class Parser {
private static final char escapeChar = '\\';
private String blockCommentTokenName;
private BlockCommentDelimiters blockCommentDelimiters;
Expand Down Expand Up @@ -1122,5 +974,173 @@ private boolean isEscaped(final CharSequence buffer, final int pos) {
}
return isEscapeChar(buffer, pos - 1);
}

/**
* Perform Posix/Java regex fixups. This function parses the given regex and escapes according to these rules:
*
* <p>The first {@code ]} in a bracket expression does not need to be escaped in Posix,translate to {@code \]}.
*
* <p>Same as above for a negating bracket expression like {@code [^][]}, translate to {@code [^\]\[]}.
*
* <p>Any {@code [} in a bracket expression does not need to be escaped in Posix, translate to {@code \[}.
*
* <p>Any {@code ]} not in a bracket expression is valid in both Posix and Java, no translation.
*
* <p>A backslash before the closing bracket like {@code [.f\]} is not an escape of the closing bracket,
* the backslash needs to be escaped for Java, translate to {@code [.f\\]}.
*
* <p>Do not perform the above translations within an escape via {@code \}, except for {@code \<} and {@code \>} to {@code \b}.
*
* <p>Replace the Posix classes like {@code [:word:]} or {@code [:digit:]} to Java classes, inside and outside a bracket expression.
*
* @param posix Posix regex
* @return Java regex
*/
static String fixRegexes(String posix) {
int len = posix.length();
StringBuilder java = new StringBuilder();

boolean inBracketExpression = false;

int i = 0;
char next;
try {
for (; i < len; i++) {
char c = posix.charAt(i);

switch (c) {
case escapeChar:
next = posix.charAt(++i);
// Don't translate anything after the \ character escape
if (inBracketExpression && next == ']') {
inBracketExpression = false;
java.append("\\\\").append(next);
} else {
// Translate '\<' and '\>' to '\b'
if (next == '<' || next == '>') {
next = 'b';
}
java.append(c).append(next);
}
break;
case '[':
if (i == len - 1) {
throw new IllegalArgumentException("Lone [ at the end of (index " + i + "): " + posix);
}
// Handle "double bracket" Posix "classes" like [[:word:]] or [[:digit:]] and their
// negations
// starting with [-[:
if (posix.regionMatches(i, "[[:", 0, 3)) {
int afterClass = nextAfterClass(posix, i + 3);
if (posix.regionMatches(afterClass, ":]]", 0, 3)) {
String className = posix.substring(i + 3, afterClass);
java.append('[').append(replaceClass(className));
i = afterClass + 1;
inBracketExpression = true;
break;
} else if (posix.regionMatches(afterClass, ":]", 0, 2)) {
if (inBracketExpression) {
throw new IllegalArgumentException("Unclear bracket expression");
}
// Handles character patterns like [[:alpha:]_-]
String className = posix.substring(i + 3, afterClass);
java.append('[').append(replaceClass(className));
i = afterClass + 1;
inBracketExpression = true;
break;
} else {
throw new IllegalArgumentException("Invalid character class");
}
}
// Handle "single bracket" Posix "classes" like [:word:]
else if (posix.charAt(i + 1) == ':') {
int afterClass = nextAfterClass(posix, i + 2);
if (!posix.regionMatches(afterClass, ":]", 0, 2)) {
java.append("[:");
i++;
inBracketExpression = true;
} else {
String className = posix.substring(i + 2, afterClass);
java.append(replaceClass(className));
i = afterClass + 1;
}
break;
}
if (inBracketExpression) {
// Translate lone [ to \[
java.append('\\').append(c);
} else {
inBracketExpression = true;
java.append(c);
next = posix.charAt(i + 1);
if (next == ']') {
i++;
java.append("\\]");
} else if (next == '^' && posix.charAt(i + 2) == ']') {
i += 2;
java.append("^\\]");
}
}
break;
case ']':
inBracketExpression = false;
java.append(c);
break;
default:
java.append(c);
break;
}
}
} catch (Exception e) {
throw new IllegalArgumentException(
"Posix-to-Java regex translation failed around index " + i + " of: " + posix, e);
}

return java.toString();
}

private static String replaceClass(String className) {
switch (className) {
case "alnum":
return "\\p{Alnum}";
case "alpha":
return "\\p{Alpha}";
case "blank":
return "\\p{Blank}";
case "cntrl":
return "\\p{Cntrl}";
case "digit":
return "\\p{Digit}";
case "graph":
return "\\p{Graph}";
case "lower":
return "\\p{Lower}";
case "print":
return "\\p{Print}";
case "punct":
return "\\p{Punct}";
case "space":
return "\\s";
case "upper":
return "\\p{Upper}";
case "xdigit":
return "\\p{XDigit}";
}
throw new IllegalArgumentException("Unknown class '" + className + "'");
}

private static int nextAfterClass(String s, int idx) {
if (s.charAt(idx) == ':') {
idx++;
}
while (true) {
char c = s.charAt(idx);
if (!Character.isLetterOrDigit(c)) {
break;
}
idx++;
}
return idx;
}
}
}
Loading

0 comments on commit 6c26cf7

Please sign in to comment.