Move new posixToJavaRegex into Parser as fixRegexes.

Pull functionality (`\<`/`\>` and posix class translation) of previous `fixRegexes` into the new one. Added a "test" (it just parses and prints) to process all locally installed `.nanorc` files. Adopted existing tests using the previous `fixRegexes` function. Fixed the wrong handling in the previous commit of Posix classes in bracket expressions.
jline · Jan 27, 2025 · 6c26cf7 · 6c26cf7
1 parent d7be34b
commit 6c26cf7
Show file tree

Hide file tree

Showing 2 changed files with 280 additions and 182 deletions.
diff --git a/builtins/src/main/java/org/jline/builtins/SyntaxHighlighter.java b/builtins/src/main/java/org/jline/builtins/SyntaxHighlighter.java
@@ -356,7 +356,7 @@ private AttributedStringBuilder _highlight(
         return asb;
     }
 
-    private static class HighlightRule {
+    static class HighlightRule {
         public enum RuleType {
             PATTERN,
             START_END,
@@ -465,7 +465,7 @@ public String toString() {
         }
     }
 
-    private static class NanorcParser {
+    static class NanorcParser {
         private static final String DEFAULT_SYNTAX = "default";
         private final String name;
         private final String target;
@@ -574,20 +574,7 @@ public void parse() throws IOException {
         }
 
         private String fixRegexes(String line) {
-            return line.replaceAll("\\\\<", "\\\\b")
-                    .replaceAll("\\\\>", "\\\\b")
-                    .replaceAll("\\[:alnum:]", "\\\\p{Alnum}")
-                    .replaceAll("\\[:alpha:]", "\\\\p{Alpha}")
-                    .replaceAll("\\[:blank:]", "\\\\p{Blank}")
-                    .replaceAll("\\[:cntrl:]", "\\\\p{Cntrl}")
-                    .replaceAll("\\[:digit:]", "\\\\p{Digit}")
-                    .replaceAll("\\[:graph:]", "\\\\p{Graph}")
-                    .replaceAll("\\[:lower:]", "\\\\p{Lower}")
-                    .replaceAll("\\[:print:]", "\\\\p{Print}")
-                    .replaceAll("\\[:punct:]", "\\\\p{Punct}")
-                    .replaceAll("\\[:space:]", "\\\\s")
-                    .replaceAll("\\[:upper:]", "\\\\p{Upper}")
-                    .replaceAll("\\[:xdigit:]", "\\\\p{XDigit}");
+            return line;
         }
 
         private boolean addHighlightRule(List<String> parts, int idx, String tokenName) {
@@ -699,146 +686,11 @@ private void addHighlightRule(String reference, List<String> parts, boolean case
         }
 
         private Pattern doPattern(String regex, boolean caseInsensitive) {
-            regex = posixToJavaRegex(regex);
+            regex = Parser.fixRegexes(regex);
             return caseInsensitive ? Pattern.compile(regex, Pattern.CASE_INSENSITIVE) : Pattern.compile(regex);
         }
     }
 
-    /**
-     * Posix regex is different from Java regex. This function parses the given Posix regex and escapes according to these rules:
-     *
-     * <p>The first {@code ]} in a bracket expression does not need to be escaped in Posix,translate to {@code \]}.
-     *
-     * <p>Same as above for a negating bracket expression like {@code [^][]}, translate to {@code [^\]\[]}.
-     *
-     * <p>Any {@code [} in a bracket expression does not need to be escaped in Posix, translate to {@code \[}.
-     *
-     * <p>Any {@code ]} not in a bracket expression is valid in both Posix and Java, no translation.
-     *
-     * <p>A backslash before the closing bracket like {@code [.f\]} is not an escape of the closing bracket,
-     * the backslash needs to be escaped for Java, translate to {@code [.f\\]}.
-     *
-     * <p>Do not perform the above translations within an escape via {@code \}.
-     *
-     * <p>Do not perform the above translations for Posix "classes" like {@code [[:word:]]} or {@code [[:digit:]]}
-     * and their negation {@code [-[:word]]}.
-     *
-     * <p>Do not perform the above translations for single-bracket Posix classes like {@code [:digit:]},
-     * and handle the case of single-bracket Posix classes inside bracket expressions, like
-     * @code {[[:digit:]-.]}.
-     *
-     * @param posix Posix regex
-     * @return Java regex
-     */
-    static String posixToJavaRegex(String posix) {
-        int len = posix.length();
-        StringBuilder java = new StringBuilder();
-
-        boolean inBracketExpression = false;
-
-        int i = 0;
-        char next;
-        try {
-            for (; i < len; i++) {
-                char c = posix.charAt(i);
-
-                switch (c) {
-                    case '\\':
-                        next = posix.charAt(++i);
-                        // Don't translate anything after the \ character escape
-                        if (inBracketExpression && next == ']') {
-                            inBracketExpression = false;
-                            java.append("\\\\").append(next);
-                        } else {
-                            java.append(c).append(next);
-                        }
-                        break;
-                    case '[':
-                        if (i == len - 1) {
-                            throw new IllegalArgumentException("Lone [ at the end of (index " + i + "): " + posix);
-                        }
-                        // Handle "double bracket" Posix "classes" like [[:word:]] or [[:digit:]] and their negations
-                        // starting with [-[:
-                        if (posix.regionMatches(i, "[[:", 0, 3) || posix.regionMatches(i, "[-[:", 0, 4)) {
-                            int afterClass = nextAfterClass(posix, i + 3);
-                            if (posix.regionMatches(afterClass, ":]]", 0, 3)) {
-                                java.append(posix, i, afterClass + 3);
-                                i = afterClass + 2;
-                                break;
-                            } else if (posix.regionMatches(afterClass, ":]", 0, 2)) {
-                                if (inBracketExpression) {
-                                    throw new IllegalArgumentException("Unclear bracket expression");
-                                }
-                                // Handles character patterns like [[:alpha:]_-]
-                                java.append(posix, i, afterClass + 2);
-                                i = afterClass + 1;
-                                inBracketExpression = true;
-                                break;
-                            } else {
-                                throw new IllegalArgumentException("Invalid character class");
-                            }
-                        }
-                        // Handle "single bracket" Posix "classes" like [:word:]
-                        else if (posix.charAt(i + 1) == ':') {
-                            int afterClass = nextAfterClass(posix, i + 2);
-                            if (!posix.regionMatches(afterClass, ":]", 0, 2)) {
-                                java.append("[:");
-                                i++;
-                                inBracketExpression = true;
-                            } else {
-                                java.append(posix, i, afterClass + 2);
-                                i = afterClass + 1;
-                            }
-                            break;
-                        }
-                        if (inBracketExpression) {
-                            // Translate lone [ to \[
-                            java.append('\\').append(c);
-                        } else {
-                            inBracketExpression = true;
-                            java.append(c);
-                            next = posix.charAt(i + 1);
-                            if (next == ']') {
-                                i++;
-                                java.append("\\]");
-                            } else if (next == '^' && posix.charAt(i + 2) == ']') {
-                                i += 2;
-                                java.append("^\\]");
-                            }
-                        }
-                        break;
-                    case ']':
-                        if (inBracketExpression) {
-                            inBracketExpression = false;
-                        }
-                        java.append(c);
-                        break;
-                    default:
-                        java.append(c);
-                        break;
-                }
-            }
-        } catch (Exception e) {
-            throw new IllegalArgumentException(
-                    "Posix-to-Java regex translation failed around index " + i + " of: " + posix, e);
-        }
-        return java.toString();
-    }
-
-    private static int nextAfterClass(String s, int idx) {
-        if (s.charAt(idx) == ':') {
-            idx++;
-        }
-        while (true) {
-            char c = s.charAt(idx);
-            if (!Character.isLetterOrDigit(c)) {
-                break;
-            }
-            idx++;
-        }
-        return idx;
-    }
-
     protected static class RuleSplitter {
         protected static List<String> split(String s) {
             List<String> out = new ArrayList<>();
@@ -938,7 +790,7 @@ public int getEnd() {
         }
     }
 
-    private static class Parser {
+    static class Parser {
         private static final char escapeChar = '\\';
         private String blockCommentTokenName;
         private BlockCommentDelimiters blockCommentDelimiters;
@@ -1122,5 +974,173 @@ private boolean isEscaped(final CharSequence buffer, final int pos) {
             }
             return isEscapeChar(buffer, pos - 1);
         }
+
+        /**
+         * Perform Posix/Java regex fixups. This function parses the given regex and escapes according to these rules:
+         *
+         * <p>The first {@code ]} in a bracket expression does not need to be escaped in Posix,translate to {@code \]}.
+         *
+         * <p>Same as above for a negating bracket expression like {@code [^][]}, translate to {@code [^\]\[]}.
+         *
+         * <p>Any {@code [} in a bracket expression does not need to be escaped in Posix, translate to {@code \[}.
+         *
+         * <p>Any {@code ]} not in a bracket expression is valid in both Posix and Java, no translation.
+         *
+         * <p>A backslash before the closing bracket like {@code [.f\]} is not an escape of the closing bracket,
+         * the backslash needs to be escaped for Java, translate to {@code [.f\\]}.
+         *
+         * <p>Do not perform the above translations within an escape via {@code \}, except for {@code \<} and {@code \>} to {@code \b}.
+         *
+         * <p>Replace the Posix classes like {@code [:word:]} or {@code [:digit:]} to Java classes, inside and outside a bracket expression.
+         *
+         * @param posix Posix regex
+         * @return Java regex
+         */
+        static String fixRegexes(String posix) {
+            int len = posix.length();
+            StringBuilder java = new StringBuilder();
+
+            boolean inBracketExpression = false;
+
+            int i = 0;
+            char next;
+            try {
+                for (; i < len; i++) {
+                    char c = posix.charAt(i);
+
+                    switch (c) {
+                        case escapeChar:
+                            next = posix.charAt(++i);
+                            // Don't translate anything after the \ character escape
+                            if (inBracketExpression && next == ']') {
+                                inBracketExpression = false;
+                                java.append("\\\\").append(next);
+                            } else {
+                                // Translate '\<' and '\>' to '\b'
+                                if (next == '<' || next == '>') {
+                                    next = 'b';
+                                }
+                                java.append(c).append(next);
+                            }
+                            break;
+                        case '[':
+                            if (i == len - 1) {
+                                throw new IllegalArgumentException("Lone [ at the end of (index " + i + "): " + posix);
+                            }
+                            // Handle "double bracket" Posix "classes" like [[:word:]] or [[:digit:]] and their
+                            // negations
+                            // starting with [-[:
+                            if (posix.regionMatches(i, "[[:", 0, 3)) {
+                                int afterClass = nextAfterClass(posix, i + 3);
+                                if (posix.regionMatches(afterClass, ":]]", 0, 3)) {
+                                    String className = posix.substring(i + 3, afterClass);
+                                    java.append('[').append(replaceClass(className));
+                                    i = afterClass + 1;
+                                    inBracketExpression = true;
+                                    break;
+                                } else if (posix.regionMatches(afterClass, ":]", 0, 2)) {
+                                    if (inBracketExpression) {
+                                        throw new IllegalArgumentException("Unclear bracket expression");
+                                    }
+                                    // Handles character patterns like [[:alpha:]_-]
+                                    String className = posix.substring(i + 3, afterClass);
+                                    java.append('[').append(replaceClass(className));
+                                    i = afterClass + 1;
+                                    inBracketExpression = true;
+                                    break;
+                                } else {
+                                    throw new IllegalArgumentException("Invalid character class");
+                                }
+                            }
+                            // Handle "single bracket" Posix "classes" like [:word:]
+                            else if (posix.charAt(i + 1) == ':') {
+                                int afterClass = nextAfterClass(posix, i + 2);
+                                if (!posix.regionMatches(afterClass, ":]", 0, 2)) {
+                                    java.append("[:");
+                                    i++;
+                                    inBracketExpression = true;
+                                } else {
+                                    String className = posix.substring(i + 2, afterClass);
+                                    java.append(replaceClass(className));
+                                    i = afterClass + 1;
+                                }
+                                break;
+                            }
+                            if (inBracketExpression) {
+                                // Translate lone [ to \[
+                                java.append('\\').append(c);
+                            } else {
+                                inBracketExpression = true;
+                                java.append(c);
+                                next = posix.charAt(i + 1);
+                                if (next == ']') {
+                                    i++;
+                                    java.append("\\]");
+                                } else if (next == '^' && posix.charAt(i + 2) == ']') {
+                                    i += 2;
+                                    java.append("^\\]");
+                                }
+                            }
+                            break;
+                        case ']':
+                            inBracketExpression = false;
+                            java.append(c);
+                            break;
+                        default:
+                            java.append(c);
+                            break;
+                    }
+                }
+            } catch (Exception e) {
+                throw new IllegalArgumentException(
+                        "Posix-to-Java regex translation failed around index " + i + " of: " + posix, e);
+            }
+
+            return java.toString();
+        }
+
+        private static String replaceClass(String className) {
+            switch (className) {
+                case "alnum":
+                    return "\\p{Alnum}";
+                case "alpha":
+                    return "\\p{Alpha}";
+                case "blank":
+                    return "\\p{Blank}";
+                case "cntrl":
+                    return "\\p{Cntrl}";
+                case "digit":
+                    return "\\p{Digit}";
+                case "graph":
+                    return "\\p{Graph}";
+                case "lower":
+                    return "\\p{Lower}";
+                case "print":
+                    return "\\p{Print}";
+                case "punct":
+                    return "\\p{Punct}";
+                case "space":
+                    return "\\s";
+                case "upper":
+                    return "\\p{Upper}";
+                case "xdigit":
+                    return "\\p{XDigit}";
+            }
+            throw new IllegalArgumentException("Unknown class '" + className + "'");
+        }
+
+        private static int nextAfterClass(String s, int idx) {
+            if (s.charAt(idx) == ':') {
+                idx++;
+            }
+            while (true) {
+                char c = s.charAt(idx);
+                if (!Character.isLetterOrDigit(c)) {
+                    break;
+                }
+                idx++;
+            }
+            return idx;
+        }
     }
 }