janet-lang · ianthehenry · Dec 5, 2024
diff --git a/src/core/peg.c b/src/core/peg.c
@@ -183,551 +183,587 @@
    const uint32_t *rule,
    const uint8_t *text) {
 tail:
    switch (*rule) {
        default:
            janet_panic("unexpected opcode");
            return NULL;

        case RULE_LITERAL: {
            uint32_t len = rule[1];
            if (text + len > s->text_end) return NULL;
            return memcmp(text, rule + 2, len) ? NULL : text + len;
        }

        case RULE_NCHAR: {
            uint32_t n = rule[1];
            return (text + n > s->text_end) ? NULL : text + n;
        }

        case RULE_NOTNCHAR: {
            uint32_t n = rule[1];
            return (text + n > s->text_end) ? text : NULL;
        }

        case RULE_RANGE: {
            uint8_t lo = rule[1] & 0xFF;
            uint8_t hi = (rule[1] >> 16) & 0xFF;
            return (text < s->text_end &&
                    text[0] >= lo &&
                    text[0] <= hi)
                   ? text + 1
                   : NULL;
        }

        case RULE_SET: {
            if (text >= s->text_end) return NULL;
            uint32_t word = rule[1 + (text[0] >> 5)];
            uint32_t mask = (uint32_t)1 << (text[0] & 0x1F);
            return (word & mask)
                   ? text + 1
                   : NULL;
        }

        case RULE_LOOK: {
            text += ((int32_t *)rule)[1];
            if (text < s->text_start || text > s->text_end) return NULL;
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[2], text);
            up1(s);
            text -= ((int32_t *)rule)[1];
            return result ? text : NULL;
        }

        case RULE_CHOICE: {
            uint32_t len = rule[1];
            const uint32_t *args = rule + 2;
            if (len == 0) return NULL;
            down1(s);
            CapState cs = cap_save(s);
            for (uint32_t i = 0; i < len - 1; i++) {
                const uint8_t *result = peg_rule(s, s->bytecode + args[i], text);
                if (result) {
                    up1(s);
                    return result;
                }
                cap_load(s, cs);
            }
            up1(s);
            rule = s->bytecode + args[len - 1];
            goto tail;
        }

        case RULE_SEQUENCE: {
            uint32_t len = rule[1];
            const uint32_t *args = rule + 2;
            if (len == 0) return text;
            down1(s);
            for (uint32_t i = 0; text && i < len - 1; i++)
                text = peg_rule(s, s->bytecode + args[i], text);
            up1(s);
            if (!text) return NULL;
            rule = s->bytecode + args[len - 1];
            goto tail;
        }

        case RULE_IF: {
            const uint32_t *rule_a = s->bytecode + rule[1];
            const uint32_t *rule_b = s->bytecode + rule[2];
            down1(s);
            const uint8_t *result = peg_rule(s, rule_a, text);
            up1(s);
            if (!result) return NULL;
            rule = rule_b;
            goto tail;
        }
        case RULE_IFNOT: {
            const uint32_t *rule_a = s->bytecode + rule[1];
            const uint32_t *rule_b = s->bytecode + rule[2];
            down1(s);
            CapState cs = cap_save(s);
            const uint8_t *result = peg_rule(s, rule_a, text);
            if (!!result) {
                up1(s);
                return NULL;
            } else {
                cap_load(s, cs);
                up1(s);
                rule = rule_b;
                goto tail;
            }
        }

        case RULE_NOT: {
            const uint32_t *rule_a = s->bytecode + rule[1];
            down1(s);
            CapState cs = cap_save(s);
            const uint8_t *result = peg_rule(s, rule_a, text);
            if (result) {
                up1(s);
                return NULL;
            } else {
                cap_load(s, cs);
                up1(s);
                return text;
            }
        }

        case RULE_THRU:
        case RULE_TO: {
            const uint32_t *rule_a = s->bytecode + rule[1];
            const uint8_t *next_text = NULL;
            CapState cs = cap_save(s);
            down1(s);
            while (text <= s->text_end) {
                CapState cs2 = cap_save(s);
                next_text = peg_rule(s, rule_a, text);
                if (next_text) {
                    if (rule[0] == RULE_TO) cap_load(s, cs2);
                    break;
                }
                cap_load(s, cs2);
                text++;
            }
            up1(s);
            if (text > s->text_end) {
                cap_load(s, cs);
                return NULL;
            }
            return rule[0] == RULE_TO ? text : next_text;
        }

        case RULE_BETWEEN: {
            uint32_t lo = rule[1];
            uint32_t hi = rule[2];
            const uint32_t *rule_a = s->bytecode + rule[3];
            uint32_t captured = 0;
            const uint8_t *next_text;
            CapState cs = cap_save(s);
            down1(s);
            while (captured < hi) {
                CapState cs2 = cap_save(s);
                next_text = peg_rule(s, rule_a, text);
                if (!next_text || next_text == text) {
                    cap_load(s, cs2);
                    break;
                }
                captured++;
                text = next_text;
            }
            up1(s);
            if (captured < lo) {
                cap_load(s, cs);
                return NULL;
            }
            return text;
        }

        /* Capturing rules */

        case RULE_GETTAG: {
            uint32_t search = rule[1];
            uint32_t tag = rule[2];
            for (int32_t i = s->tags->count - 1; i >= 0; i--) {
                if (s->tags->data[i] == search) {
                    pushcap(s, s->tagged_captures->data[i], tag);
                    return text;
                }
            }
            return NULL;
        }

        case RULE_POSITION: {
            pushcap(s, janet_wrap_number((double)(text - s->text_start)), rule[1]);
            return text;
        }

        case RULE_LINE: {
            LineCol lc = get_linecol_from_position(s, (int32_t)(text - s->text_start));
            pushcap(s, janet_wrap_number((double)(lc.line)), rule[1]);
            return text;
        }

        case RULE_COLUMN: {
            LineCol lc = get_linecol_from_position(s, (int32_t)(text - s->text_start));
            pushcap(s, janet_wrap_number((double)(lc.col)), rule[1]);
            return text;
        }

        case RULE_ARGUMENT: {
            int32_t index = ((int32_t *)rule)[1];
            Janet capture = (index >= s->extrac) ? janet_wrap_nil() : s->extrav[index];
            pushcap(s, capture, rule[2]);
            return text;
        }

        case RULE_CONSTANT: {
            pushcap(s, s->constants[rule[1]], rule[2]);
            return text;
        }

        case RULE_CAPTURE: {
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
            up1(s);
            if (!result) return NULL;
            /* Specialized pushcap - avoid intermediate string creation */
            if (!s->has_backref && s->mode == PEG_MODE_ACCUMULATE) {
                janet_buffer_push_bytes(s->scratch, text, (int32_t)(result - text));
            } else {
                uint32_t tag = rule[2];
                pushcap(s, janet_stringv(text, (int32_t)(result - text)), tag);
            }
            return result;
        }

        case RULE_CAPTURE_NUM: {
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
            up1(s);
            if (!result) return NULL;
            /* check number parsing */
            double x = 0.0;
            int32_t base = (int32_t) rule[2];
            if (janet_scan_number_base(text, (int32_t)(result - text), base, &x)) return NULL;
            /* Specialized pushcap - avoid intermediate string creation */
            if (!s->has_backref && s->mode == PEG_MODE_ACCUMULATE) {
                janet_buffer_push_bytes(s->scratch, text, (int32_t)(result - text));
            } else {
                uint32_t tag = rule[3];
                pushcap(s, janet_wrap_number(x), tag);
            }
            return result;
        }

        case RULE_ACCUMULATE: {
            uint32_t tag = rule[2];
            int oldmode = s->mode;
            if (!tag && oldmode == PEG_MODE_ACCUMULATE) {
                rule = s->bytecode + rule[1];
                goto tail;
            }
            CapState cs = cap_save(s);
            s->mode = PEG_MODE_ACCUMULATE;
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
            up1(s);
            s->mode = oldmode;
            if (!result) return NULL;
            Janet cap = janet_stringv(s->scratch->data + cs.scratch,
                                      s->scratch->count - cs.scratch);
            cap_load_keept(s, cs);
            pushcap(s, cap, tag);
            return result;
        }

        case RULE_DROP: {
            CapState cs = cap_save(s);
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
            up1(s);
            if (!result) return NULL;
            cap_load(s, cs);
            return result;
        }

        case RULE_ONLY_TAGS: {
            CapState cs = cap_save(s);
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
            up1(s);
            if (!result) return NULL;
            cap_load_keept(s, cs);
            return result;
        }

        case RULE_GROUP: {
            uint32_t tag = rule[2];
            int oldmode = s->mode;
            CapState cs = cap_save(s);
            s->mode = PEG_MODE_NORMAL;
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
            up1(s);
            s->mode = oldmode;
            if (!result) return NULL;
            int32_t num_sub_captures = s->captures->count - cs.cap;
            JanetArray *sub_captures = janet_array(num_sub_captures);
            safe_memcpy(sub_captures->data,
                        s->captures->data + cs.cap,
                        sizeof(Janet) * num_sub_captures);
            sub_captures->count = num_sub_captures;
            cap_load_keept(s, cs);
            pushcap(s, janet_wrap_array(sub_captures), tag);
            return result;
        }

        case RULE_NTH: {
            uint32_t nth = rule[1];
            if (nth > INT32_MAX) nth = INT32_MAX;
            uint32_t tag = rule[3];
            int oldmode = s->mode;
            CapState cs = cap_save(s);
            s->mode = PEG_MODE_NORMAL;
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[2], text);
            up1(s);
            s->mode = oldmode;
            if (!result) return NULL;
            int32_t num_sub_captures = s->captures->count - cs.cap;
            Janet cap;
            if (num_sub_captures > (int32_t) nth) {
                cap = s->captures->data[cs.cap + nth];
            } else {
                return NULL;
            }
            cap_load_keept(s, cs);
            pushcap(s, cap, tag);
            return result;
        }

        case RULE_SUB: {
            const uint8_t *text_start = text;
            const uint32_t *rule_window = s->bytecode + rule[1];
            const uint32_t *rule_subpattern = s->bytecode + rule[2];
            down1(s);
            const uint8_t *window_end = peg_rule(s, rule_window, text);
            up1(s);
            if (!window_end) {
                return NULL;
            }
            const uint8_t *saved_end = s->text_end;
            s->text_end = window_end;
            down1(s);
            const uint8_t *next_text = peg_rule(s, rule_subpattern, text_start);
            up1(s);
            s->text_end = saved_end;

            if (!next_text) {
                return NULL;
            }

             return window_end;
         }
 
+        case RULE_TIL: {
+            const uint32_t *rule_terminus = s->bytecode + rule[1];
+            const uint32_t *rule_subpattern = s->bytecode + rule[2];
+
+            const uint8_t *terminus_start = text;
+            const uint8_t *terminus_end = NULL;
+            down1(s);
+            while (terminus_start <= s->text_end) {
+                CapState cs2 = cap_save(s);
+                terminus_end = peg_rule(s, rule_terminus, terminus_start);
+                cap_load(s, cs2);
+                if (terminus_end) {
+                    break;
+                }
+                terminus_start++;
+            }
+            up1(s);
+
+            if (!terminus_end) {
+              return NULL;
+            }
+
+            const uint8_t *saved_end = s->text_end;
+            s->text_end = terminus_start;
+            down1(s);
+            const uint8_t *matched = peg_rule(s, rule_subpattern, text);
+            up1(s);
+            s->text_end = saved_end;
+
+            if (!matched) {
+                return NULL;
+            }
+
+            return terminus_end;
+        }
+
         case RULE_SPLIT: {
             const uint8_t *saved_end = s->text_end;
             const uint32_t *rule_separator = s->bytecode + rule[1];
            const uint32_t *rule_subpattern = s->bytecode + rule[2];

            const uint8_t *separator_end = NULL;
            do {
                const uint8_t *text_start = text;
                CapState cs = cap_save(s);
                down1(s);
                while (text <= s->text_end) {
                    separator_end = peg_rule(s, rule_separator, text);
                    cap_load(s, cs);
                    if (separator_end) {
                        break;
                    }
                    text++;
                }
                up1(s);

                if (separator_end) {
                    s->text_end = text;
                    text = separator_end;
                }

                down1(s);
                const uint8_t *subpattern_end = peg_rule(s, rule_subpattern, text_start);
                up1(s);
                s->text_end = saved_end;

                if (!subpattern_end) {
                    return NULL;
                }
            } while (separator_end);

            return s->text_end;
        }

        case RULE_REPLACE:
        case RULE_MATCHTIME: {
            uint32_t tag = rule[3];
            int oldmode = s->mode;
            CapState cs = cap_save(s);
            s->mode = PEG_MODE_NORMAL;
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
            up1(s);
            s->mode = oldmode;
            if (!result) return NULL;

            Janet cap = janet_wrap_nil();
            Janet constant = s->constants[rule[2]];
            switch (janet_type(constant)) {
                default:
                    cap = constant;
                    break;
                case JANET_STRUCT:
                    if (s->captures->count) {
                        cap = janet_struct_get(janet_unwrap_struct(constant),
                                               s->captures->data[s->captures->count - 1]);
                    }
                    break;
                case JANET_TABLE:
                    if (s->captures->count) {
                        cap = janet_table_get(janet_unwrap_table(constant),
                                              s->captures->data[s->captures->count - 1]);
                    }
                    break;
                case JANET_CFUNCTION:
                    cap = janet_unwrap_cfunction(constant)(s->captures->count - cs.cap,
                                                           s->captures->data + cs.cap);
                    break;
                case JANET_FUNCTION:
                    cap = janet_call(janet_unwrap_function(constant),
                                     s->captures->count - cs.cap,
                                     s->captures->data + cs.cap);
                    break;
            }
            cap_load_keept(s, cs);
            if (rule[0] == RULE_MATCHTIME && !janet_truthy(cap)) return NULL;
            pushcap(s, cap, tag);
            return result;
        }

        case RULE_ERROR: {
            int oldmode = s->mode;
            s->mode = PEG_MODE_NORMAL;
            int32_t old_cap = s->captures->count;
            down1(s);
            const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
            up1(s);
            s->mode = oldmode;
            if (!result) return NULL;
            if (s->captures->count > old_cap) {
                /* Throw last capture */
                janet_panicv(s->captures->data[s->captures->count - 1]);
            } else {
                /* Throw generic error */
                int32_t start = (int32_t)(text - s->text_start);
                LineCol lc = get_linecol_from_position(s, start);
                janet_panicf("match error at line %d, column %d", lc.line, lc.col);
            }
            return NULL;
        }

        case RULE_BACKMATCH: {
            uint32_t search = rule[1];
            for (int32_t i = s->tags->count - 1; i >= 0; i--) {
                if (s->tags->data[i] == search) {
                    Janet capture = s->tagged_captures->data[i];
                    if (!janet_checktype(capture, JANET_STRING))
                        return NULL;
                    const uint8_t *bytes = janet_unwrap_string(capture);
                    int32_t len = janet_string_length(bytes);
                    if (text + len > s->text_end)
                        return NULL;
                    return memcmp(text, bytes, len) ? NULL : text + len;
                }
            }
            return NULL;
        }

        case RULE_LENPREFIX: {
            int oldmode = s->mode;
            s->mode = PEG_MODE_NORMAL;
            const uint8_t *next_text;
            CapState cs = cap_save(s);
            down1(s);
            next_text = peg_rule(s, s->bytecode + rule[1], text);
            up1(s);
            if (NULL == next_text) return NULL;
            s->mode = oldmode;
            int32_t num_sub_captures = s->captures->count - cs.cap;
            Janet lencap;
            if (num_sub_captures <= 0 ||
                    (lencap = s->captures->data[cs.cap], !janet_checkint(lencap))) {
                cap_load(s, cs);
                return NULL;
            }
            int32_t nrep = janet_unwrap_integer(lencap);
            /* drop captures from len pattern */
            cap_load(s, cs);
            for (int32_t i = 0; i < nrep; i++) {
                down1(s);
                next_text = peg_rule(s, s->bytecode + rule[2], next_text);
                up1(s);
                if (NULL == next_text) {
                    cap_load(s, cs);
                    return NULL;
                }
            }
            return next_text;
        }

        case RULE_READINT: {
            uint32_t tag = rule[2];
            uint32_t signedness = rule[1] & 0x10;
            uint32_t endianness = rule[1] & 0x20;
            int width = (int)(rule[1] & 0xF);
            if (text + width > s->text_end) return NULL;
            uint64_t accum = 0;
            if (endianness) {
                /* BE */
                for (int i = 0; i < width; i++) accum = (accum << 8) | text[i];
            } else {
                /* LE */
                for (int i = width - 1; i >= 0; i--) accum = (accum << 8) | text[i];
            }

            Janet capture_value;
            /* We can only parse integeres of greater than 6 bytes reliable if int-types are enabled.
             * Otherwise, we may lose precision, so 6 is the maximum size when int-types are disabled. */
 #ifdef JANET_INT_TYPES
            if (width > 6) {
                if (signedness) {
                    capture_value = janet_wrap_s64(peg_convert_u64_s64(accum, width));
                } else {
                    capture_value = janet_wrap_u64(accum);
                }
            } else
 #endif
            {
                double double_value;
                if (signedness) {
                    double_value = (double)(peg_convert_u64_s64(accum, width));
                } else {
                    double_value = (double)accum;
@@ -1227,6 +1263,14 @@
     emit_2(r, RULE_SUB, subrule1, subrule2);
 }
 
+static void spec_til(Builder *b, int32_t argc, const Janet *argv) { /* compiles the (til terminus subpattern) special form */
+    peg_fixarity(b, argc, 2); /* presumably rejects any argument count other than 2 -- helper not shown here */
+    Reserve r = reserve(b, 3); /* 3 words; presumably the opcode plus its two rule operands -- TODO confirm */
+    uint32_t subrule1 = peg_compile1(b, argv[0]); /* first argument: read back by RULE_TIL as rule[1], the terminus */
+    uint32_t subrule2 = peg_compile1(b, argv[1]); /* second argument: read back by RULE_TIL as rule[2], the sub-pattern */
+    emit_2(r, RULE_TIL, subrule1, subrule2);
+}
+
 static void spec_split(Builder *b, int32_t argc, const Janet *argv) {
     peg_fixarity(b, argc, 2);
     Reserve r = reserve(b, 3);
@@ -1323,6 +1367,7 @@
     {"split", spec_split},
     {"sub", spec_sub},
     {"thru", spec_thru},
+    {"til", spec_til},
     {"to", spec_to},
     {"uint", spec_uint_le},
     {"uint-be", spec_uint_be},
@@ -1657,6 +1702,7 @@
                 i += 4;
                 break;
             case RULE_SUB:
+            case RULE_TIL:
             case RULE_SPLIT:
                 /* [rule, rule] */
                 if (rule[1] >= blen) goto bad;

diff --git a/src/include/janet.h b/src/include/janet.h
@@ -2180,6 +2180,7 @@ typedef enum {
     RULE_UNREF,        /* [rule, tag] */
     RULE_CAPTURE_NUM,  /* [rule, tag] */
     RULE_SUB,          /* [rule, rule] */
+    RULE_TIL,          /* [rule, rule] */
     RULE_SPLIT,        /* [rule, rule] */
     RULE_NTH,          /* [nth, rule, tag] */
     RULE_ONLY_TAGS,    /* [rule] */

diff --git a/test/suite-peg.janet b/test/suite-peg.janet
@@ -713,6 +713,41 @@
   "abcdef"
   @[])
 
+(test "til: basic matching"
+  ~(til "d" "abc")  # (til terminus subpattern): the first argument is searched for, the second is matched on the text before it
+  "abcdef"
+  @[])
+
+(test "til: second pattern can't see past the first occurrence of first pattern"
+  ~(til "d" (* "abc" -1))  # -1 presumably succeeds only because the sub-pattern's input ends where "d" begins
+  "abcdef"
+  @[])
+
+(test "til: fails if first pattern fails"
+  ~(til "x" "abc")
+  "abcdef"
+  nil)  # nil is presumably the harness's "no match" expectation -- the test helper is defined elsewhere
+
+(test "til: fails if second pattern fails"
+  ~(til "abc" "x")
+  "abcdef"
+  nil)
+
+(test "til: discards captures from initial pattern"
+  ~(til '"d" '"abc")  # the capture around "d" is dropped; only the sub-pattern's "abc" capture is expected
+  "abcdef"
+  @["abc"])
+
+(test "til: positions inside second match are still relative to the entire input"
+  ~(* "one\ntw" (til 0 (* ($) (line) (column))))  # terminus 0 presumably matches at once, so the sub-pattern sees an empty window
+  "one\ntwo\nthree\n"
+  @[6 2 3])  # expected captures: offset 6, line 2, column 3
+
+(test "til: advances to the end of the first pattern's first occurrence"
+  ~(* (til "d" "ab") "e")  # "e" must match right after the "d" terminus, even though "ab" leaves "c" unconsumed
+  "abcdef"
+  @[])
+
 (test "split: basic functionality"
   ~(split "," '1)
   "a,b,c"