Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add (til) PEG special #1528

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions src/core/peg.c
Original file line number Diff line number Diff line change
Expand Up @@ -183,551 +183,587 @@
const uint32_t *rule,
const uint8_t *text) {
tail:
switch (*rule) {
default:
janet_panic("unexpected opcode");
return NULL;

case RULE_LITERAL: {
uint32_t len = rule[1];
if (text + len > s->text_end) return NULL;
return memcmp(text, rule + 2, len) ? NULL : text + len;
}

case RULE_NCHAR: {
uint32_t n = rule[1];
return (text + n > s->text_end) ? NULL : text + n;
}

case RULE_NOTNCHAR: {
uint32_t n = rule[1];
return (text + n > s->text_end) ? text : NULL;
}

case RULE_RANGE: {
uint8_t lo = rule[1] & 0xFF;
uint8_t hi = (rule[1] >> 16) & 0xFF;
return (text < s->text_end &&
text[0] >= lo &&
text[0] <= hi)
? text + 1
: NULL;
}

case RULE_SET: {
if (text >= s->text_end) return NULL;
uint32_t word = rule[1 + (text[0] >> 5)];
uint32_t mask = (uint32_t)1 << (text[0] & 0x1F);
return (word & mask)
? text + 1
: NULL;
}

case RULE_LOOK: {
text += ((int32_t *)rule)[1];
if (text < s->text_start || text > s->text_end) return NULL;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[2], text);
up1(s);
text -= ((int32_t *)rule)[1];
return result ? text : NULL;
}

case RULE_CHOICE: {
uint32_t len = rule[1];
const uint32_t *args = rule + 2;
if (len == 0) return NULL;
down1(s);
CapState cs = cap_save(s);
for (uint32_t i = 0; i < len - 1; i++) {
const uint8_t *result = peg_rule(s, s->bytecode + args[i], text);
if (result) {
up1(s);
return result;
}
cap_load(s, cs);
}
up1(s);
rule = s->bytecode + args[len - 1];
goto tail;
}

case RULE_SEQUENCE: {
uint32_t len = rule[1];
const uint32_t *args = rule + 2;
if (len == 0) return text;
down1(s);
for (uint32_t i = 0; text && i < len - 1; i++)
text = peg_rule(s, s->bytecode + args[i], text);
up1(s);
if (!text) return NULL;
rule = s->bytecode + args[len - 1];
goto tail;
}

case RULE_IF: {
const uint32_t *rule_a = s->bytecode + rule[1];
const uint32_t *rule_b = s->bytecode + rule[2];
down1(s);
const uint8_t *result = peg_rule(s, rule_a, text);
up1(s);
if (!result) return NULL;
rule = rule_b;
goto tail;
}
case RULE_IFNOT: {
const uint32_t *rule_a = s->bytecode + rule[1];
const uint32_t *rule_b = s->bytecode + rule[2];
down1(s);
CapState cs = cap_save(s);
const uint8_t *result = peg_rule(s, rule_a, text);
if (!!result) {
up1(s);
return NULL;
} else {
cap_load(s, cs);
up1(s);
rule = rule_b;
goto tail;
}
}

case RULE_NOT: {
const uint32_t *rule_a = s->bytecode + rule[1];
down1(s);
CapState cs = cap_save(s);
const uint8_t *result = peg_rule(s, rule_a, text);
if (result) {
up1(s);
return NULL;
} else {
cap_load(s, cs);
up1(s);
return text;
}
}

case RULE_THRU:
case RULE_TO: {
const uint32_t *rule_a = s->bytecode + rule[1];
const uint8_t *next_text = NULL;
CapState cs = cap_save(s);
down1(s);
while (text <= s->text_end) {
CapState cs2 = cap_save(s);
next_text = peg_rule(s, rule_a, text);
if (next_text) {
if (rule[0] == RULE_TO) cap_load(s, cs2);
break;
}
cap_load(s, cs2);
text++;
}
up1(s);
if (text > s->text_end) {
cap_load(s, cs);
return NULL;
}
return rule[0] == RULE_TO ? text : next_text;
}

case RULE_BETWEEN: {
uint32_t lo = rule[1];
uint32_t hi = rule[2];
const uint32_t *rule_a = s->bytecode + rule[3];
uint32_t captured = 0;
const uint8_t *next_text;
CapState cs = cap_save(s);
down1(s);
while (captured < hi) {
CapState cs2 = cap_save(s);
next_text = peg_rule(s, rule_a, text);
if (!next_text || next_text == text) {
cap_load(s, cs2);
break;
}
captured++;
text = next_text;
}
up1(s);
if (captured < lo) {
cap_load(s, cs);
return NULL;
}
return text;
}

/* Capturing rules */

case RULE_GETTAG: {
uint32_t search = rule[1];
uint32_t tag = rule[2];
for (int32_t i = s->tags->count - 1; i >= 0; i--) {
if (s->tags->data[i] == search) {
pushcap(s, s->tagged_captures->data[i], tag);
return text;
}
}
return NULL;
}

case RULE_POSITION: {
pushcap(s, janet_wrap_number((double)(text - s->text_start)), rule[1]);
return text;
}

case RULE_LINE: {
LineCol lc = get_linecol_from_position(s, (int32_t)(text - s->text_start));
pushcap(s, janet_wrap_number((double)(lc.line)), rule[1]);
return text;
}

case RULE_COLUMN: {
LineCol lc = get_linecol_from_position(s, (int32_t)(text - s->text_start));
pushcap(s, janet_wrap_number((double)(lc.col)), rule[1]);
return text;
}

case RULE_ARGUMENT: {
int32_t index = ((int32_t *)rule)[1];
Janet capture = (index >= s->extrac) ? janet_wrap_nil() : s->extrav[index];
pushcap(s, capture, rule[2]);
return text;
}

case RULE_CONSTANT: {
pushcap(s, s->constants[rule[1]], rule[2]);
return text;
}

case RULE_CAPTURE: {
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
if (!result) return NULL;
/* Specialized pushcap - avoid intermediate string creation */
if (!s->has_backref && s->mode == PEG_MODE_ACCUMULATE) {
janet_buffer_push_bytes(s->scratch, text, (int32_t)(result - text));
} else {
uint32_t tag = rule[2];
pushcap(s, janet_stringv(text, (int32_t)(result - text)), tag);
}
return result;
}

case RULE_CAPTURE_NUM: {
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
if (!result) return NULL;
/* check number parsing */
double x = 0.0;
int32_t base = (int32_t) rule[2];
if (janet_scan_number_base(text, (int32_t)(result - text), base, &x)) return NULL;
/* Specialized pushcap - avoid intermediate string creation */
if (!s->has_backref && s->mode == PEG_MODE_ACCUMULATE) {
janet_buffer_push_bytes(s->scratch, text, (int32_t)(result - text));
} else {
uint32_t tag = rule[3];
pushcap(s, janet_wrap_number(x), tag);
}
return result;
}

case RULE_ACCUMULATE: {
uint32_t tag = rule[2];
int oldmode = s->mode;
if (!tag && oldmode == PEG_MODE_ACCUMULATE) {
rule = s->bytecode + rule[1];
goto tail;
}
CapState cs = cap_save(s);
s->mode = PEG_MODE_ACCUMULATE;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
Janet cap = janet_stringv(s->scratch->data + cs.scratch,
s->scratch->count - cs.scratch);
cap_load_keept(s, cs);
pushcap(s, cap, tag);
return result;
}

case RULE_DROP: {
CapState cs = cap_save(s);
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
if (!result) return NULL;
cap_load(s, cs);
return result;
}

case RULE_ONLY_TAGS: {
CapState cs = cap_save(s);
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
if (!result) return NULL;
cap_load_keept(s, cs);
return result;
}

case RULE_GROUP: {
uint32_t tag = rule[2];
int oldmode = s->mode;
CapState cs = cap_save(s);
s->mode = PEG_MODE_NORMAL;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
int32_t num_sub_captures = s->captures->count - cs.cap;
JanetArray *sub_captures = janet_array(num_sub_captures);
safe_memcpy(sub_captures->data,
s->captures->data + cs.cap,
sizeof(Janet) * num_sub_captures);
sub_captures->count = num_sub_captures;
cap_load_keept(s, cs);
pushcap(s, janet_wrap_array(sub_captures), tag);
return result;
}

case RULE_NTH: {
uint32_t nth = rule[1];
if (nth > INT32_MAX) nth = INT32_MAX;
uint32_t tag = rule[3];
int oldmode = s->mode;
CapState cs = cap_save(s);
s->mode = PEG_MODE_NORMAL;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[2], text);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
int32_t num_sub_captures = s->captures->count - cs.cap;
Janet cap;
if (num_sub_captures > (int32_t) nth) {
cap = s->captures->data[cs.cap + nth];
} else {
return NULL;
}
cap_load_keept(s, cs);
pushcap(s, cap, tag);
return result;
}

case RULE_SUB: {
const uint8_t *text_start = text;
const uint32_t *rule_window = s->bytecode + rule[1];
const uint32_t *rule_subpattern = s->bytecode + rule[2];
down1(s);
const uint8_t *window_end = peg_rule(s, rule_window, text);
up1(s);
if (!window_end) {
return NULL;
}
const uint8_t *saved_end = s->text_end;
s->text_end = window_end;
down1(s);
const uint8_t *next_text = peg_rule(s, rule_subpattern, text_start);
up1(s);
s->text_end = saved_end;

if (!next_text) {
return NULL;
}

return window_end;
}

case RULE_TIL: {
const uint32_t *rule_terminus = s->bytecode + rule[1];
const uint32_t *rule_subpattern = s->bytecode + rule[2];

const uint8_t *terminus_start = text;
const uint8_t *terminus_end = NULL;
down1(s);
while (terminus_start <= s->text_end) {
CapState cs2 = cap_save(s);
terminus_end = peg_rule(s, rule_terminus, terminus_start);
cap_load(s, cs2);
if (terminus_end) {
break;
}
terminus_start++;
}
up1(s);

if (!terminus_end) {
return NULL;
}

const uint8_t *saved_end = s->text_end;
s->text_end = terminus_start;
down1(s);
const uint8_t *matched = peg_rule(s, rule_subpattern, text);
up1(s);
s->text_end = saved_end;

if (!matched) {
return NULL;
}

return terminus_end;
}

case RULE_SPLIT: {
const uint8_t *saved_end = s->text_end;
const uint32_t *rule_separator = s->bytecode + rule[1];
const uint32_t *rule_subpattern = s->bytecode + rule[2];

const uint8_t *separator_end = NULL;
do {
const uint8_t *text_start = text;
CapState cs = cap_save(s);
down1(s);
while (text <= s->text_end) {
separator_end = peg_rule(s, rule_separator, text);
cap_load(s, cs);
if (separator_end) {
break;
}
text++;
}
up1(s);

if (separator_end) {
s->text_end = text;
text = separator_end;
}

down1(s);
const uint8_t *subpattern_end = peg_rule(s, rule_subpattern, text_start);
up1(s);
s->text_end = saved_end;

if (!subpattern_end) {
return NULL;
}
} while (separator_end);

return s->text_end;
}

case RULE_REPLACE:
case RULE_MATCHTIME: {
uint32_t tag = rule[3];
int oldmode = s->mode;
CapState cs = cap_save(s);
s->mode = PEG_MODE_NORMAL;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
s->mode = oldmode;
if (!result) return NULL;

Janet cap = janet_wrap_nil();
Janet constant = s->constants[rule[2]];
switch (janet_type(constant)) {
default:
cap = constant;
break;
case JANET_STRUCT:
if (s->captures->count) {
cap = janet_struct_get(janet_unwrap_struct(constant),
s->captures->data[s->captures->count - 1]);
}
break;
case JANET_TABLE:
if (s->captures->count) {
cap = janet_table_get(janet_unwrap_table(constant),
s->captures->data[s->captures->count - 1]);
}
break;
case JANET_CFUNCTION:
cap = janet_unwrap_cfunction(constant)(s->captures->count - cs.cap,
s->captures->data + cs.cap);
break;
case JANET_FUNCTION:
cap = janet_call(janet_unwrap_function(constant),
s->captures->count - cs.cap,
s->captures->data + cs.cap);
break;
}
cap_load_keept(s, cs);
if (rule[0] == RULE_MATCHTIME && !janet_truthy(cap)) return NULL;
pushcap(s, cap, tag);
return result;
}

case RULE_ERROR: {
int oldmode = s->mode;
s->mode = PEG_MODE_NORMAL;
int32_t old_cap = s->captures->count;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
if (s->captures->count > old_cap) {
/* Throw last capture */
janet_panicv(s->captures->data[s->captures->count - 1]);
} else {
/* Throw generic error */
int32_t start = (int32_t)(text - s->text_start);
LineCol lc = get_linecol_from_position(s, start);
janet_panicf("match error at line %d, column %d", lc.line, lc.col);
}
return NULL;
}

case RULE_BACKMATCH: {
uint32_t search = rule[1];
for (int32_t i = s->tags->count - 1; i >= 0; i--) {
if (s->tags->data[i] == search) {
Janet capture = s->tagged_captures->data[i];
if (!janet_checktype(capture, JANET_STRING))
return NULL;
const uint8_t *bytes = janet_unwrap_string(capture);
int32_t len = janet_string_length(bytes);
if (text + len > s->text_end)
return NULL;
return memcmp(text, bytes, len) ? NULL : text + len;
}
}
return NULL;
}

case RULE_LENPREFIX: {
int oldmode = s->mode;
s->mode = PEG_MODE_NORMAL;
const uint8_t *next_text;
CapState cs = cap_save(s);
down1(s);
next_text = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
if (NULL == next_text) return NULL;
s->mode = oldmode;
int32_t num_sub_captures = s->captures->count - cs.cap;
Janet lencap;
if (num_sub_captures <= 0 ||
(lencap = s->captures->data[cs.cap], !janet_checkint(lencap))) {
cap_load(s, cs);
return NULL;
}
int32_t nrep = janet_unwrap_integer(lencap);
/* drop captures from len pattern */
cap_load(s, cs);
for (int32_t i = 0; i < nrep; i++) {
down1(s);
next_text = peg_rule(s, s->bytecode + rule[2], next_text);
up1(s);
if (NULL == next_text) {
cap_load(s, cs);
return NULL;
}
}
return next_text;
}

case RULE_READINT: {
uint32_t tag = rule[2];
uint32_t signedness = rule[1] & 0x10;
uint32_t endianness = rule[1] & 0x20;
int width = (int)(rule[1] & 0xF);
if (text + width > s->text_end) return NULL;
uint64_t accum = 0;
if (endianness) {
/* BE */
for (int i = 0; i < width; i++) accum = (accum << 8) | text[i];
} else {
/* LE */
for (int i = width - 1; i >= 0; i--) accum = (accum << 8) | text[i];
}

Janet capture_value;
/* We can only parse integeres of greater than 6 bytes reliable if int-types are enabled.
* Otherwise, we may lose precision, so 6 is the maximum size when int-types are disabled. */
#ifdef JANET_INT_TYPES
if (width > 6) {
if (signedness) {
capture_value = janet_wrap_s64(peg_convert_u64_s64(accum, width));
} else {
capture_value = janet_wrap_u64(accum);
}
} else
#endif
{
double double_value;
if (signedness) {

Check notice

Code scanning / CodeQL

Long switch case Note

Switch has at least one case that is too long:
RULE_SPLIT (40 lines)
.
Switch has at least one case that is too long:
RULE_MATCHTIME (44 lines)
.
Switch has at least one case that is too long:
RULE_LENPREFIX (31 lines)
.
Switch has at least one case that is too long:
RULE_READINT (40 lines)
.
double_value = (double)(peg_convert_u64_s64(accum, width));
} else {
double_value = (double)accum;
Expand Down Expand Up @@ -1227,6 +1263,14 @@
emit_2(r, RULE_SUB, subrule1, subrule2);
}

static void spec_til(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 2);
Reserve r = reserve(b, 3);
uint32_t subrule1 = peg_compile1(b, argv[0]);
uint32_t subrule2 = peg_compile1(b, argv[1]);
emit_2(r, RULE_TIL, subrule1, subrule2);
}

static void spec_split(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 2);
Reserve r = reserve(b, 3);
Expand Down Expand Up @@ -1323,6 +1367,7 @@
{"split", spec_split},
{"sub", spec_sub},
{"thru", spec_thru},
{"til", spec_til},
{"to", spec_to},
{"uint", spec_uint_le},
{"uint-be", spec_uint_be},
Expand Down Expand Up @@ -1657,6 +1702,7 @@
i += 4;
break;
case RULE_SUB:
case RULE_TIL:
case RULE_SPLIT:
/* [rule, rule] */
if (rule[1] >= blen) goto bad;
Expand Down
1 change: 1 addition & 0 deletions src/include/janet.h
Original file line number Diff line number Diff line change
Expand Up @@ -2180,6 +2180,7 @@ typedef enum {
RULE_UNREF, /* [rule, tag] */
RULE_CAPTURE_NUM, /* [rule, tag] */
RULE_SUB, /* [rule, rule] */
RULE_TIL, /* [rule, rule] */
RULE_SPLIT, /* [rule, rule] */
RULE_NTH, /* [nth, rule, tag] */
RULE_ONLY_TAGS, /* [rule] */
Expand Down
35 changes: 35 additions & 0 deletions test/suite-peg.janet
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,41 @@
"abcdef"
@[])

(test "til: basic matching"
~(til "d" "abc")
"abcdef"
@[])

(test "til: second pattern can't see past the first occurrence of first pattern"
~(til "d" (* "abc" -1))
"abcdef"
@[])

(test "til: fails if first pattern fails"
~(til "x" "abc")
"abcdef"
nil)

(test "til: fails if second pattern fails"
~(til "abc" "x")
"abcdef"
nil)

(test "til: discards captures from initial pattern"
~(til '"d" '"abc")
"abcdef"
@["abc"])

(test "til: positions inside second match are still relative to the entire input"
~(* "one\ntw" (til 0 (* ($) (line) (column))))
"one\ntwo\nthree\n"
@[6 2 3])

(test "til: advances to the end of the first pattern's first occurrence"
~(* (til "d" "ab") "e")
"abcdef"
@[])

(test "split: basic functionality"
~(split "," '1)
"a,b,c"
Expand Down
Loading