Skip to content

Enable storing multiple compiled patterns at once #25

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 37 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
013a262
Merge pull request #9 from mrigger/out-of-bounds-fixes
kokke Dec 11, 2017
307589e
Update re.c
kokke Dec 11, 2017
a3b5c6a
Update test1.c
kokke Dec 11, 2017
5e73adc
Update re.c
kokke Dec 11, 2017
1f6af93
Update README.md
kokke Dec 11, 2017
350763d
Update README.md
kokke Mar 23, 2018
911f8dc
Update test1.c
kokke Mar 23, 2018
d087143
Update re.c
kokke Mar 23, 2018
309a1f3
Update README.md
kokke Mar 23, 2018
ff7f6e1
Update Makefile
kokke Mar 23, 2018
e01ec35
Update README.md
kokke Mar 23, 2018
b38f74b
Create test_rand_neg.c
kokke Mar 23, 2018
4017b4d
Create regex_test_neg.py
kokke Mar 23, 2018
f7c86d3
Update Makefile
kokke Mar 23, 2018
2dfb463
Update README.md
kokke Mar 23, 2018
ef4bbf8
Check for correct python2 binary in Makefile
roflcopter4 Apr 16, 2018
99e57a3
Add back '@' signs I accidentally removed
roflcopter4 Apr 16, 2018
96aa599
Fix dumb typos
roflcopter4 Apr 16, 2018
8986a1a
Merge pull request #14 from roflcopter4/master
kokke Apr 17, 2018
72e0e56
Fix pattern ".?" issues
TermoSINteZ May 14, 2018
d54114d
Remove tabs
TermoSINteZ May 15, 2018
ca783fb
Merge pull request #16 from TermoSINteZ/master
kokke May 15, 2018
bfa621e
Update re.c
monolifed May 30, 2018
6d78631
Merge pull request #17 from monolifed/patch-1
kokke Jun 6, 2018
f2674ed
Update test1.c
kokke Jun 6, 2018
5abffeb
Update re.c
kokke Oct 22, 2018
5f2af04
Update test1.c
kokke Oct 22, 2018
62f6d14
Update README.md
kokke Oct 23, 2018
16763e1
Update README.md
kokke Oct 23, 2018
3103102
Update re.c
monolifed Oct 25, 2018
446e3ef
Merge pull request #22 from monolifed/master
kokke Oct 26, 2018
f05c037
Storing multiple compiled patterns at once is now possible.
Dec 6, 2018
738160f
Remove unnecessary test in re_match.
Dec 7, 2018
b360169
Group together fail branches in re_compile.
Dec 7, 2018
9d7e190
Add comment describing FAIL sentinel value.
Dec 7, 2018
3abaa00
Remove BRANCH and add FAIL to re_print char types.
Dec 7, 2018
cdc3cce
Update comment describing re_compile function.
Dec 7, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 74 additions & 31 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,23 @@ CC := gcc
# Number of random text expressions to generate, for random testing
NRAND_TESTS := 1000

PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \
echo 'python'; \
elif command -v python2 >/dev/null 2>&1; then \
echo 'python2'; \
else \
echo 'Error: no compatible python version found.' >&2; \
exit 1; \
fi

# Flags to pass to compiler
CFLAGS := -Os -Wall -Wextra -std=c99 -I.
CFLAGS := -O3 -Wall -Wextra -std=c99 -I.

all:
@$(CC) $(CFLAGS) re.c tests/test1.c -o tests/test1
@$(CC) $(CFLAGS) re.c tests/test2.c -o tests/test2
@$(CC) $(CFLAGS) re.c tests/test_rand.c -o tests/test_rand
@$(CC) $(CFLAGS) re.c tests/test_rand_neg.c -o tests/test_rand_neg

clean:
@rm -f tests/test1 tests/test2 tests/test_rand
Expand All @@ -21,43 +30,77 @@ clean:


test: all
@$(test $(PYTHON))
@echo
@echo Testing hand-picked regex\'s:
@./tests/test1
@echo Testing patterns against $(NRAND_TESTS) random strings matching the Python implementation and comparing:
@echo
@python ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS)
@python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS)
@python ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS)
@#python ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS)
@#python ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS)
@#python ./scripts/regex_test.py [^\\w] $(NRAND_TESTS)
@#python ./scripts/regex_test.py [^1-4] $(NRAND_TESTS)
@#python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS)
@#python ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS)
@python ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS)
@python ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? $(NRAND_TESTS)
@#python ./scripts/regex_test.py [1-5-]+[-1-2]-[-] $(NRAND_TESTS)
@python ./scripts/regex_test.py [-1-3]-[-]+ $(NRAND_TESTS)
@python ./scripts/regex_test.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS)
@python ./scripts/regex_test.py [-1-2]* $(NRAND_TESTS)
@python ./scripts/regex_test.py \\s?[a-fKL098]+-? $(NRAND_TESTS)
@python ./scripts/regex_test.py [\\-]* $(NRAND_TESTS)
@python ./scripts/regex_test.py [\\\\]+ $(NRAND_TESTS)
@python ./scripts/regex_test.py [0-9a-fA-F]+ $(NRAND_TESTS)
@python ./scripts/regex_test.py [1379][2468][abcdef] $(NRAND_TESTS)
@python ./scripts/regex_test.py [012345-9]?[0123-789] $(NRAND_TESTS)
@python ./scripts/regex_test.py [012345-9] $(NRAND_TESTS)
@python ./scripts/regex_test.py [0-56789] $(NRAND_TESTS)
@python ./scripts/regex_test.py [abc-zABC-Z] $(NRAND_TESTS)
@python ./scripts/regex_test.py [a\d]?1234 $(NRAND_TESTS)
@python ./scripts/regex_test.py .*123faerdig $(NRAND_TESTS)
@python ./scripts/regex_test.py .?\\w+jsj$ $(NRAND_TESTS)
@python ./scripts/regex_test.py [?to][+to][?ta][*ta] $(NRAND_TESTS)
@#python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [^\\w] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [^1-4] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? $(NRAND_TESTS)
@#python ./scripts/regex_test.py [1-5-]+[-1-2]-[-] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [-1-3]-[-]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [-1-2]* $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py \\s?[a-fKL098]+-? $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [\\-]* $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [\\\\]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [0-9a-fA-F]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [1379][2468][abcdef] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [012345-9]?[0123-789] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [012345-9] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [0-56789] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [abc-zABC-Z] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [a\d]?1234 $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py .*123faerdig $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py .?\\w+jsj$ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [?to][+to][?ta][*ta] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py \\d+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [a-z]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py \\w $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py \\d $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [\\d] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test.py [^\\d] $(NRAND_TESTS)
@#python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS)
@echo
@echo
@echo
@echo Testing rejection of patterns against $(NRAND_TESTS) random strings also rejected by the Python implementation:
@echo
@$(PYTHON) ./scripts/regex_test_neg.py \\d+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [a-z]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py ^\\w $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py ^\\d $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [\\d] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py ^[^\\d] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [^\\w]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py ^[\\w]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py ^[^0-9] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [a-z].[A-Z] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [-1-3]-[-]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [-0-9]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [\\-]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [\\\\]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [0-9a-fA-F]+ $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [1379][2468][abcdef] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [012345-9] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py [0-56789] $(NRAND_TESTS)
@$(PYTHON) ./scripts/regex_test_neg.py .*123faerdig $(NRAND_TESTS)
@echo
@echo
@./tests/test2
@echo
@echo

20 changes: 15 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,28 @@ The main design goal of this library is to be small, correct, self contained and
```
> gcc -Os -c re.c
> size re.o
text data bss dec hex filename
2341 0 544 2885 b45 re.o
text data bss dec hex filename
2319 0 544 2863 b2f re.o

```
For ARM/Thumb using GCC 4.8.1, it's around 1.5kb of code and less RAM:
```
> arm-none-eabi-gcc -Os -mthumb -c re.c
> size re.o
text data bss dec hex filename
1418 0 280 1698 6a2 re.o

```
For 8-bit AVR using AVR-GCC 4.8.1, it's around 2kb of code and less RAM:
```
> avr-gcc -Os -c re.c
> size re.o
text data bss dec hex filename
2132 0 130 2262 8d6 re.o
text data bss dec hex filename
2128 0 130 2258 8d2 re.o
```



### API
This is the public / exported API:
```C
Expand Down Expand Up @@ -113,7 +123,7 @@ For more usage examples I encourage you to look at the code in the `tests`-folde
- Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`.
- Add `example.c` that demonstrates usage.
- Add `tests/test_perf.c` for performance and time measurements.
- Testing: add matching on purely random data, comparing with Python's `re`. Currently only matching known positives - need to verify rejection as well.
- Testing: Improve pattern rejection testing.

### FAQ
- *Q: What differentiates this library from other C regex implementations?*
Expand Down
114 changes: 67 additions & 47 deletions re.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,26 +31,13 @@

#include "re.h"
#include <stdio.h>
#include <stdlib.h>

/* Definitions: */

#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */
#define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */


enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, BRANCH };

typedef struct regex_t
{
unsigned char type; /* CHAR, STAR, etc. */
union
{
unsigned char ch; /* the character itself */
unsigned char* ccl; /* OR a pointer to characters in class */
};
} regex_t;

enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH, */ FAIL};


/* Private function declarations: */
Expand All @@ -71,39 +58,48 @@ static int ismetachar(char c);
/* Public functions: */
int re_match(const char* pattern, const char* text)
{
return re_matchp(re_compile(pattern), text);
re_t regex;

re_compile(regex, pattern);
return re_matchp(regex, text);
}

int re_matchp(re_t pattern, const char* text)
{
int idx = -1;

if (pattern[0].type == BEGIN)
{
return ((matchpattern(&pattern[1], text)) ? 0 : -1);
}
else
/* FAIL is a sentinel value indicating compilation of the pattern failed. */
if (pattern[0].type != FAIL)
{
do
if (pattern[0].type == BEGIN)
{
idx += 1;
if (matchpattern(pattern, text))
return ((matchpattern(&pattern[1], text)) ? 0 : -1);
}
else
{
int idx = -1;

do
{
return idx;
idx += 1;

if (matchpattern(pattern, text))
{
if (text[0] == '\0')
return -1;

return idx;
}
}
while (*text++ != '\0');
}
while (*text++ != '\0');

return -1;
}
return -1;
}

re_t re_compile(const char* pattern)
int re_compile(re_t re_compiled, const char* pattern)
{
/* The sizes of the two static arrays below substantiates the static RAM usage of this module.
MAX_REGEXP_OBJECTS is the max number of symbols in the expression.
MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */
static regex_t re_compiled[MAX_REGEXP_OBJECTS];
static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN];
int ccl_bufidx = 1;

Expand All @@ -124,7 +120,7 @@ re_t re_compile(const char* pattern)
case '*': { re_compiled[j].type = STAR; } break;
case '+': { re_compiled[j].type = PLUS; } break;
case '?': { re_compiled[j].type = QUESTIONMARK; } break;
case '|': { re_compiled[j].type = BRANCH; } break;
/* case '|': { re_compiled[j].type = BRANCH; } break; <-- not working properly */

/* Escaped character-classes (\s \w ...): */
case '\\':
Expand Down Expand Up @@ -183,16 +179,27 @@ re_t re_compile(const char* pattern)
while ( (pattern[++i] != ']')
&& (pattern[i] != '\0')) /* Missing ] */
{
if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) {
fputs("exceeded internal buffer!\n", stderr);
exit(-1);
if (pattern[i] == '\\')
{
if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1)
{
//fputs("exceeded internal buffer!\n", stderr);
goto fail;
}
ccl_buf[ccl_bufidx++] = pattern[i++];
}
else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
{
//fputs("exceeded internal buffer!\n", stderr);
goto fail;
}
ccl_buf[ccl_bufidx++] = pattern[i];
}
if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) {
if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
{
/* Catches cases such as [00000000000000000000000000000000000000][ */
fputs("exceeded internal buffer!\n", stderr);
exit(-1);
//fputs("exceeded internal buffer!\n", stderr);
goto fail;
}
/* Null-terminate string end */
ccl_buf[ccl_bufidx++] = 0;
Expand All @@ -212,12 +219,16 @@ re_t re_compile(const char* pattern)
/* 'UNUSED' is a sentinel used to indicate end-of-pattern */
re_compiled[j].type = UNUSED;

return (re_t) re_compiled;
return j;

fail:
re_compiled[0].type = FAIL;
return -1;
}

void re_print(regex_t* pattern)
{
const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" };
const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", /* "BRANCH", */ "FAIL"};

int i;
for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
Expand Down Expand Up @@ -279,7 +290,7 @@ static int matchrange(char c, const char* str)
}
static int ismetachar(char c)
{
return ((c == 's') || (c == 'S') == (c == 'w') || (c == 'W') || (c == 'd') || (c == 'D'));
return ((c == 's') || (c == 'S') || (c == 'w') || (c == 'W') || (c == 'd') || (c == 'D'));
}

static int matchmetachar(char c, const char* str)
Expand Down Expand Up @@ -373,6 +384,17 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text)
return 0;
}

/* Match an optional element, i.e. `p?` followed by the remaining `pattern`.
 *
 *   p       - the element the question mark applies to
 *   pattern - the compiled pattern that follows the question mark
 *   text    - the text to match against
 *
 * Returns 1 on a match and 0 otherwise. */
static int matchquestion(regex_t p, regex_t* pattern, const char* text)
{
  /* 'UNUSED' marks end-of-pattern: nothing left to match. */
  if (p.type == UNUSED)
  {
    return 1;
  }

  /* First try the "zero occurrences" case: skip p entirely. */
  if (matchpattern(pattern, text))
  {
    return 1;
  }

  /* Otherwise p must consume exactly one character before the remainder. */
  if ((text[0] != '\0') && matchone(p, text[0]))
  {
    return matchpattern(pattern, text + 1);
  }

  return 0;
}


#if 0

Expand All @@ -381,7 +403,7 @@ static int matchpattern(regex_t* pattern, const char* text)
{
if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
{
return 1;
return matchquestion(pattern[1], &pattern[2], text);
}
else if (pattern[1].type == STAR)
{
Expand Down Expand Up @@ -414,7 +436,7 @@ static int matchpattern(regex_t* pattern, const char* text)
{
if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
{
return 1;
return matchquestion(pattern[0], &pattern[2], text);
}
else if (pattern[1].type == STAR)
{
Expand All @@ -428,18 +450,16 @@ static int matchpattern(regex_t* pattern, const char* text)
{
return (text[0] == '\0');
}
/* Branching is not working properly
else if (pattern[1].type == BRANCH)
{
return (matchpattern(pattern, text) || matchpattern(&pattern[2], text));
}
*/
}
while ((text[0] != '\0') && matchone(*pattern++, *text++));

return 0;
}

#endif




Loading