Skip to content

Commit 86838ec

Browse files
Added match length support
1 parent d3058f2 commit 86838ec

File tree

7 files changed

+159
-130
lines changed

7 files changed

+159
-130
lines changed

README.md

+12-23
Original file line numberDiff line numberDiff line change
@@ -20,34 +20,20 @@ I think you should test the patterns you are going to use. You can easily modify
2020
The main design goal of this library is to be small, correct, self contained and use few resources while retaining acceptable performance and feature completeness. Clarity of the code is also highly valued.
2121

2222
### Notable features and omissions
23-
- Small code and binary size: <500 SLOC, ~3kb binary for x86. Statically #define'd memory usage / allocation.
23+
- Small code and binary size: 500 SLOC, ~3kb binary for x86. Statically #define'd memory usage / allocation.
2424
- No use of dynamic memory allocation (i.e. no calls to `malloc` / `free`).
2525
- To avoid call-stack exhaustion, iterative searching is preferred over recursive by default (can be changed with a pre-processor flag).
2626
- No support for capturing groups or named capture: `(?P<name>group)` etc.
2727
- Thorough testing: [exrex](https://github.com/asciimoo/exrex) is used to randomly generate test cases from regex patterns, which are fed into the regex code for verification. Try `make test` to generate a few thousand test cases yourself.
28-
- Compiled for x86 using GCC 4.7.4 and optimizing for size, the binary takes up ~2-3kb code space and allocates ~0.5kb RAM :
28+
- Provides character length of matches.
29+
- Compiled for x86 using GCC 7.2.0 and optimizing for size, the binary takes up ~2-3kb code space and allocates ~0.5kb RAM :
2930
```
3031
> gcc -Os -c re.c
3132
> size re.o
3233
text data bss dec hex filename
33-
2319 0 544 2863 b2f re.o
34+
2440 160 544 3144 c48 re.o
3435
3536
```
36-
For ARM/Thumb using GCC 4.8.1 it's around 1.5kb code and less RAM :
37-
```
38-
> arm-none-eabi-gcc -Os -mthumb -c re.c
39-
> size re.o
40-
text data bss dec hex filename
41-
1418 0 280 1698 6a2 re.o
42-
43-
```
44-
For 8-bit AVR using AVR-GCC 4.8.1 it's around 2kb code and less RAM :
45-
```
46-
> avr-gcc -Os -c re.c
47-
> size re.o
48-
text data bss dec hex filename
49-
2128 0 130 2258 8d2 re.o
50-
```
5137

5238

5339

@@ -61,10 +47,10 @@ typedef struct regex_t* re_t;
6147
re_t re_compile(const char* pattern);
6248

6349
/* Finds matches of the compiled pattern inside text. */
64-
int re_matchp(re_t pattern, const char* text);
50+
int re_matchp(re_t pattern, const char* text, int* matchlength);
6551

6652
/* Finds matches of pattern inside text (compiles first automatically). */
67-
int re_match(const char* pattern, const char* text);
53+
int re_match(const char* pattern, const char* text, int* matchlength);
6854
```
6955
7056
### Supported regex-operators
@@ -97,22 +83,25 @@ Search a text-string for a regex and get an index into the string, using `re_mat
9783
9884
The returned index points to the first place in the string where the regex pattern matches.
9985
86+
The integer that the passed `matchlength` pointer refers to will hold the length of the match.
87+
10088
If the regular expression doesn't match, the matching function returns an index of -1 to indicate failure.
10189
10290
### Examples
10391
Example of usage:
10492
```C
93+
/* Standard int to hold length of match */
10594
/* Standard null-terminated C-string to search: */
10695
const char* string_to_search = "ahem.. 'hello world !' ..";
10796
10897
/* Compile a simple regular expression using character classes, meta-char and greedy + non-greedy quantifiers: */
10998
re_t pattern = re_compile("[Hh]ello [Ww]orld\\s*[!]?");
11099
111100
/* Check if the regex matches the text: */
112-
int match_idx = re_matchp(pattern, string_to_search);
101+
int match_idx = re_matchp(pattern, string_to_search, &match_length);
113102
if (match_idx != -1)
114103
{
115-
printf("match at idx %d.\n", match_idx);
104+
printf("match at idx %d, %i chars long.\n", match_idx, match_length);
116105
}
117106
```
118107

@@ -128,7 +117,7 @@ For more usage examples I encourage you to look at the code in the `tests`-folde
128117
### FAQ
129118
- *Q: What differentiates this library from other C regex implementations?*
130119

131-
A: Well, the small size for one. <500 lines of C-code compiling to 2-3kb ROM, using very little RAM.
120+
A: Well, the small size for one. 500 lines of C-code compiling to 2-3kb ROM, using very little RAM.
132121

133122
### License
134123
All material in this repository is in the public domain.

re.c

+56-27
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@ typedef struct regex_t
5353

5454

5555
/* Private function declarations: */
56-
static int matchpattern(regex_t* pattern, const char* text);
56+
static int matchpattern(regex_t* pattern, const char* text, int* matchlength);
5757
static int matchcharclass(char c, const char* str);
58-
static int matchstar(regex_t p, regex_t* pattern, const char* text);
59-
static int matchplus(regex_t p, regex_t* pattern, const char* text);
58+
static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength);
59+
static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength);
6060
static int matchone(regex_t p, char c);
6161
static int matchdigit(char c);
6262
static int matchalpha(char c);
@@ -68,18 +68,19 @@ static int ismetachar(char c);
6868

6969

7070
/* Public functions: */
71-
int re_match(const char* pattern, const char* text)
71+
int re_match(const char* pattern, const char* text, int* matchlength)
7272
{
73-
return re_matchp(re_compile(pattern), text);
73+
return re_matchp(re_compile(pattern), text, matchlength);
7474
}
7575

76-
int re_matchp(re_t pattern, const char* text)
76+
int re_matchp(re_t pattern, const char* text, int* matchlength)
7777
{
78+
*matchlength = 0;
7879
if (pattern != 0)
7980
{
8081
if (pattern[0].type == BEGIN)
8182
{
82-
return ((matchpattern(&pattern[1], text)) ? 0 : -1);
83+
return ((matchpattern(&pattern[1], text, matchlength)) ? 0 : -1);
8384
}
8485
else
8586
{
@@ -89,7 +90,7 @@ int re_matchp(re_t pattern, const char* text)
8990
{
9091
idx += 1;
9192

92-
if (matchpattern(pattern, text))
93+
if (matchpattern(pattern, text, matchlength))
9394
{
9495
if (text[0] == '\0')
9596
return -1;
@@ -367,89 +368,115 @@ static int matchone(regex_t p, char c)
367368
}
368369
}
369370

370-
static int matchstar(regex_t p, regex_t* pattern, const char* text)
371+
static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength)
371372
{
372-
do
373+
int prelen = *matchlength;
374+
const char* prepoint = text;
375+
while ((text[0] != '\0') && matchone(p, *text))
376+
{
377+
text++;
378+
(*matchlength)++;
379+
}
380+
while (text >= prepoint)
373381
{
374-
if (matchpattern(pattern, text))
382+
if (matchpattern(pattern, text--, matchlength))
375383
return 1;
384+
(*matchlength)--;
376385
}
377-
while ((text[0] != '\0') && matchone(p, *text++));
378-
386+
387+
*matchlength = prelen;
379388
return 0;
380389
}
381390

382-
static int matchplus(regex_t p, regex_t* pattern, const char* text)
391+
static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength)
383392
{
384-
while ((text[0] != '\0') && matchone(p, *text++))
393+
const char* prepoint = text;
394+
while ((text[0] != '\0') && matchone(p, *text))
395+
{
396+
text++;
397+
(*matchlength)++;
398+
}
399+
while (text > prepoint)
385400
{
386-
if (matchpattern(pattern, text))
401+
if (matchpattern(pattern, text--, matchlength))
387402
return 1;
403+
(*matchlength)--;
388404
}
405+
389406
return 0;
390407
}
391408

392-
static int matchquestion(regex_t p, regex_t* pattern, const char* text)
409+
static int matchquestion(regex_t p, regex_t* pattern, const char* text, int* matchlength)
393410
{
394411
if (p.type == UNUSED)
395412
return 1;
396-
if (matchpattern(pattern, text))
413+
if (matchpattern(pattern, text, matchlength))
397414
return 1;
398415
if (*text && matchone(p, *text++))
399-
return matchpattern(pattern, text);
416+
{
417+
if (matchpattern(pattern, text, matchlength))
418+
{
419+
(*matchlength)++;
420+
return 1;
421+
}
422+
}
400423
return 0;
401424
}
402425

403426

404427
#if 0
405428

406429
/* Recursive matching */
407-
static int matchpattern(regex_t* pattern, const char* text)
430+
static int matchpattern(regex_t* pattern, const char* text, int *matchlength)
408431
{
432+
int pre = *matchlength;
409433
if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
410434
{
411-
return matchquestion(pattern[1], &pattern[2], text);
435+
return matchquestion(pattern[1], &pattern[2], text, matchlength);
412436
}
413437
else if (pattern[1].type == STAR)
414438
{
415-
return matchstar(pattern[0], &pattern[2], text);
439+
return matchstar(pattern[0], &pattern[2], text, matchlength);
416440
}
417441
else if (pattern[1].type == PLUS)
418442
{
419-
return matchplus(pattern[0], &pattern[2], text);
443+
return matchplus(pattern[0], &pattern[2], text, matchlength);
420444
}
421445
else if ((pattern[0].type == END) && pattern[1].type == UNUSED)
422446
{
423447
return text[0] == '\0';
424448
}
425449
else if ((text[0] != '\0') && matchone(pattern[0], text[0]))
426450
{
451+
(*matchlength)++;
427452
return matchpattern(&pattern[1], text+1);
428453
}
429454
else
430455
{
456+
*matchlength = pre;
431457
return 0;
432458
}
433459
}
434460

435461
#else
436462

437463
/* Iterative matching */
438-
static int matchpattern(regex_t* pattern, const char* text)
464+
static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
439465
{
466+
int pre = *matchlength;
440467
do
441468
{
442469
if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
443470
{
444-
return matchquestion(pattern[0], &pattern[2], text);
471+
return matchquestion(pattern[0], &pattern[2], text, matchlength);
445472
}
446473
else if (pattern[1].type == STAR)
447474
{
448-
return matchstar(pattern[0], &pattern[2], text);
475+
return matchstar(pattern[0], &pattern[2], text, matchlength);
449476
}
450477
else if (pattern[1].type == PLUS)
451478
{
452-
return matchplus(pattern[0], &pattern[2], text);
479+
return matchplus(pattern[0], &pattern[2], text, matchlength);
453480
}
454481
else if ((pattern[0].type == END) && pattern[1].type == UNUSED)
455482
{
@@ -461,9 +488,11 @@ static int matchpattern(regex_t* pattern, const char* text)
461488
return (matchpattern(pattern, text) || matchpattern(&pattern[2], text));
462489
}
463490
*/
491+
(*matchlength)++;
464492
}
465493
while ((text[0] != '\0') && matchone(*pattern++, *text++));
466494

495+
*matchlength = pre;
467496
return 0;
468497
}
469498

re.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@ re_t re_compile(const char* pattern);
4242

4343

4444
/* Find matches of the compiled pattern inside text. */
45-
int re_matchp(re_t pattern, const char* text);
45+
int re_matchp(re_t pattern, const char* text, int* matchlenght);
4646

4747

4848
/* Find matches of the txt pattern inside text (will compile automatically first). */
49-
int re_match(const char* pattern, const char* text);
49+
int re_match(const char* pattern, const char* text, int* matchlenght);
5050

5151

5252
#ifdef __cplusplus

0 commit comments

Comments
 (0)