Skip to content

Commit 1590e53

Browse files
committed
use flat memory layout
1 parent 711981b commit 1590e53

File tree

1 file changed

+103
-84
lines changed

1 file changed

+103
-84
lines changed

re.c

+103-84
Original file line numberDiff line numberDiff line change
@@ -35,30 +35,31 @@
3535

3636
/* Definitions: */
3737

38-
#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */
39-
#define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */
38+
#define MAX_REGEXP_LEN 70 /* Max number of bytes for a regex. */
4039

4140

4241
enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
4342

4443
typedef struct regex_t
4544
{
46-
unsigned char type; /* CHAR, STAR, etc. */
47-
union
48-
{
49-
unsigned char ch; /* the character itself */
50-
unsigned char* ccl; /* OR a pointer to characters in class */
51-
} u;
45+
unsigned char type; /* CHAR, STAR, etc. */
46+
unsigned char data_len;
47+
unsigned char data[0];
5248
} regex_t;
5349

50+
static re_t getnext(regex_t* pattern)
51+
{
52+
return (re_t)(((unsigned char*)pattern) + 2 + pattern->data_len);
53+
}
54+
5455

5556

5657
/* Private function declarations: */
5758
static int matchpattern(regex_t* pattern, const char* text, int* matchlength);
5859
static int matchcharclass(char c, const char* str);
59-
static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength);
60-
static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength);
61-
static int matchone(regex_t p, char c);
60+
static int matchstar(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
61+
static int matchplus(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
62+
static int matchone(regex_t* p, char c);
6263
static int matchdigit(char c);
6364
static int matchalpha(char c);
6465
static int matchwhitespace(char c);
@@ -80,9 +81,9 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
8081
*matchlength = 0;
8182
if (pattern != 0)
8283
{
83-
if (pattern[0].type == BEGIN)
84+
if (pattern->type == BEGIN)
8485
{
85-
return ((matchpattern(&pattern[1], text, matchlength)) ? 0 : -1);
86+
return ((matchpattern(getnext(pattern), text, matchlength)) ? 0 : -1);
8687
}
8788
else
8889
{
@@ -106,33 +107,37 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
106107
return -1;
107108
}
108109

110+
static int min(int a, int b)
111+
{
112+
return (a <= b) ? a : b;
113+
}
114+
109115
re_t re_compile(const char* pattern)
110116
{
111-
/* The sizes of the two static arrays below substantiates the static RAM usage of this module.
112-
MAX_REGEXP_OBJECTS is the max number of symbols in the expression.
113-
MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */
114-
static regex_t re_compiled[MAX_REGEXP_OBJECTS];
115-
static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN];
116-
int ccl_bufidx = 1;
117+
/* The size of this static array determines the static RAM usage of this module.
118+
MAX_REGEXP_LEN is the max number of bytes in the expression. */
119+
static unsigned char re_data[MAX_REGEXP_LEN];
117120

118121
char c; /* current char in pattern */
119122
int i = 0; /* index into pattern */
120-
int j = 0; /* index into re_compiled */
123+
int j = 0; /* index into re_data */
121124

122-
while (pattern[i] != '\0' && (j+1 < MAX_REGEXP_OBJECTS))
125+
while (pattern[i] != '\0' && (j+3 < MAX_REGEXP_LEN))
123126
{
124127
c = pattern[i];
128+
regex_t *re_compiled = (regex_t*)(re_data+j);
129+
re_compiled->data_len = 0;
125130

126131
switch (c)
127132
{
128133
/* Meta-characters: */
129-
case '^': { re_compiled[j].type = BEGIN; } break;
130-
case '$': { re_compiled[j].type = END; } break;
131-
case '.': { re_compiled[j].type = DOT; } break;
132-
case '*': { re_compiled[j].type = STAR; } break;
133-
case '+': { re_compiled[j].type = PLUS; } break;
134-
case '?': { re_compiled[j].type = QUESTIONMARK; } break;
135-
/* case '|': { re_compiled[j].type = BRANCH; } break; <-- not working properly */
134+
case '^': { re_compiled->type = BEGIN; } break;
135+
case '$': { re_compiled->type = END; } break;
136+
case '.': { re_compiled->type = DOT; } break;
137+
case '*': { re_compiled->type = STAR; } break;
138+
case '+': { re_compiled->type = PLUS; } break;
139+
case '?': { re_compiled->type = QUESTIONMARK; } break;
140+
/* case '|': { re_compiled->type = BRANCH; } break; <-- not working properly */
136141

137142
/* Escaped character-classes (\s \w ...): */
138143
case '\\':
@@ -145,41 +150,42 @@ re_t re_compile(const char* pattern)
145150
switch (pattern[i])
146151
{
147152
/* Meta-character: */
148-
case 'd': { re_compiled[j].type = DIGIT; } break;
149-
case 'D': { re_compiled[j].type = NOT_DIGIT; } break;
150-
case 'w': { re_compiled[j].type = ALPHA; } break;
151-
case 'W': { re_compiled[j].type = NOT_ALPHA; } break;
152-
case 's': { re_compiled[j].type = WHITESPACE; } break;
153-
case 'S': { re_compiled[j].type = NOT_WHITESPACE; } break;
153+
case 'd': { re_compiled->type = DIGIT; } break;
154+
case 'D': { re_compiled->type = NOT_DIGIT; } break;
155+
case 'w': { re_compiled->type = ALPHA; } break;
156+
case 'W': { re_compiled->type = NOT_ALPHA; } break;
157+
case 's': { re_compiled->type = WHITESPACE; } break;
158+
case 'S': { re_compiled->type = NOT_WHITESPACE; } break;
154159

155160
/* Escaped character, e.g. '.' or '$' */
156161
default:
157162
{
158-
re_compiled[j].type = CHAR;
159-
re_compiled[j].u.ch = pattern[i];
163+
re_compiled->type = CHAR;
164+
re_compiled->data_len = 1;
165+
re_compiled->data[0] = pattern[i];
160166
} break;
161167
}
162168
}
163169
/* '\\' as last char in pattern -> invalid regular expression. */
164170
/*
165171
else
166172
{
167-
re_compiled[j].type = CHAR;
168-
re_compiled[j].ch = pattern[i];
173+
re_compiled->type = CHAR;
174+
re_compiled->data_len = 1;
175+
re_compiled->data[0] = pattern[i];
169176
}
170177
*/
171178
} break;
172179

173180
/* Character class: */
174181
case '[':
175182
{
176-
/* Remember where the char-buffer starts. */
177-
int buf_begin = ccl_bufidx;
183+
int char_limit = min(0xff, MAX_REGEXP_LEN - j - 4); // 4 for this object and UNUSED at the minimum
178184

179185
/* Look-ahead to determine if negated */
180186
if (pattern[i+1] == '^')
181187
{
182-
re_compiled[j].type = INV_CHAR_CLASS;
188+
re_compiled->type = INV_CHAR_CLASS;
183189
i += 1; /* Increment i to avoid including '^' in the char-buffer */
184190
if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */
185191
{
@@ -188,7 +194,7 @@ re_t re_compile(const char* pattern)
188194
}
189195
else
190196
{
191-
re_compiled[j].type = CHAR_CLASS;
197+
re_compiled->type = CHAR_CLASS;
192198
}
193199

194200
/* Copy characters inside [..] to buffer */
@@ -197,7 +203,7 @@ re_t re_compile(const char* pattern)
197203
{
198204
if (pattern[i] == '\\')
199205
{
200-
if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1)
206+
if (re_compiled->data_len >= char_limit)
201207
{
202208
//fputs("exceeded internal buffer!\n", stderr);
203209
return 0;
@@ -206,31 +212,32 @@ re_t re_compile(const char* pattern)
206212
{
207213
return 0;
208214
}
209-
ccl_buf[ccl_bufidx++] = pattern[i++];
215+
re_compiled->data[re_compiled->data_len++] = pattern[i++];
210216
}
211-
else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
217+
// TODO: I think this "else if" is a bug, should just be "if"
218+
else if (re_compiled->data_len >= char_limit)
212219
{
213220
//fputs("exceeded internal buffer!\n", stderr);
214221
return 0;
215222
}
216-
ccl_buf[ccl_bufidx++] = pattern[i];
223+
re_compiled->data[re_compiled->data_len++] = pattern[i];
217224
}
218-
if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
225+
if (re_compiled->data_len >= char_limit)
219226
{
220227
/* Catches cases such as [00000000000000000000000000000000000000][ */
221228
//fputs("exceeded internal buffer!\n", stderr);
222229
return 0;
223230
}
224231
/* Null-terminate string end */
225-
ccl_buf[ccl_bufidx++] = 0;
226-
re_compiled[j].u.ccl = &ccl_buf[buf_begin];
232+
re_compiled->data[re_compiled->data_len++] = 0;
227233
} break;
228234

229235
/* Other characters: */
230236
default:
231237
{
232-
re_compiled[j].type = CHAR;
233-
re_compiled[j].u.ch = c;
238+
re_compiled->type = CHAR;
239+
re_compiled->data_len = 1;
240+
re_compiled->data[0] = c;
234241
} break;
235242
}
236243
/* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
@@ -240,35 +247,39 @@ re_t re_compile(const char* pattern)
240247
}
241248

242249
i += 1;
243-
j += 1;
250+
j += 2 + re_compiled->data_len;
251+
}
252+
if (j + 1 >= MAX_REGEXP_LEN) {
253+
//fputs("exceeded internal buffer!\n", stderr);
254+
return 0;
244255
}
245256
/* 'UNUSED' is a sentinel used to indicate end-of-pattern */
246-
re_compiled[j].type = UNUSED;
257+
re_data[j] = UNUSED;
258+
re_data[j+1] = 0;
247259

248-
return (re_t) re_compiled;
260+
return (re_t) re_data;
249261
}
250262

251263
void re_print(regex_t* pattern)
252264
{
253265
const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" };
254266

255-
int i;
256267
int j;
257268
char c;
258-
for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
269+
for (;; pattern = getnext(pattern))
259270
{
260-
if (pattern[i].type == UNUSED)
271+
if (pattern->type == UNUSED)
261272
{
262273
break;
263274
}
264275

265-
printf("type: %s", types[pattern[i].type]);
266-
if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
276+
printf("type: %s", types[pattern->type]);
277+
if (pattern->type == CHAR_CLASS || pattern->type == INV_CHAR_CLASS)
267278
{
268279
printf(" [");
269-
for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
280+
for (j = 0; j < pattern->data_len; ++j)
270281
{
271-
c = pattern[i].u.ccl[j];
282+
c = pattern->data[j];
272283
if ((c == '\0') || (c == ']'))
273284
{
274285
break;
@@ -277,9 +288,9 @@ void re_print(regex_t* pattern)
277288
}
278289
printf("]");
279290
}
280-
else if (pattern[i].type == CHAR)
291+
else if (pattern->type == CHAR)
281292
{
282-
printf(" '%c'", pattern[i].u.ch);
293+
printf(" '%c'", pattern->data[0]);
283294
}
284295
printf("\n");
285296
}
@@ -380,24 +391,25 @@ static int matchcharclass(char c, const char* str)
380391
return 0;
381392
}
382393

383-
static int matchone(regex_t p, char c)
394+
static int matchone(regex_t* p, char c)
384395
{
385-
switch (p.type)
396+
switch (p->type)
386397
{
387398
case DOT: return matchdot(c);
388-
case CHAR_CLASS: return matchcharclass(c, (const char*)p.u.ccl);
389-
case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p.u.ccl);
399+
case CHAR_CLASS: return matchcharclass(c, (const char*)p->data);
400+
case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p->data);
390401
case DIGIT: return matchdigit(c);
391402
case NOT_DIGIT: return !matchdigit(c);
392403
case ALPHA: return matchalphanum(c);
393404
case NOT_ALPHA: return !matchalphanum(c);
394405
case WHITESPACE: return matchwhitespace(c);
395406
case NOT_WHITESPACE: return !matchwhitespace(c);
396-
default: return (p.u.ch == c);
407+
case BEGIN: return 0;
408+
default: return (p->data[0] == c);
397409
}
398410
}
399411

400-
static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength)
412+
static int matchstar(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
401413
{
402414
int prelen = *matchlength;
403415
const char* prepoint = text;
@@ -417,7 +429,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
417429
return 0;
418430
}
419431

420-
static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength)
432+
static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
421433
{
422434
const char* prepoint = text;
423435
while ((text[0] != '\0') && matchone(p, *text))
@@ -435,10 +447,8 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchle
435447
return 0;
436448
}
437449

438-
static int matchquestion(regex_t p, regex_t* pattern, const char* text, int* matchlength)
450+
static int matchquestion(regex_t *p, regex_t* pattern, const char* text, int* matchlength)
439451
{
440-
if (p.type == UNUSED)
441-
return 1;
442452
if (matchpattern(pattern, text, matchlength))
443453
return 1;
444454
if (*text && matchone(p, *text++))
@@ -493,33 +503,42 @@ static int matchpattern(regex_t* pattern, const char* text, int *matchlength)
493503
static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
494504
{
495505
int pre = *matchlength;
496-
do
506+
while (1)
497507
{
498-
if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
508+
if (pattern->type == UNUSED)
499509
{
500-
return matchquestion(pattern[0], &pattern[2], text, matchlength);
510+
return 1;
501511
}
502-
else if (pattern[1].type == STAR)
512+
regex_t* next_pattern = getnext(pattern);
513+
if (next_pattern->type == QUESTIONMARK)
503514
{
504-
return matchstar(pattern[0], &pattern[2], text, matchlength);
515+
return matchquestion(pattern, getnext(next_pattern), text, matchlength);
505516
}
506-
else if (pattern[1].type == PLUS)
517+
else if (next_pattern->type == STAR)
507518
{
508-
return matchplus(pattern[0], &pattern[2], text, matchlength);
519+
return matchstar(pattern, getnext(next_pattern), text, matchlength);
509520
}
510-
else if ((pattern[0].type == END) && pattern[1].type == UNUSED)
521+
else if (next_pattern->type == PLUS)
522+
{
523+
return matchplus(pattern, getnext(next_pattern), text, matchlength);
524+
}
525+
else if ((pattern->type == END) && next_pattern->type == UNUSED)
511526
{
512527
return (text[0] == '\0');
513528
}
514529
/* Branching is not working properly
515-
else if (pattern[1].type == BRANCH)
530+
else if (pattern->type == BRANCH)
516531
{
517-
return (matchpattern(pattern, text) || matchpattern(&pattern[2], text));
532+
return (matchpattern(pattern, text) || matchpattern(getnext(next_pattern), text));
518533
}
519534
*/
520535
(*matchlength)++;
536+
if (text[0] == '\0')
537+
break;
538+
if (!matchone(pattern, *text++))
539+
break;
540+
pattern = next_pattern;
521541
}
522-
while ((text[0] != '\0') && matchone(*pattern++, *text++));
523542

524543
*matchlength = pre;
525544
return 0;

0 commit comments

Comments
 (0)