35
35
36
36
/* Definitions: */
37
37
38
- #define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */
39
- #define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */
38
+ #define MAX_REGEXP_LEN 70 /* Max number of bytes for a regex. */
40
39
41
40
42
41
/* Node kinds of a compiled pattern. Values are positional (UNUSED == 0) and
   are used as indices into the name table in re_print, so the order matters.
   UNUSED doubles as the end-of-pattern sentinel. */
enum
{
  UNUSED,
  DOT,
  BEGIN,
  END,
  QUESTIONMARK,
  STAR,
  PLUS,
  CHAR,
  CHAR_CLASS,
  INV_CHAR_CLASS,
  DIGIT,
  NOT_DIGIT,
  ALPHA,
  NOT_ALPHA,
  WHITESPACE,
  NOT_WHITESPACE,
  /* BRANCH */
};
43
42
44
43
/*
 * One compiled regex node, stored packed inside a byte buffer.
 *
 * The two header bytes are followed directly by data_len payload bytes:
 * the literal character for CHAR nodes, or the NUL-terminated member list
 * for CHAR_CLASS / INV_CHAR_CLASS nodes. Nodes without a payload have
 * data_len == 0. `data` is a C99 flexible array member (instead of the
 * non-standard zero-length array `data[0]`); the layout is unchanged.
 */
typedef struct regex_t
{
  unsigned char type;     /* CHAR, STAR, etc. */
  unsigned char data_len; /* number of payload bytes that follow the header */
  unsigned char data[];   /* payload bytes (data_len of them) */
} regex_t;
53
49
50
+ static re_t getnext (regex_t * pattern )
51
+ {
52
+ return (re_t )(((unsigned char * )pattern ) + 2 + pattern -> data_len );
53
+ }
54
+
54
55
55
56
56
57
/* Private function declarations: */
57
58
static int matchpattern (regex_t * pattern , const char * text , int * matchlength );
58
59
static int matchcharclass (char c , const char * str );
59
- static int matchstar (regex_t p , regex_t * pattern , const char * text , int * matchlength );
60
- static int matchplus (regex_t p , regex_t * pattern , const char * text , int * matchlength );
61
- static int matchone (regex_t p , char c );
60
+ static int matchstar (regex_t * p , regex_t * pattern , const char * text , int * matchlength );
61
+ static int matchplus (regex_t * p , regex_t * pattern , const char * text , int * matchlength );
62
+ static int matchone (regex_t * p , char c );
62
63
static int matchdigit (char c );
63
64
static int matchalpha (char c );
64
65
static int matchwhitespace (char c );
@@ -80,9 +81,9 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
80
81
* matchlength = 0 ;
81
82
if (pattern != 0 )
82
83
{
83
- if (pattern [ 0 ]. type == BEGIN )
84
+ if (pattern -> type == BEGIN )
84
85
{
85
- return ((matchpattern (& pattern [ 1 ] , text , matchlength )) ? 0 : -1 );
86
+ return ((matchpattern (getnext ( pattern ) , text , matchlength )) ? 0 : -1 );
86
87
}
87
88
else
88
89
{
@@ -106,33 +107,37 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
106
107
return -1 ;
107
108
}
108
109
110
/* Returns the smaller of the two ints (a when they compare equal). */
static int min(int a, int b)
{
  return (a <= b) ? a : b;
}
114
+
109
115
re_t re_compile (const char * pattern )
110
116
{
111
- /* The sizes of the two static arrays below substantiates the static RAM usage of this module.
112
- MAX_REGEXP_OBJECTS is the max number of symbols in the expression.
113
- MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */
114
- static regex_t re_compiled [MAX_REGEXP_OBJECTS ];
115
- static unsigned char ccl_buf [MAX_CHAR_CLASS_LEN ];
116
- int ccl_bufidx = 1 ;
117
+ /* The size of this static array substantiates the static RAM usage of this module.
118
+ MAX_REGEXP_LEN is the max number of bytes in the expression. */
119
+ static unsigned char re_data [MAX_REGEXP_LEN ];
117
120
118
121
char c ; /* current char in pattern */
119
122
int i = 0 ; /* index into pattern */
120
- int j = 0 ; /* index into re_compiled */
123
+ int j = 0 ; /* index into re_data */
121
124
122
- while (pattern [i ] != '\0' && (j + 1 < MAX_REGEXP_OBJECTS ))
125
+ while (pattern [i ] != '\0' && (j + 3 < MAX_REGEXP_LEN ))
123
126
{
124
127
c = pattern [i ];
128
+ regex_t * re_compiled = (regex_t * )(re_data + j );
129
+ re_compiled -> data_len = 0 ;
125
130
126
131
switch (c )
127
132
{
128
133
/* Meta-characters: */
129
- case '^' : { re_compiled [ j ]. type = BEGIN ; } break ;
130
- case '$' : { re_compiled [ j ]. type = END ; } break ;
131
- case '.' : { re_compiled [ j ]. type = DOT ; } break ;
132
- case '*' : { re_compiled [ j ]. type = STAR ; } break ;
133
- case '+' : { re_compiled [ j ]. type = PLUS ; } break ;
134
- case '?' : { re_compiled [ j ]. type = QUESTIONMARK ; } break ;
135
- /* case '|': { re_compiled[j]. type = BRANCH; } break; <-- not working properly */
134
+ case '^' : { re_compiled -> type = BEGIN ; } break ;
135
+ case '$' : { re_compiled -> type = END ; } break ;
136
+ case '.' : { re_compiled -> type = DOT ; } break ;
137
+ case '*' : { re_compiled -> type = STAR ; } break ;
138
+ case '+' : { re_compiled -> type = PLUS ; } break ;
139
+ case '?' : { re_compiled -> type = QUESTIONMARK ; } break ;
140
+ /* case '|': { re_compiled-> type = BRANCH; } break; <-- not working properly */
136
141
137
142
/* Escaped character-classes (\s \w ...): */
138
143
case '\\' :
@@ -145,41 +150,42 @@ re_t re_compile(const char* pattern)
145
150
switch (pattern [i ])
146
151
{
147
152
/* Meta-character: */
148
- case 'd' : { re_compiled [ j ]. type = DIGIT ; } break ;
149
- case 'D' : { re_compiled [ j ]. type = NOT_DIGIT ; } break ;
150
- case 'w' : { re_compiled [ j ]. type = ALPHA ; } break ;
151
- case 'W' : { re_compiled [ j ]. type = NOT_ALPHA ; } break ;
152
- case 's' : { re_compiled [ j ]. type = WHITESPACE ; } break ;
153
- case 'S' : { re_compiled [ j ]. type = NOT_WHITESPACE ; } break ;
153
+ case 'd' : { re_compiled -> type = DIGIT ; } break ;
154
+ case 'D' : { re_compiled -> type = NOT_DIGIT ; } break ;
155
+ case 'w' : { re_compiled -> type = ALPHA ; } break ;
156
+ case 'W' : { re_compiled -> type = NOT_ALPHA ; } break ;
157
+ case 's' : { re_compiled -> type = WHITESPACE ; } break ;
158
+ case 'S' : { re_compiled -> type = NOT_WHITESPACE ; } break ;
154
159
155
160
/* Escaped character, e.g. '.' or '$' */
156
161
default :
157
162
{
158
- re_compiled [j ].type = CHAR ;
159
- re_compiled [j ].u .ch = pattern [i ];
163
+ re_compiled -> type = CHAR ;
164
+ re_compiled -> data_len = 1 ;
165
+ re_compiled -> data [0 ] = pattern [i ];
160
166
} break ;
161
167
}
162
168
}
163
169
/* '\\' as last char in pattern -> invalid regular expression. */
164
170
/*
165
171
else
166
172
{
167
- re_compiled[j].type = CHAR;
168
- re_compiled[j].ch = pattern[i];
173
+ re_compiled->type = CHAR;
174
+ re_compiled->data_len = 1;
175
+ re_compiled->data[0] = pattern[i];
169
176
}
170
177
*/
171
178
} break ;
172
179
173
180
/* Character class: */
174
181
case '[' :
175
182
{
176
- /* Remember where the char-buffer starts. */
177
- int buf_begin = ccl_bufidx ;
183
+ int char_limit = min (0xff , MAX_REGEXP_LEN - j - 4 ); // 4 for this object and UNUSED at the minimum
178
184
179
185
/* Look-ahead to determine if negated */
180
186
if (pattern [i + 1 ] == '^' )
181
187
{
182
- re_compiled [ j ]. type = INV_CHAR_CLASS ;
188
+ re_compiled -> type = INV_CHAR_CLASS ;
183
189
i += 1 ; /* Increment i to avoid including '^' in the char-buffer */
184
190
if (pattern [i + 1 ] == 0 ) /* incomplete pattern, missing non-zero char after '^' */
185
191
{
@@ -188,7 +194,7 @@ re_t re_compile(const char* pattern)
188
194
}
189
195
else
190
196
{
191
- re_compiled [ j ]. type = CHAR_CLASS ;
197
+ re_compiled -> type = CHAR_CLASS ;
192
198
}
193
199
194
200
/* Copy characters inside [..] to buffer */
@@ -197,7 +203,7 @@ re_t re_compile(const char* pattern)
197
203
{
198
204
if (pattern [i ] == '\\' )
199
205
{
200
- if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1 )
206
+ if (re_compiled -> data_len >= char_limit )
201
207
{
202
208
//fputs("exceeded internal buffer!\n", stderr);
203
209
return 0 ;
@@ -206,31 +212,32 @@ re_t re_compile(const char* pattern)
206
212
{
207
213
return 0 ;
208
214
}
209
- ccl_buf [ ccl_bufidx ++ ] = pattern [i ++ ];
215
+ re_compiled -> data [ re_compiled -> data_len ++ ] = pattern [i ++ ];
210
216
}
211
- else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN )
217
+ // TODO: I think this "else if" is a bug, should just be "if"
218
+ else if (re_compiled -> data_len >= char_limit )
212
219
{
213
220
//fputs("exceeded internal buffer!\n", stderr);
214
221
return 0 ;
215
222
}
216
- ccl_buf [ ccl_bufidx ++ ] = pattern [i ];
223
+ re_compiled -> data [ re_compiled -> data_len ++ ] = pattern [i ];
217
224
}
218
- if (ccl_bufidx >= MAX_CHAR_CLASS_LEN )
225
+ if (re_compiled -> data_len >= char_limit )
219
226
{
220
227
/* Catches cases such as [00000000000000000000000000000000000000][ */
221
228
//fputs("exceeded internal buffer!\n", stderr);
222
229
return 0 ;
223
230
}
224
231
/* Null-terminate string end */
225
- ccl_buf [ccl_bufidx ++ ] = 0 ;
226
- re_compiled [j ].u .ccl = & ccl_buf [buf_begin ];
232
+ re_compiled -> data [re_compiled -> data_len ++ ] = 0 ;
227
233
} break ;
228
234
229
235
/* Other characters: */
230
236
default :
231
237
{
232
- re_compiled [j ].type = CHAR ;
233
- re_compiled [j ].u .ch = c ;
238
+ re_compiled -> type = CHAR ;
239
+ re_compiled -> data_len = 1 ;
240
+ re_compiled -> data [0 ] = c ;
234
241
} break ;
235
242
}
236
243
/* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
@@ -240,35 +247,39 @@ re_t re_compile(const char* pattern)
240
247
}
241
248
242
249
i += 1 ;
243
- j += 1 ;
250
+ j += 2 + re_compiled -> data_len ;
251
+ }
252
+ if (j + 1 >= MAX_REGEXP_LEN ) {
253
+ //fputs("exceeded internal buffer!\n", stderr);
254
+ return 0 ;
244
255
}
245
256
/* 'UNUSED' is a sentinel used to indicate end-of-pattern */
246
- re_compiled [j ].type = UNUSED ;
257
+ re_data [j ] = UNUSED ;
258
+ re_data [j + 1 ] = 0 ;
247
259
248
- return (re_t ) re_compiled ;
260
+ return (re_t ) re_data ;
249
261
}
250
262
251
263
void re_print (regex_t * pattern )
252
264
{
253
265
const char * types [] = { "UNUSED" , "DOT" , "BEGIN" , "END" , "QUESTIONMARK" , "STAR" , "PLUS" , "CHAR" , "CHAR_CLASS" , "INV_CHAR_CLASS" , "DIGIT" , "NOT_DIGIT" , "ALPHA" , "NOT_ALPHA" , "WHITESPACE" , "NOT_WHITESPACE" , "BRANCH" };
254
266
255
- int i ;
256
267
int j ;
257
268
char c ;
258
- for (i = 0 ; i < MAX_REGEXP_OBJECTS ; ++ i )
269
+ for (;; pattern = getnext ( pattern ) )
259
270
{
260
- if (pattern [ i ]. type == UNUSED )
271
+ if (pattern -> type == UNUSED )
261
272
{
262
273
break ;
263
274
}
264
275
265
- printf ("type: %s" , types [pattern [ i ]. type ]);
266
- if (pattern [ i ]. type == CHAR_CLASS || pattern [ i ]. type == INV_CHAR_CLASS )
276
+ printf ("type: %s" , types [pattern -> type ]);
277
+ if (pattern -> type == CHAR_CLASS || pattern -> type == INV_CHAR_CLASS )
267
278
{
268
279
printf (" [" );
269
- for (j = 0 ; j < MAX_CHAR_CLASS_LEN ; ++ j )
280
+ for (j = 0 ; j < pattern -> data_len ; ++ j )
270
281
{
271
- c = pattern [ i ]. u . ccl [j ];
282
+ c = pattern -> data [j ];
272
283
if ((c == '\0' ) || (c == ']' ))
273
284
{
274
285
break ;
@@ -277,9 +288,9 @@ void re_print(regex_t* pattern)
277
288
}
278
289
printf ("]" );
279
290
}
280
- else if (pattern [ i ]. type == CHAR )
291
+ else if (pattern -> type == CHAR )
281
292
{
282
- printf (" '%c'" , pattern [ i ]. u . ch );
293
+ printf (" '%c'" , pattern -> data [ 0 ] );
283
294
}
284
295
printf ("\n" );
285
296
}
@@ -380,24 +391,25 @@ static int matchcharclass(char c, const char* str)
380
391
return 0 ;
381
392
}
382
393
383
- static int matchone (regex_t p , char c )
394
+ static int matchone (regex_t * p , char c )
384
395
{
385
- switch (p . type )
396
+ switch (p -> type )
386
397
{
387
398
case DOT : return matchdot (c );
388
- case CHAR_CLASS : return matchcharclass (c , (const char * )p . u . ccl );
389
- case INV_CHAR_CLASS : return !matchcharclass (c , (const char * )p . u . ccl );
399
+ case CHAR_CLASS : return matchcharclass (c , (const char * )p -> data );
400
+ case INV_CHAR_CLASS : return !matchcharclass (c , (const char * )p -> data );
390
401
case DIGIT : return matchdigit (c );
391
402
case NOT_DIGIT : return !matchdigit (c );
392
403
case ALPHA : return matchalphanum (c );
393
404
case NOT_ALPHA : return !matchalphanum (c );
394
405
case WHITESPACE : return matchwhitespace (c );
395
406
case NOT_WHITESPACE : return !matchwhitespace (c );
396
- default : return (p .u .ch == c );
407
+ case BEGIN : return 0 ;
408
+ default : return (p -> data [0 ] == c );
397
409
}
398
410
}
399
411
400
- static int matchstar (regex_t p , regex_t * pattern , const char * text , int * matchlength )
412
+ static int matchstar (regex_t * p , regex_t * pattern , const char * text , int * matchlength )
401
413
{
402
414
int prelen = * matchlength ;
403
415
const char * prepoint = text ;
@@ -417,7 +429,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
417
429
return 0 ;
418
430
}
419
431
420
- static int matchplus (regex_t p , regex_t * pattern , const char * text , int * matchlength )
432
+ static int matchplus (regex_t * p , regex_t * pattern , const char * text , int * matchlength )
421
433
{
422
434
const char * prepoint = text ;
423
435
while ((text [0 ] != '\0' ) && matchone (p , * text ))
@@ -435,10 +447,8 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchle
435
447
return 0 ;
436
448
}
437
449
438
- static int matchquestion (regex_t p , regex_t * pattern , const char * text , int * matchlength )
450
+ static int matchquestion (regex_t * p , regex_t * pattern , const char * text , int * matchlength )
439
451
{
440
- if (p .type == UNUSED )
441
- return 1 ;
442
452
if (matchpattern (pattern , text , matchlength ))
443
453
return 1 ;
444
454
if (* text && matchone (p , * text ++ ))
@@ -493,33 +503,42 @@ static int matchpattern(regex_t* pattern, const char* text, int *matchlength)
493
503
static int matchpattern (regex_t * pattern , const char * text , int * matchlength )
494
504
{
495
505
int pre = * matchlength ;
496
- do
506
+ while ( 1 )
497
507
{
498
- if (( pattern [ 0 ]. type == UNUSED ) || ( pattern [ 1 ]. type == QUESTIONMARK ) )
508
+ if (pattern -> type == UNUSED )
499
509
{
500
- return matchquestion ( pattern [ 0 ], & pattern [ 2 ], text , matchlength ) ;
510
+ return 1 ;
501
511
}
502
- else if (pattern [1 ].type == STAR )
512
+ regex_t * next_pattern = getnext (pattern );
513
+ if (next_pattern -> type == QUESTIONMARK )
503
514
{
504
- return matchstar (pattern [ 0 ], & pattern [ 2 ] , text , matchlength );
515
+ return matchquestion (pattern , getnext ( next_pattern ) , text , matchlength );
505
516
}
506
- else if (pattern [ 1 ]. type == PLUS )
517
+ else if (next_pattern -> type == STAR )
507
518
{
508
- return matchplus (pattern [ 0 ], & pattern [ 2 ] , text , matchlength );
519
+ return matchstar (pattern , getnext ( next_pattern ) , text , matchlength );
509
520
}
510
- else if ((pattern [0 ].type == END ) && pattern [1 ].type == UNUSED )
521
+ else if (next_pattern -> type == PLUS )
522
+ {
523
+ return matchplus (pattern , getnext (next_pattern ), text , matchlength );
524
+ }
525
+ else if ((pattern -> type == END ) && next_pattern -> type == UNUSED )
511
526
{
512
527
return (text [0 ] == '\0' );
513
528
}
514
529
/* Branching is not working properly
515
- else if (pattern[1]. type == BRANCH)
530
+ else if (pattern-> type == BRANCH)
516
531
{
517
- return (matchpattern(pattern, text) || matchpattern(&pattern[2] , text));
532
+ return (matchpattern(pattern, text) || matchpattern(getnext(next_pattern) , text));
518
533
}
519
534
*/
520
535
(* matchlength )++ ;
536
+ if (text [0 ] == '\0' )
537
+ break ;
538
+ if (!matchone (pattern , * text ++ ))
539
+ break ;
540
+ pattern = next_pattern ;
521
541
}
522
- while ((text [0 ] != '\0' ) && matchone (* pattern ++ , * text ++ ));
523
542
524
543
* matchlength = pre ;
525
544
return 0 ;
0 commit comments