Skip to content

Commit a0f93b9

Browse files
cleaned the code
1 parent 0cf5b09 commit a0f93b9

File tree

11 files changed

+150
-196
lines changed

11 files changed

+150
-196
lines changed

docs/uml/classes.pyregexp.png

-18.8 KB
Loading

docs/uml/classes.re_ast.png

-42.2 KB
Loading

docs/uml/classes.tokens.png

-54 KB
Loading

pyregexp/engine.py

Lines changed: 76 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
"""
1111

1212

13-
from typing import Callable, Union, Tuple, List
13+
from collections import deque
14+
from typing import Callable, Deque, Union, Tuple, List
1415
import unicodedata
1516
from .pyrser import Pyrser
1617
from .match import Match
@@ -74,8 +75,9 @@ def return_fnc(res: bool, consumed: int, all_matches: list, return_matches: bool
7475
re = unicodedata.normalize("NFKD", re).casefold()
7576
string = unicodedata.normalize("NFKD", string).casefold()
7677

77-
all_matches = [] # variables holding the matched groups list for each matched substring in the test string
78-
highest_matched_idx = 0 # holds the highest matched string's index
78+
# variables holding the matched groups list for each matched substring in the test string
79+
all_matches: List[List[Match]] = []
80+
highest_matched_idx: int = 0 # holds the highest matched string's index
7981

8082
res, consumed, matches = self.__match__(re, string, 0)
8183
if res:
@@ -101,8 +103,8 @@ def return_fnc(res: bool, consumed: int, all_matches: list, return_matches: bool
101103
def __match__(self, re: str, string: str, start_str_i: int) -> Tuple[bool, int, List[Match]]:
102104
""" Same as match, but always returns after the first match."""
103105
ast = self.parser.parse(re=re)
104-
matches: List[Match]
105-
matches = []
106+
matches: Deque[Match]
107+
matches = deque()
106108

107109
# str_i represents the matched characters so far. It is initialized to
108110
# the value of the input parameter start_str_i because the match could
@@ -114,67 +116,7 @@ def __match__(self, re: str, string: str, start_str_i: int) -> Tuple[bool, int,
114116
def return_fnc(res: bool, str_i: int) -> Tuple[bool, int, List[Match]]:
115117
""" Returns the Tuple to be returned by __match__."""
116118
nonlocal matches
117-
# reverses the list so the last match (the "whole" match) is first
118-
matches.reverse()
119-
return res, str_i, matches
120-
121-
def backtrack(backtrack_stack: List[Tuple[int, int, int, List[int]]], str_i: int, curr_i: int) -> Tuple[bool, int, int]:
122-
""" Returns whether it is possible to backtrack and the state to backtrack to.
123-
124-
Takes as input the current state of the engine and returns whether
125-
or not it is possible to backtrack.
126-
127-
Args:
128-
backtrack_stack (List[Tuple[int, int, int, List[int]]]): the
129-
current backtrack_stack situation. The Tuple values represents,
130-
in order from left to right, the node index of the entry in its
131-
parent children list, the minimum times that node must be
132-
matched, the time it is matched in the current state, the list
133-
of consumed character each times it was matched
134-
str_i (int): the current considered index of the test string
135-
curr_i (int): the index of the GroupNode children considered
136-
137-
Returns:
138-
A Tuple containing a bool, True if it is possible to backtrack,
139-
the new string index, and the new node children index to which
140-
backtrack to. Note that the last two parameters only have a
141-
meaning in the case it is possible to backtrack (the bool is
142-
True).
143-
"""
144-
if len(backtrack_stack) == 0:
145-
return False, str_i, curr_i
146-
147-
# the fist step is to pop the last tuple from the backtrack_stack
148-
node_i, min_, matched_times, consumed_list = backtrack_stack.pop()
149-
150-
if matched_times == min_:
151-
# if a node is already matched the minimum number of times, the
152-
# chance you have to potentially be able to backtrack is to is
153-
# to delete the entry from the stack and then search for a new
154-
# possibility (recursively calling this function).
155-
# But, before the recursion, you have to calculate what the
156-
# string index (str_i) value was before the node was matched
157-
# even once. Thus, you have to decrease the string index
158-
# of each consumption in the consumed_list.
159-
160-
# calculate_the new str_i
161-
for consumption in consumed_list:
162-
str_i -= consumption
163-
# recursive call
164-
return backtrack(backtrack_stack, str_i, node_i)
165-
else:
166-
# the node was matched more times than its min, so you just
167-
# need to remove the last consumption from the list,
168-
# decrease the str_i by that amount, decrease the times the node
169-
# was matched - matched_times - by 1, and then append the stack
170-
# the tuple with the new matched_times and consumed_list.
171-
last_consumed = consumed_list.pop()
172-
new_str_i = str_i - last_consumed
173-
backtrack_stack.append(
174-
(node_i, min_, matched_times - 1, consumed_list))
175-
# lastly, you return that the backtracking is possible, and
176-
# the state to which backtrack to.
177-
return True, new_str_i, curr_i
119+
return res, str_i, list(matches)
178120

179121
def save_matches(match_group: Callable, ast: Union[RE, GroupNode], string: str, start_idx: int) -> Tuple[bool, int]:
180122
""" Save the matches of capturing groups.
@@ -198,7 +140,7 @@ def save_matches(match_group: Callable, ast: Union[RE, GroupNode], string: str,
198140
if matches[i].group_id == ast.group_id:
199141
matches.remove(matches[i])
200142
break
201-
matches.append(
143+
matches.appendleft(
202144
Match(ast.group_id, start_idx, end_idx, string, ast.group_name))
203145

204146
return res, end_idx
@@ -211,7 +153,68 @@ def match_group(ast: Union[RE, GroupNode, OrNode], string: str) -> Tuple[bool, i
211153
number of matched characters in the string so far.
212154
'''
213155
nonlocal str_i
214-
backtrack_stack = []
156+
backtrack_stack: List[Tuple[int, int, int, List[int]]] = []
157+
158+
def backtrack(str_i: int, curr_i: int) -> Tuple[bool, int, int]:
159+
""" Returns whether it is possible to backtrack and the state to backtrack to.
160+
161+
Takes as input the current state of the engine and returns whether
162+
or not it is possible to backtrack.
163+
164+
Args:
165+
backtrack_stack (List[Tuple[int, int, int, List[int]]]): the
166+
current backtrack_stack situation. The Tuple values represent,
167+
in order from left to right, the node index of the entry in its
168+
parent's children list, the minimum number of times that node must
169+
be matched, the number of times it is matched in the current state,
170+
and the list of characters consumed each time it was matched
171+
str_i (int): the current considered index of the test string
172+
curr_i (int): the index of the GroupNode children considered
173+
174+
Returns:
175+
A Tuple containing a bool, True if it is possible to backtrack,
176+
the new string index, and the new node children index to
177+
backtrack to. Note that the last two values only have a
178+
meaning in the case it is possible to backtrack (the bool is
179+
True).
180+
"""
181+
nonlocal backtrack_stack
182+
183+
if len(backtrack_stack) == 0:
184+
return False, str_i, curr_i
185+
186+
# the first step is to pop the last tuple from the backtrack_stack
187+
node_i, min_, matched_times, consumed_list = backtrack_stack.pop()
188+
189+
if matched_times == min_:
190+
# if a node is already matched the minimum number of times, the
191+
# only chance you have to potentially be able to backtrack is
192+
# to delete the entry from the stack and then search for a new
193+
# possibility (recursively calling this function).
194+
# But, before the recursion, you have to calculate what the
195+
# string index (str_i) value was before the node was matched
196+
# even once. Thus, you have to decrease the string index
197+
# of each consumption in the consumed_list.
198+
199+
# calculate_the new str_i
200+
for consumption in consumed_list:
201+
str_i -= consumption
202+
# recursive call
203+
return backtrack(str_i, node_i)
204+
else:
205+
# the node was matched more times than its min, so you just
206+
# need to remove the last consumption from the list,
207+
# decrease the str_i by that amount, decrease the times the node
208+
# was matched - matched_times - by 1, and then append to the stack
209+
# the tuple with the new matched_times and consumed_list.
210+
last_consumed = consumed_list.pop()
211+
new_str_i = str_i - last_consumed
212+
backtrack_stack.append(
213+
(node_i, min_, matched_times - 1, consumed_list))
214+
# lastly, you return that the backtracking is possible, and
215+
# the state to backtrack to.
216+
return True, new_str_i, curr_i
217+
215218
curr_node = ast.children[0] if len(ast.children) > 0 else None
216219
i = 0 # index of the child being iterated, not to be confused with str_i
217220

@@ -243,8 +246,7 @@ def match_group(ast: Union[RE, GroupNode, OrNode], string: str) -> Tuple[bool, i
243246
elif min_ <= j:
244247
break
245248
else:
246-
can_bt, bt_str_i, bt_i = backtrack(
247-
backtrack_stack, str_i, i)
249+
can_bt, bt_str_i, bt_i = backtrack(str_i, i)
248250
if can_bt:
249251
i = bt_i
250252
str_i = bt_str_i
@@ -280,8 +282,7 @@ def match_group(ast: Union[RE, GroupNode, OrNode], string: str) -> Tuple[bool, i
280282
# i did the bare minimum or more
281283
break
282284
else:
283-
can_bt, bt_str_i, bt_i = backtrack(
284-
backtrack_stack, str_i, i)
285+
can_bt, bt_str_i, bt_i = backtrack(str_i, i)
285286
if can_bt:
286287
i = bt_i
287288
str_i = bt_str_i
@@ -321,7 +322,7 @@ def match_group(ast: Union[RE, GroupNode, OrNode], string: str) -> Tuple[bool, i
321322
if min_ <= j: # I already met the minimum requirement for match
322323
break
323324
can_bt, bt_str_i, bt_i = backtrack(
324-
backtrack_stack, before_str_i, i)
325+
before_str_i, i)
325326
if can_bt:
326327
i = bt_i
327328
str_i = bt_str_i
@@ -338,7 +339,7 @@ def match_group(ast: Union[RE, GroupNode, OrNode], string: str) -> Tuple[bool, i
338339
else:
339340
# i have more states, but the input is finished
340341
can_bt, bt_str_i, bt_i = backtrack(
341-
backtrack_stack, before_str_i, i)
342+
before_str_i, i)
342343
if can_bt:
343344
i = bt_i
344345
str_i = bt_str_i
@@ -371,6 +372,6 @@ def match_group(ast: Union[RE, GroupNode, OrNode], string: str) -> Tuple[bool, i
371372
if res:
372373
return return_fnc(True, str_i)
373374
else:
374-
matches = []
375+
matches = deque()
375376
str_i = i
376377
return return_fnc(False, str_i)

pyregexp/lexer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def append(elem: Token) -> None:
4343
append(ElementToken(char='\t'))
4444
if ch == 's':
4545
# \s matches a space character
46-
append(SpaceToken(space_ch=ch))
46+
append(SpaceToken(char=ch))
4747
else:
4848
append(ElementToken(char=ch))
4949
elif ch == '\\':
@@ -75,7 +75,7 @@ def append(elem: Token) -> None:
7575
append(RightCurlyBrace())
7676
break
7777
else:
78-
raise Exception('Bad token at index ${}.'.format(i))
78+
raise Exception("Bad token at index ${}.".format(i))
7979
i += 1
8080
elif ch == '^':
8181
if i == 0:

pyregexp/pyrser.py

Lines changed: 20 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ def parse_re_seq(capturing: bool = True, group_name: str = None, group_id: int =
6868
next_tkn()
6969
match_start = True
7070

71-
node = parse_group(capturing=capturing, group_name=group_name, group_id=group_id)
71+
node = parse_group(capturing=capturing,
72+
group_name=group_name, group_id=group_id)
7273

7374
if isinstance(curr_tkn, EndToken):
7475
next_tkn()
@@ -119,7 +120,6 @@ def parse_group(capturing: bool = True, group_name: str = None, group_id: int =
119120
parse_curly(new_el)
120121

121122
elements.append(new_el)
122-
# next_tkn()
123123

124124
return GroupNode(children=elements, capturing=capturing, group_name=group_name, group_id=group_id)
125125

@@ -139,13 +139,13 @@ def parse_curly(new_el: ASTNode) -> None:
139139
val_1 = int(val_1)
140140

141141
if isinstance(curr_tkn, RightCurlyBrace):
142-
# I'm in the case {exact}
142+
# case {exact}
143143
if type(val_1) is int:
144144
new_el.min, new_el.max = val_1, val_1
145145
next_tkn() # skip the closing brace
146146
return
147147
else:
148-
raise Exception()
148+
raise Exception("Invalid curly brace syntax.")
149149

150150
next_tkn()
151151
while isinstance(curr_tkn, ElementToken):
@@ -156,14 +156,13 @@ def parse_curly(new_el: ASTNode) -> None:
156156
else:
157157
val_2 = int(val_2)
158158

159-
# skip the closing brace
160-
next_tkn()
159+
next_tkn() # skip the closing brace
161160

162161
new_el.min = val_1 if type(val_1) is int else 0
163162
new_el.max = val_2 if type(val_2) is int else math.inf
164163

165164
except Exception as e:
166-
raise Exception('Invalid curly brace syntax.')
165+
raise Exception("Invalid curly brace syntax.")
167166

168167
def parse_range_el() -> ASTNode:
169168
if isinstance(curr_tkn, LeftBracket):
@@ -173,17 +172,17 @@ def parse_range_el() -> ASTNode:
173172
return element
174173
else:
175174
raise Exception(
176-
'Missing closing \']\'. Check the regex and try again.')
175+
"Missing closing ']'.")
177176
else:
178177
return parse_el()
179178

180179
def parse_inner_el() -> RangeElement:
180+
# parse_inner_el creates a single RangeElement with all the matches
181181
nonlocal curr_tkn
182-
# innerel creates a single RangeElement with all the matches
183182
match_str = ''
184183
if curr_tkn is None:
185184
raise Exception(
186-
"Missing closing ']'. Check the regex and try again.")
185+
"Missing closing ']'.")
187186

188187
positive_logic = True
189188
if isinstance(curr_tkn, NotToken):
@@ -205,8 +204,7 @@ def parse_inner_el() -> RangeElement:
205204
curr_tkn = ElementToken(char=curr_tkn.char)
206205

207206
if next_tkn(without_consuming=True) is None:
208-
raise Exception(
209-
"Missing closing ']'. Check the regex and try again.")
207+
raise Exception("Missing closing ']'.")
210208
elif isinstance(next_tkn(without_consuming=True), Dash):
211209
# it may be a range (like a-z, A-M, 0-9, ...)
212210
prev_char = curr_tkn.char
@@ -219,8 +217,7 @@ def parse_inner_el() -> RangeElement:
219217
# we're in the case of an actual range (or next_tkn is none)
220218
next_tkn() # curr_tkn is now the one after the dash
221219
if next_tkn is None:
222-
raise Exception(
223-
"Missing closing ']'. Check the regex and try again.")
220+
raise Exception("Missing closing ']'.")
224221
elif ord(prev_char) > ord(curr_tkn.char):
225222
raise Exception(
226223
f"Range values reversed. Start '{prev_char}' char code is greater than end '{curr_tkn.char}' char code.")
@@ -257,31 +254,31 @@ def parse_el() -> Union[Element, OrNode, GroupNode]:
257254
group_name = parse_group_name()
258255
else:
259256
if curr_tkn is None:
260-
raise Exception('Unterminated group')
257+
raise Exception("Unterminated group.")
261258
else:
262259
raise Exception(
263-
f'Invalid group: \'{LeftParenthesis()}{QuestionMark()}{curr_tkn.char}\'')
260+
f"Invalid group: '{{?{curr_tkn.char}'.")
264261
res = parse_re_seq(capturing=capturing, group_name=group_name)
265262
if isinstance(curr_tkn, RightParenthesis):
266-
# next_tkn() not needed (the parse_group while loop will eat the parenthesis)
263+
# next_tkn() not needed (parse_group's while loop will eat the parenthesis)
267264
return res
268265
else:
269-
raise Exception('Missing closing group parenthesis \')\'')
266+
raise Exception("Missing closing group parenthesis ')'.")
270267
else:
271268
raise Exception(
272-
'Unescaped special character {}'.format(curr_tkn.char))
269+
"Unescaped special character {}.".format(curr_tkn.char))
273270

274271
def parse_group_name() -> str:
275272
if curr_tkn is None:
276-
raise Exception('Unterminated named group name.')
273+
raise Exception("Unterminated named group name.")
277274
group_name = ''
278275
while curr_tkn.char != '>':
279276
group_name += curr_tkn.char
280277
next_tkn()
281278
if curr_tkn is None:
282-
raise Exception('Unterminated named group name.')
279+
raise Exception("Unterminated named group name.")
283280
if len(group_name) == 0:
284-
raise Exception('Unexpected empty named group name.')
281+
raise Exception("Unexpected empty named group name.")
285282
next_tkn() # consumes '>'
286283
return group_name
287284

@@ -294,5 +291,5 @@ def parse_group_name() -> str:
294291
ast = parse_re()
295292
if curr_tkn is not None:
296293
raise Exception(
297-
"Unable to parse the entire regex.\nCheck the regex and try again.")
294+
"Unable to parse the regex.")
298295
return ast

0 commit comments

Comments
 (0)