Skip to content

Commit f82204c

Browse files
committed
Improve safety of get_class_restrictions()
1 parent e185109 commit f82204c

File tree

1 file changed

+16
-6
lines changed

1 file changed

+16
-6
lines changed

app/scrapers/sis_scraper.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,13 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn:
203203
"not_classification": [],
204204
}
205205
restrictions_tag = soup.find("section", {"aria-labelledby": "restrictions"})
206+
# Other known restriction header patterns include:
207+
# "Special Approvals:"
206208
restriction_header_pattern = (
207209
r"(Must|Cannot) be enrolled in one of the following (Majors|Classes|Levels):"
208210
)
211+
# All children of the restrictions section are <div>, <span>, or <br> tags
212+
# Tags relevant to restrictions are only known to be <span> tags
209213
restrictions_content = [
210214
child
211215
for child in restrictions_tag.children
@@ -214,13 +218,13 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn:
214218
i = 0
215219
while i < len(restrictions_content):
216220
content = restrictions_content[i]
217-
content_string = content.string.strip() if content.string else ""
218221
if content.string is None:
219222
print(
220223
f"Skipping unexpected restriction content with no string for term and CRN: {term} - {crn}"
221224
)
222225
i += 1
223226
continue
227+
content_string = content.string.strip()
224228
header_match = re.match(restriction_header_pattern, content_string)
225229
if header_match is None:
226230
i += 1
@@ -232,12 +236,17 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn:
232236
i += 1
233237
while i < len(restrictions_content):
234238
next_content = restrictions_content[i]
235-
# if next_content.string is None:
236-
# i += 1
237-
# continue
238-
if re.match(restriction_header_pattern, next_content.string.strip()):
239+
if next_content.string is None:
240+
print(
241+
f"Skipping unexpected restriction content with no string for term and CRN: {term} - {crn}"
242+
)
243+
i += 1
244+
continue
245+
next_content_string = next_content.string.strip()
246+
# Stop if another restriction header is encountered
247+
if re.match(restriction_header_pattern, next_content_string):
239248
break
240-
restriction_list.append(next_content.string.strip())
249+
restriction_list.append(next_content_string)
241250
i += 1
242251
return restrictions_data
243252

@@ -531,6 +540,7 @@ async def main(start_year: int, end_year: int, seasons: list[str] = None) -> boo
531540

532541
if __name__ == "__main__":
533542
start_year = 2023
543+
start_year = 2025
534544
end_year = 2025
535545
start_time = time.time()
536546
asyncio.run(main(start_year, end_year))

0 commit comments

Comments
 (0)