Skip to content

Commit 84cff63

Browse files
committed
Fix course restriction parsing
The reason for this change is detailed in the comment added to the code. In short, SIS data is unreliable: the data for a single restriction is sometimes split across multiple lines, which caused each line to be counted as a separate restriction.
1 parent cb7c8d2 commit 84cff63

File tree

2 files changed

+18
-2
lines changed

2 files changed

+18
-2
lines changed

app/scrapers/sis_api.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn:
382382
key = f"not_{key_base}" if must_or_cannot == "Cannot" else key_base
383383
restriction_list = restrictions_data[key]
384384
i += 1
385+
next_content_string = ""
385386
while i < len(restrictions_content):
386387
next_content = restrictions_content[i]
387388
if next_content.string is None:
@@ -390,13 +391,24 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn:
390391
)
391392
i += 1
392393
continue
393-
next_content_string = next_content.string.strip()
394+
next_content_string += next_content.string.strip()
394395
# Stop if another restriction header is encountered
395396
if re.match(restriction_header_pattern, next_content_string) or re.match(
396397
special_approvals_pattern, next_content_string
397398
):
398399
break
399-
restriction_list.append(next_content_string)
400+
# SIS separates one restriction item into multiple <span> tags if it contains
401+
# commas, so a restriction item is only complete when parentheses are closed.
402+
#
403+
# For example, a restriction item "Communication, Media, & Design (COMD)"
404+
# would be split into three <span> tags:
405+
#
406+
# <span>Communication</span>
407+
# <span>Media</span>
408+
# <span> & Design (COMD)</span>
409+
if re.match(r".*\(.*\)", next_content_string):
410+
restriction_list.append(next_content_string)
411+
next_content_string = ""
400412
i += 1
401413
return restrictions_data
402414

app/scrapers/sis_scraper.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ async def process_class_details(
9393
crosslists_data = crosslists_task.result()
9494

9595
# Build attribute name to code map
96+
# Attributes are known to be in the format "Attribute Name  CODE"
97+
# Note the double space between name and code
9698
if attribute_name_code_map is not None:
9799
for attribute in attributes_data:
98100
attribute_split = attribute.split()
@@ -113,6 +115,8 @@ async def process_class_details(
113115
attribute_name_code_map[attribute_name] = attribute_code
114116

115117
# Build restriction name to code map
118+
# Restrictions are known to be in the format "Restriction Name (CODE)"
119+
# Note the parentheses around the code
116120
if restriction_name_code_map is not None:
117121
restriction_pattern = r"(.*)\((.*)\)"
118122
for restriction_type in restrictions_data:

0 commit comments

Comments
 (0)