Skip to content

Commit 52415ec

Browse files
committed
Integrate prerequisite parser into main scraper
This also uncomments the prereq code in the process_class_details function. In addition, the subject_name_code_map was changed to use the new get_reverse_subject_map function, though I know that's not going to be the final implementation. It's just to get the prereq parser working here.
1 parent 1131970 commit 52415ec

File tree

1 file changed

+47
-8
lines changed

1 file changed

+47
-8
lines changed

app/scrapers/sis_scraper.py

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import aiohttp
1111
import bs4
12+
from prereq_parser import parse_prereq
1213

1314
OUTPUT_DATA_DIR = "data"
1415

@@ -326,12 +327,51 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn:
326327
return restrictions_data
327328

328329

329-
async def get_class_prerequisites(session: aiohttp.ClientSession, term: str, crn: str):
330+
# Number of <td> cells a prerequisites row must have to be readable; the
# highest cell index read below is 8.
_PREREQ_ROW_CELL_COUNT = 9


def _build_prerequisite_expression(rows, subject_name_code_map, crn):
    """
    Flattens the rows of the prerequisites table into one expression string.

    Each row contributes, in order: an optional "and"/"or" connector, an
    optional opening parenthesis, a two-field requirement, and an optional
    closing parenthesis. Whitespace is normalized so that tokens are separated
    by exactly one space and parentheses hug their contents.

    Args:
        rows: Iterable of parsed <tr> elements (anything exposing
            find_all("td") whose results expose .text).
        subject_name_code_map: Maps subject names, as they appear in the
            table, to subject codes.
        crn: Course reference number, used only in diagnostic messages.

    Returns:
        The normalized expression string, or "" when no row yields content.
    """
    tokens = []
    for row in rows:
        cols = row.find_all("td")
        if not cols:
            # Rows without any <td> cells carry nothing to parse.
            continue
        if len(cols) < _PREREQ_ROW_CELL_COUNT:
            # Indexing a short row would raise IndexError and abort the
            # scrape of this class; report it and move on instead.
            print(
                f"Skipping malformed prerequisite row in CRN {crn}: "
                f"expected at least {_PREREQ_ROW_CELL_COUNT} cells, found {len(cols)}"
            )
            continue

        connector = cols[0].text
        if connector == "And":
            tokens.append("and")
        elif connector == "Or":
            tokens.append("or")

        if cols[1].text != "":
            tokens.append("(")

        if cols[2].text != "":
            # NOTE(review): cells 2 and 3 are presumably a test name and
            # score -- confirm against the SIS table layout.
            tokens.append(f"{cols[2].text} {cols[3].text}")
        else:
            subject_name = cols[4].text
            if subject_name not in subject_name_code_map:
                # Keep the raw name so the requirement is not lost.
                print(f"Unknown department in CRN {crn}: {subject_name}")
                tokens.append(f"{subject_name} {cols[5].text}")
            else:
                tokens.append(f"{subject_name_code_map[subject_name]} {cols[5].text}")

        if cols[8].text != "":
            tokens.append(")")

    # Collapse every run of whitespace (including any inside cell text) to a
    # single space, then tighten the parentheses.
    expression = " ".join(" ".join(tokens).split())
    expression = expression.replace("( ", "(").replace(" )", ")")
    return expression.strip()


async def get_class_prerequisites(
    session: aiohttp.ClientSession,
    term: str,
    crn: str,
    subject_name_code_map: dict[str, str],
):
    """
    Fetches and parses data from the "Prerequisites" tab of a class details page.

    Args:
        session: Client session used to issue the GET request.
        term: Term identifier sent as the "term" query parameter.
        crn: Course reference number sent as "courseReferenceNumber".
        subject_name_code_map: Maps subject names to subject codes; used to
            translate the subject shown in each table row.

    Returns:
        The result of parse_prereq(crn, expression) when the page lists
        prerequisites and parsing succeeds; otherwise an empty dict.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error
            status (via response.raise_for_status()).
    """
    url = "https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/getSectionPrerequisites"
    params = {"term": term, "courseReferenceNumber": crn}
    async with session.get(url, params=params) as response:
        response.raise_for_status()
        text = await response.text()
    soup = bs4.BeautifulSoup(text, "html5lib")

    data = _build_prerequisite_expression(
        soup.find_all("tr"), subject_name_code_map, crn
    )
    if not data:
        # No prerequisite rows: return the same empty value as a failed parse
        # so callers always receive a consistent type.
        return {}

    try:
        return parse_prereq(crn, data)
    except Exception as e:
        # Best-effort: one unparseable prerequisite string must not abort the
        # scrape of every other class.
        print(f"Error parsing prerequisites for CRN {crn} with data: {data} - {e}")
        return {}
335375

336376

337377
async def get_class_corequisites(
@@ -480,7 +520,9 @@ async def process_class_details(
480520
description_task = tg.create_task(get_class_description(session, term, crn))
481521
attributes_task = tg.create_task(get_class_attributes(session, term, crn))
482522
restrictions_task = tg.create_task(get_class_restrictions(session, term, crn))
483-
# prerequisites_task = tg.create_task(get_class_prerequisites(session, term, crn))
523+
prerequisites_task = tg.create_task(
524+
get_class_prerequisites(session, term, crn, subject_name_code_map)
525+
)
484526
corequisites_task = tg.create_task(
485527
get_class_corequisites(session, term, crn, subject_name_code_map)
486528
)
@@ -490,7 +532,7 @@ async def process_class_details(
490532
description_data = description_task.result()
491533
attributes_data = attributes_task.result()
492534
restrictions_data = restrictions_task.result()
493-
# prerequisites_data = prerequisites_task.result()
535+
prerequisites_data = prerequisites_task.result()
494536
corequisites_data = corequisites_task.result()
495537
# crosslists_data = crosslists_task.result()
496538

@@ -502,7 +544,7 @@ async def process_class_details(
502544
"course_detail": {
503545
"description": description_data["description"],
504546
"corequisite": corequisites_data,
505-
"prerequisite": [],
547+
"prerequisite": prerequisites_data,
506548
"crosslist": [],
507549
"attributes": attributes_data,
508550
"restrictions": restrictions_data,
@@ -623,9 +665,7 @@ async def get_term_course_data(
623665
print(f"Processing {len(subjects)} subjects for term: {term}")
624666

625667
# Create reverse mapping of subject names to codes
626-
subject_name_code_map = {}
627-
for subject in subjects:
628-
subject_name_code_map[subject["description"]] = subject["code"]
668+
subject_name_code_map = await get_reverse_subject_map(session)
629669

630670
# Stores all course data for the term
631671
all_course_data = {}
@@ -730,7 +770,6 @@ async def main(start_year: int, end_year: int, seasons: list[str] = None) -> boo
730770

731771
if __name__ == "__main__":
732772
start_year = 2023
733-
start_year = 2025
734773
end_year = 2025
735774
start_time = time.time()
736775
asyncio.run(main(start_year, end_year))

0 commit comments

Comments
 (0)