From e8c8b9588b2ad310d11054bff855545be3ebb820 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Mon, 1 Dec 2025 13:37:32 -0500 Subject: [PATCH 01/24] Refactor scraper and JSON output to display all classes This refactor is the first step in reworking the SIS scraper to fetch more granular information about every section (class) within a course. I plan on expanding on this refactor by scraping more data from SIS that would help generate more insights on courses. --- sis_scraper/sis_api.py | 69 +++++--- sis_scraper/sis_scraper.py | 322 +++++++++++++++++-------------------- 2 files changed, 192 insertions(+), 199 deletions(-) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 0c157f6..3b1c7b1 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -309,9 +309,9 @@ async def get_class_attributes( Returned data format is as follows: ``` [ - "Attribute 1", - "Attribute 2", - "Attribute 3", + "Communication Intensive COMM", + "Data Intensive I DI1", + "Introductory Level Course FRSH", ... ] ``` @@ -336,12 +336,23 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn: Returned data format is as follows: ``` { - "major": ["Allowed Major 1", ...], - "not_major": ["Disallowed Major 1", ...], - "level": ["Allowed Level 1", ...], - "not_level": ["Disallowed Level 1", ...], - "classification": ["Allowed Classification 1", ...], - "not_classification": ["Disallowed Classification 1", ...] + "major": [ + "Architecture (ARCH)", + ... + ], + "not_major": [ + "Computer Science (CSCI)", + ... + ], + "classification": [ + "Freshman (FR)", + ... + ], + "not_classification": [ + "Senior (SR)", + ... + ], + ... } ``` """ @@ -350,7 +361,7 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn: raw_data = await retry_get(session, url, params) raw_data = html_unescape(raw_data) soup = bs4.BeautifulSoup(raw_data, "html5lib") - # Dynamically build restrictions_data dict structure from RESTRICTION_TYPE_MAP values + # Dynamically build dict structure from RESTRICTION_TYPE_MAP values restrictions_data = {} bases = set(_RESTRICTION_TYPE_MAP.values()) for base in sorted(bases): @@ -526,8 +537,11 @@ async def get_class_corequisites( Returned data format is as follows: ``` [ - "Computer Science 1100", - "Mathematics 1010", + { + "subjectName": "Computer Science", + "courseNumber": "1100", + "title": "COMPUTER SCIENCE I" + }, ... ] ``` @@ -546,7 +560,7 @@ async def get_class_corequisites( if not coreqs_thead or not coreqs_tbody: return [] thead_cols = [th.text.strip() for th in coreqs_thead.find_all("th")] - # Known corequisite columns are Subject, Course, and Title + # Known corequisite columns are Subject, Course Number, and Title if len(thead_cols) != 3: logger.warning( f"Unexpected number of corequisite columns for CRN {crn} in term {term}" @@ -561,9 +575,10 @@ async def get_class_corequisites( f"CRN {crn} in term {term}" ) continue - subject = cols[0] - course_num = cols[1] - coreqs.append(f"{subject} {course_num}") + subject, course_num, title = cols + coreqs.append( + {"subjectName": subject, "courseNumber": course_num, "title": title} + ) return coreqs @@ -579,8 +594,13 @@ async def get_class_crosslists( Returned data format is as follows: ``` [ - "Computer Science 1100", - "Mathematics 1010", + { + "courseReferenceNumber": "12345", + "subjectName": "Computer Science", + "courseNumber": "1100", + "title": "COMPUTER SCIENCE I", + "sectionNumber": "01" + }, ... 
] ``` @@ -614,7 +634,14 @@ async def get_class_crosslists( f"CRN {crn} in term {term}" ) continue - subject = cols[1] - code = cols[2] - crosslists.append(f"{subject} {code}") + crn, subject, course_num, title, section_num = cols + crosslists.append( + { + "courseReferenceNumber": crn, + "subjectName": subject, + "courseNumber": course_num, + "title": title, + "sectionNumber": section_num, + } + ) return crosslists diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 3323267..474e77e 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -54,7 +54,7 @@ def get_term_code(year: str | int, season: str) -> str: async def process_class_details( session: aiohttp.ClientSession, course_data: dict[str, Any], - class_entry: dict[str, Any], + sis_class_entry: dict[str, Any], instructor_rcsid_name_map: dict[str, str] = None, attribute_code_name_map: dict[str, str] = None, restriction_code_name_map: dict[str, dict[str, str]] = None, @@ -67,7 +67,8 @@ async def process_class_details( @param session: aiohttp client session to use for requests. @param course_data: Dictionary to populate with course data. - @param class_entry: Class data fetched from SIS's class search endpoint. + @param sis_class_entry: Class data fetched from SIS's class search + endpoint. @param known_rcsid_set: Optional set to populate with known instructor RCSIDs. @param attribute_code_name_map: Optional map to populate with attribute @@ -76,182 +77,149 @@ async def process_class_details( codes to names. @return: None """ - # Example course code: CSCI 1100 - course_code = f"{class_entry['subject']} {class_entry['courseNumber']}" - term = class_entry["term"] - crn = class_entry["courseReferenceNumber"] + course_num = sis_class_entry["courseNumber"] + term = sis_class_entry["term"] + crn = sis_class_entry["courseReferenceNumber"] + + # Initialize course entry if not already present + if course_num not in course_data: + course_data[course_num] = [] + + # Initialize empty class entry + class_entry = { + "courseReferenceNumber": sis_class_entry["courseReferenceNumber"], + "sectionNumber": sis_class_entry["sequenceNumber"], + "title": sis_class_entry["courseTitle"], + "description": "", + "attributes": [], + "restrictions": {}, + "prerequisites": [], + "corequisites": [], + "crosslists": [], + "creditMin": sis_class_entry["creditHourLow"] or 0, + "creditMax": sis_class_entry["creditHourHigh"] or 0, + "faculty": [], + "seatsCapacity": sis_class_entry["maximumEnrollment"], + "seatsRegistered": sis_class_entry["enrollment"], + "seatsAvailable": sis_class_entry["seatsAvailable"], + } # Fetch class details not included in main class details - # Only fetch if course not already in course data - if course_code not in course_data: - - # Initialize empty course entry - course_data[course_code] = { - "course_name": class_entry["courseTitle"], - "course_detail": { - "description": "", - "corequisite": [], - "prerequisite": [], - "crosslist": [], - "attributes": [], - "restrictions": [], - "credits": { - "min": float("inf"), - "max": 0, - }, - "sections": [], - }, - } - - async with asyncio.TaskGroup() as tg: - description_task = tg.create_task(get_class_description(session, term, crn)) - attributes_task = tg.create_task(get_class_attributes(session, term, crn)) - restrictions_task = tg.create_task( - get_class_restrictions(session, term, crn) - ) - prerequisites_task = tg.create_task( - get_class_prerequisites(session, term, crn) - ) - corequisites_task = tg.create_task( - get_class_corequisites(session, 
term, crn) ) - crosslists_task = tg.create_task(get_class_crosslists(session, term, crn)) - - # Wait for tasks to complete and get results - description_data = description_task.result() - attributes_data = attributes_task.result() - restrictions_data = restrictions_task.result() - prerequisites_data = prerequisites_task.result() - # TODO: Filter out self-references from prerequisites - corequisites_data = corequisites_task.result() - corequisites_data = list( - filter( - lambda data: data.split()[-1] != class_entry["courseNumber"] - or " ".join(data.split()[:-1]) != class_entry["subjectDescription"], - corequisites_data, - ) + async with asyncio.TaskGroup() as tg: + description_task = tg.create_task(get_class_description(session, term, crn)) + attributes_task = tg.create_task(get_class_attributes(session, term, crn)) + restrictions_task = tg.create_task(get_class_restrictions(session, term, crn)) + prerequisites_task = tg.create_task(get_class_prerequisites(session, term, crn)) + corequisites_task = tg.create_task(get_class_corequisites(session, term, crn)) + crosslists_task = tg.create_task(get_class_crosslists(session, term, crn)) + + # Wait for tasks to complete and get results + description_data = description_task.result() + attributes_data = attributes_task.result() + restrictions_data = restrictions_task.result() + prerequisites_data = prerequisites_task.result() + corequisites_data = corequisites_task.result() + crosslists_data = crosslists_task.result() + + # Fill class entry with fetched details + class_entry["description"] = description_data + class_entry["attributes"] = attributes_data + class_entry["restrictions"] = restrictions_data + class_entry["prerequisites"] = prerequisites_data + class_entry["corequisites"] = corequisites_data + class_entry["crosslists"] = crosslists_data + + # Process instructor RCSIDs and names + class_faculty = class_entry["faculty"] + for instructor in sis_class_entry["faculty"]: + instructor_name = instructor["displayName"] + # Use .get() since the email field may be absent from the entry + email_address = instructor.get("emailAddress") + # Add faculty entry to class faculty list + class_faculty.append( + { + "bannerId": instructor["bannerId"], + "displayName": instructor_name, + "emailAddress": email_address, + "primaryFaculty": instructor["primaryIndicator"], + } ) - crosslists_data = crosslists_task.result() - crosslists_data = list( - filter( - lambda data: data.split()[-1] != class_entry["courseNumber"] - or " ".join(data.split()[:-1]) != class_entry["subjectDescription"], - crosslists_data, + if "emailAddress" not in instructor: + logger.warning( + f"Missing instructor email address field for CRN {crn} " + f"in term {term}: {instructor_name}" ) - ) - - # Build attribute code to name map - # Attributes are known to be in the format "Attribute Name CODE" - # Note the double space between name and code - if attribute_code_name_map is not None: - for attribute in attributes_data: - attribute_split = attribute.split() - if len(attribute_split) < 2: - logger.warning( - f"Skipping unexpected attribute format for CRN {crn} " - f"in term {term}: {attribute}" - ) + continue + # Add faculty RCSID to known RCSID map if provided + if ( + email_address is not None + and email_address.endswith("@rpi.edu") + and instructor_rcsid_name_map is not None + ): + rcsid = email_address.split("@")[0].lower() + instructor_rcsid_name_map[rcsid] = instructor_name + + # Append class entry to course data + course_data[course_num].append(class_entry) + + # Add to attribute code-to-name map + # Attributes are known to be in the format
"Attribute Name CODE" + # Note the double space between name and code + if attribute_code_name_map is not None: + for attribute in attributes_data: + attribute_split = attribute.split() + if len(attribute_split) < 2: + logger.warning( + f"Skipping unexpected attribute format for CRN {crn} " + f"in term {term}: {attribute}" + ) + continue + attribute_code = attribute_split[-1].strip() + attribute_name = " ".join(attribute_split[:-1]).strip() + if ( + attribute_code in attribute_code_name_map + and attribute_code_name_map[attribute_code] != attribute_name + ): + logger.warning( + f"Conflicting attribute names for {attribute_code} " + f"in term {term}: " + f"{attribute_code_name_map[attribute_code]} vs. {attribute_name}" + ) + attribute_code_name_map[attribute_code] = attribute_name + + # Add to restriction code-to-name map + # Restrictions are known to be in the format "Restriction Name (CODE)" except + # for special approvals, which are handled explicitly as a special case. + if restriction_code_name_map is not None: + restriction_pattern = r"(.*)\((.*)\)" + for restriction_type in restrictions_data: + restriction_type = restriction_type.lower().replace("not_", "") + if restriction_type not in restriction_code_name_map: + restriction_code_name_map[restriction_type] = {} + for restriction in restrictions_data[restriction_type]: + restriction_match = re.match(restriction_pattern, restriction) + if restriction_match is None or len(restriction_match.groups()) < 2: + # Skip unexpected restriction formats or special approvals continue - attribute_code = attribute_split[-1].strip() - attribute_name = " ".join(attribute_split[:-1]).strip() + restriction_name = restriction_match.group(1).strip() + restriction_code = restriction_match.group(2).strip() if ( - attribute_code in attribute_code_name_map - and attribute_code_name_map[attribute_code] != attribute_name + restriction_name in restriction_code_name_map[restriction_type] + and restriction_code_name_map[restriction_type][restriction_code] + != restriction_name ): logger.warning( - f"Conflicting attribute names for {attribute_code} " + f"Conflicting restriction names for {restriction_code} " f"in term {term}: " - f"{attribute_code_name_map[attribute_code]} vs. {attribute_name}" + f"{restriction_code_name_map[ + restriction_type + ][restriction_code]} vs. {restriction_name}" ) - attribute_code_name_map[attribute_code] = attribute_name - - # Build restriction code to name map - # Restrictions are known to be in the format "Restriction Name (CODE)" except - # for special approvals, which are handled explicitly as a special case. 
- if restriction_code_name_map is not None: - restriction_pattern = r"(.*)\((.*)\)" - for restriction_type in restrictions_data: - restriction_type = restriction_type.lower().replace("not_", "") - if restriction_type not in restriction_code_name_map: - restriction_code_name_map[restriction_type] = {} - for restriction in restrictions_data[restriction_type]: - restriction_match = re.match(restriction_pattern, restriction) - if restriction_match is None or len(restriction_match.groups()) < 2: - # Skip unexpected restriction formats or special approvals - continue - restriction_name = restriction_match.group(1).strip() - restriction_code = restriction_match.group(2).strip() - if ( - restriction_name in restriction_code_name_map[restriction_type] - and restriction_code_name_map[restriction_type][ - restriction_code - ] - != restriction_name - ): - logger.warning( - f"Conflicting restriction names for {restriction_code} " - f"in term {term}: " - f"{restriction_code_name_map[ - restriction_type - ][restriction_code]} vs. {restriction_name}" - ) - restriction_code_name_map[restriction_type][ - restriction_code - ] = restriction_name - - # Initialize course entry with details - course_details = course_data[course_code]["course_detail"] - course_details["description"] = description_data - course_details["attributes"] = attributes_data - course_details["restrictions"] = restrictions_data - course_details["prerequisite"] = prerequisites_data - course_details["corequisite"] = list(set(corequisites_data)) - course_details["crosslist"] = list(set(crosslists_data)) - - course_details = course_data[course_code]["course_detail"] - - course_credits = course_details["credits"] - course_credits["min"] = min( - course_credits["min"], class_entry["creditHourLow"] or 0 - ) - course_credits["max"] = max( - course_credits["max"], - class_entry["creditHourLow"] or 0, - class_entry["creditHourHigh"] or 0, - ) - - course_sections = course_details["sections"] - class_faculty = class_entry["faculty"] - class_faculty_rcsids = [] - for instructor in class_faculty: - instructor_name = instructor["displayName"] - rcsid = "Unknown RCSID" - if "emailAddress" in instructor: - email_address = instructor["emailAddress"] - if email_address is not None and email_address.endswith("@rpi.edu"): - rcsid = email_address.split("@")[0].lower() - # Add to known RCSID set if provided - if instructor_rcsid_name_map is not None: - instructor_rcsid_name_map[rcsid] = instructor["displayName"] - else: - logger.warning( - f"Missing instructor email address field for CRN {crn} " - f"in term {term}: {instructor_name}" - ) - class_faculty_rcsids.append(f"{instructor_name} ({rcsid})") + restriction_code_name_map[restriction_type][ + restriction_code + ] = restriction_name - course_sections.append( - { - "CRN": class_entry["courseReferenceNumber"], - "instructor": class_faculty_rcsids, - "capacity": class_entry["maximumEnrollment"], - "registered": class_entry["enrollment"], - "open": class_entry["seatsAvailable"], - } - ) - -async def get_course_data( +async def get_subj_course_data( term: str, subject: str, instructor_rcsid_name_map: dict[str, str] = None, @@ -267,13 +235,6 @@ async def get_course_data( This function spawns its own client session to avoid session state conflicts with other subjects that may be processing concurrently. - In the context of this scraper, a "class" refers to a section of a course, while a - "course" refers to the overarching course that may have multiple classes. 
- - The data returned from SIS is keyed by classes, not courses. This function - manipulates and aggregates this data such that the returned structure is keyed by - courses instead, with classes as a sub-field of each course. - @param term: Term code to fetch data for. @param subject: Subject code to fetch data for. @param instructor_rcsid_name_map: Optional map to populate with instructor @@ -303,21 +264,26 @@ async def get_course_data( # Reset search state on server before fetching class data await reset_class_search(session, term) class_data = await class_search(session, term, subject) - course_data = {} + subj_class_data = {} async with asyncio.TaskGroup() as tg: for class_entry in class_data: tg.create_task( process_class_details( session, - course_data, + subj_class_data, class_entry, instructor_rcsid_name_map=instructor_rcsid_name_map, restriction_code_name_map=restriction_code_name_map, attribute_code_name_map=attribute_code_name_map, ) ) + # Sort class entries by section number + for course_num in subj_class_data: + subj_class_data[course_num] = sorted( + subj_class_data[course_num], key=lambda x: x["sectionNumber"] + ) # Return data sorted by course code - return dict(sorted(course_data.items())) + return dict(sorted(subj_class_data.items())) except aiohttp.ClientError as e: logger.error(f"Error processing subject {subject} in term {term}: {e}") return {} @@ -391,11 +357,11 @@ async def get_term_course_data( for subject in subjects: subject_code = subject["code"] term_course_data[subject_code] = { - "subject_name": subject["description"], + "subjectName": subject["description"], "courses": {}, } task = tg.create_task( - get_course_data( + get_subj_course_data( term, subject_code, instructor_rcsid_name_map=instructor_rcsid_name_map, From 7fee7a4cc3cc2f0163923f40a3dfd4cb1d18234f Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Tue, 2 Dec 2025 17:13:16 -0500 Subject: [PATCH 02/24] Print traceback on term error --- sis_scraper/sis_scraper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 474e77e..59e1100 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -375,6 +375,9 @@ async def get_term_course_data( tasks.append(task) except Exception as e: logger.error(f"Error processing subjects for term {term}: {e}") + import traceback + + traceback.print_exc() return False # Wait for all tasks to complete and gather results From 14dc824c4ea3f25067fb33d58693f9515cfa3a28 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Tue, 2 Dec 2025 17:18:03 -0500 Subject: [PATCH 03/24] Allow null values for creditMin and creditMax As far as we know, creditMin should never be null in the SIS data. But creditMax can be null, depending on whether the class has a credit range or not. 
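For example, a consumer of the output JSON could collapse the pair into a concrete range like this (a rough sketch; effective_credit_range is a hypothetical helper, not something this patch adds):

```
def effective_credit_range(class_entry: dict) -> tuple[float, float]:
    # A null creditMax should mean the class is worth a fixed
    # creditMin credits rather than a range of credits
    credit_min = class_entry["creditMin"]
    credit_max = class_entry["creditMax"]
    return (credit_min, credit_max if credit_max is not None else credit_min)
```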
--- sis_scraper/sis_scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 59e1100..a703c7a 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -96,8 +96,8 @@ async def process_class_details( "prerequisites": [], "corequisites": [], "crosslists": [], - "creditMin": sis_class_entry["creditHourLow"] or 0, - "creditMax": sis_class_entry["creditHourHigh"] or 0, + "creditMin": sis_class_entry["creditHourLow"], + "creditMax": sis_class_entry["creditHourHigh"], "faculty": [], "seatsCapacity": sis_class_entry["maximumEnrollment"], "seatsRegistered": sis_class_entry["enrollment"], From e3b60278ce821073cf55d85f0ae58d97938c0bc7 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Tue, 2 Dec 2025 18:30:36 -0500 Subject: [PATCH 04/24] Add meeting info to output JSON data This data will probably be expanded on later once we understand better what all of the data from SIS really means. --- sis_scraper/sis_scraper.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index a703c7a..7dfefe3 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -80,6 +80,7 @@ async def process_class_details( course_num = sis_class_entry["courseNumber"] term = sis_class_entry["term"] crn = sis_class_entry["courseReferenceNumber"] + meetings_list = sis_class_entry["meetingsFaculty"] # Initialize course entry if not already present if course_num not in course_data: @@ -98,11 +99,42 @@ async def process_class_details( "crosslists": [], "creditMin": sis_class_entry["creditHourLow"], "creditMax": sis_class_entry["creditHourHigh"], - "faculty": [], "seatsCapacity": sis_class_entry["maximumEnrollment"], "seatsRegistered": sis_class_entry["enrollment"], "seatsAvailable": sis_class_entry["seatsAvailable"], + "faculty": [], + "meetingInfo": [], + } + + day_codes = { + "sunday": "U", + "monday": "M", + "tuesday": "T", + "wednesday": "W", + "thursday": "R", + "friday": "F", + "saturday": "S", } + # Process meeting information + for meeting in meetings_list: + sis_meeting_info = meeting["meetingTime"] + meeting_info = { + "beginTime": sis_meeting_info["beginTime"], + "endTime": sis_meeting_info["endTime"], + "creditHours": sis_meeting_info["creditHourSession"], + "campusCode": sis_meeting_info["campus"], + "campusDescription": sis_meeting_info["campusDescription"], + "buildingCode": sis_meeting_info["building"], + "buildingDescription": sis_meeting_info["buildingDescription"], + "room": sis_meeting_info["room"], + "startDate": sis_meeting_info["startDate"], + "endDate": sis_meeting_info["endDate"], + "days": [], + } + for day in day_codes: + if sis_meeting_info[day]: + meeting_info["days"].append(day_codes[day]) + class_entry["meetingInfo"].append(meeting_info) # Fetch class details not included in main class details async with asyncio.TaskGroup() as tg: From e4c905c8b041639f29affae4786df8c0ed79f4a1 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Thu, 25 Dec 2025 17:12:09 -0500 Subject: [PATCH 05/24] Add waitlist metrics to class data --- sis_scraper/sis_scraper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 7dfefe3..1c3af7d 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -102,6 +102,9 @@ async def 
process_class_details( "seatsCapacity": sis_class_entry["maximumEnrollment"], "seatsRegistered": sis_class_entry["enrollment"], "seatsAvailable": sis_class_entry["seatsAvailable"], + "waitlistCapacity": sis_class_entry["waitCapacity"], + "waitlistRegistered": sis_class_entry["waitCount"], + "waitlistAvailable": sis_class_entry["waitAvailable"], "faculty": [], "meetingInfo": [], } From 733aba9bafa5d3964fcb47bb08fedf39260b6b6d Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Thu, 25 Dec 2025 17:13:03 -0500 Subject: [PATCH 06/24] Add meeting categories to class data From what I understand after a brief look, L typically means lecture, T is test, and B is lab/recitation. I'm not sure if there are others, and SIS doesn't seem to provide any legend for this either. --- sis_scraper/sis_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 1c3af7d..069a1b7 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -129,6 +129,7 @@ async def process_class_details( "campusDescription": sis_meeting_info["campusDescription"], "buildingCode": sis_meeting_info["building"], "buildingDescription": sis_meeting_info["buildingDescription"], + "category": sis_meeting_info["category"], "room": sis_meeting_info["room"], "startDate": sis_meeting_info["startDate"], "endDate": sis_meeting_info["endDate"], From ed235e62c75dfa05c9d58d7d6c26e1f7dd2cd6b6 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Thu, 25 Dec 2025 21:48:38 -0500 Subject: [PATCH 07/24] Add return type to get_class_crosslists --- sis_scraper/sis_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 3b1c7b1..8a1c8ac 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -586,7 +586,7 @@ async def get_class_crosslists( session: aiohttp.ClientSession, term: str, crn: str, -): +) -> list[dict[str, Any]]: """ Fetches and parses data from the "Cross Listed" tab of a class details page. From b6a2490e834538b49e09f8d39dfb4ec922da77c4 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Thu, 25 Dec 2025 22:16:31 -0500 Subject: [PATCH 08/24] Add _process_class_meetings() This is a first step in centralizing the logic for processing class meeting information. --- sis_scraper/sis_api.py | 58 ++++++++++++++++++++++++++++++++++++++ sis_scraper/sis_scraper.py | 36 ++--------------------- 2 files changed, 61 insertions(+), 33 deletions(-) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 8a1c8ac..990c722 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -645,3 +645,61 @@ async def get_class_crosslists( } ) return crosslists + + +def _process_class_meetings( + sis_meetings_list: list[dict[str, Any]], +) -> list[dict[str, Any]]: + """ + Processes raw class meeting data from SIS into a more usable format. + + Returned data format is as follows: + [ + { + "beginTime": "0800", + "endTime": "0950", + "creditHours": 4, + "campusCode": "T", + "campusDescription": "Troy", + "buildingCode": "SAGE", + "buildingDescription": "Russell Sage Laboratory", + "category": "L", + "room": "303", + "startDate": "01/15/2024", + "endDate": "05/01/2024", + "days": ["M", "W", "F"] + }, + ... 
+ ] + """ + meetings_list = [] + day_codes = { + "sunday": "U", + "monday": "M", + "tuesday": "T", + "wednesday": "W", + "thursday": "R", + "friday": "F", + "saturday": "S", + } + for meeting in sis_meetings_list: + sis_meeting_info = meeting["meetingTime"] + meeting_info = { + "beginTime": sis_meeting_info["beginTime"], + "endTime": sis_meeting_info["endTime"], + "creditHours": sis_meeting_info["creditHourSession"], + "campusCode": sis_meeting_info["campus"], + "campusDescription": sis_meeting_info["campusDescription"], + "buildingCode": sis_meeting_info["building"], + "buildingDescription": sis_meeting_info["buildingDescription"], + "category": sis_meeting_info["category"], + "room": sis_meeting_info["room"], + "startDate": sis_meeting_info["startDate"], + "endDate": sis_meeting_info["endDate"], + "days": [], + } + for day in day_codes: + if sis_meeting_info[day]: + meeting_info["days"].append(day_codes[day]) + meetings_list.append(meeting_info) + return meetings_list diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 069a1b7..36e4c6f 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -17,6 +17,7 @@ get_class_prerequisites, get_class_restrictions, get_term_subjects, + _process_class_meetings, reset_class_search, ) @@ -80,7 +81,7 @@ async def process_class_details( course_num = sis_class_entry["courseNumber"] term = sis_class_entry["term"] crn = sis_class_entry["courseReferenceNumber"] - meetings_list = sis_class_entry["meetingsFaculty"] + sis_meetings_list = sis_class_entry["meetingsFaculty"] # Initialize course entry if not already present if course_num not in course_data: @@ -106,40 +107,9 @@ async def process_class_details( "waitlistRegistered": sis_class_entry["waitCount"], "waitlistAvailable": sis_class_entry["waitAvailable"], "faculty": [], - "meetingInfo": [], + "meetingInfo": _process_class_meetings(sis_meetings_list), } - day_codes = { - "sunday": "U", - "monday": "M", - "tuesday": "T", - "wednesday": "W", - "thursday": "R", - "friday": "F", - "saturday": "S", - } - # Process meeting information - for meeting in meetings_list: - sis_meeting_info = meeting["meetingTime"] - meeting_info = { - "beginTime": sis_meeting_info["beginTime"], - "endTime": sis_meeting_info["endTime"], - "creditHours": sis_meeting_info["creditHourSession"], - "campusCode": sis_meeting_info["campus"], - "campusDescription": sis_meeting_info["campusDescription"], - "buildingCode": sis_meeting_info["building"], - "buildingDescription": sis_meeting_info["buildingDescription"], - "category": sis_meeting_info["category"], - "room": sis_meeting_info["room"], - "startDate": sis_meeting_info["startDate"], - "endDate": sis_meeting_info["endDate"], - "days": [], - } - for day in day_codes: - if sis_meeting_info[day]: - meeting_info["days"].append(day_codes[day]) - class_entry["meetingInfo"].append(meeting_info) - # Fetch class details not included in main class details async with asyncio.TaskGroup() as tg: description_task = tg.create_task(get_class_description(session, term, crn)) From b36289f0c0d67df9da2a8f6c000d5987e84c361c Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Thu, 25 Dec 2025 22:19:21 -0500 Subject: [PATCH 09/24] Add get_class_meetings() This function will be necessary for classes that don't appear in SIS's main class search. 
--- sis_scraper/sis_api.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 990c722..654ebe8 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -647,6 +647,44 @@ async def get_class_crosslists( return crosslists +async def get_class_meetings( + session: aiohttp.ClientSession, + term: str, + crn: str, +) -> list[dict[str, Any]]: + """ + Fetches and parses data from the "Instructor/Meeting Times" tab of a class details + page. + + Returned data format is as follows: + [ + { + "beginTime": "0800", + "endTime": "0950", + "creditHours": 4, + "campusCode": "T", + "campusDescription": "Troy", + "buildingCode": "SAGE", + "buildingDescription": "Russell Sage Laboratory", + "category": "L", + "room": "303", + "startDate": "01/15/2024", + "endDate": "05/01/2024", + "days": ["M", "W", "F"] + }, + ... + ] + """ + url = _BASE_URL + "searchResults/getFacultyMeetingTimes" + params = {"term": term, "courseReferenceNumber": crn} + raw_data = await retry_get(session, url, params) + json_data = json.loads(raw_data) + json_data = html_unescape(json_data) + sis_meetings_list = json_data["fmt"] + meetings_list = _process_class_meetings(sis_meetings_list) + return meetings_list + + def _process_class_meetings( sis_meetings_list: list[dict[str, Any]], ) -> list[dict[str, Any]]: From 035395cfec9f2167d82cd5cd19d9737eedf4197b Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Thu, 25 Dec 2025 22:19:49 -0500 Subject: [PATCH 10/24] Add get_class_details() See last commit description. --- sis_scraper/sis_api.py | 49 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 654ebe8..1cfbd01 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -272,6 +272,55 @@ async def class_search( return course_data +async def get_class_details( + session: aiohttp.ClientSession, term: str, crn: str +) -> dict[str, Any]: + """ + Fetches and parses data from the "Details" tab of a class details page. + + Returned data format is as follows: + ``` + { + "courseReferenceNumber": "12345", + "subjectName": "Computer Science", + "courseNumber": "1100", + "title": "COMPUTER SCIENCE I", + "sectionNumber": "01", + "creditMin": 4, + "creditMax": None + } + ``` + """ + url = _BASE_URL + "searchResults/getClassDetails" + params = {"term": term, "courseReferenceNumber": crn} + raw_data = await retry_get(session, url, params) + raw_data = html_unescape(raw_data) + soup = bs4.BeautifulSoup(raw_data, "html5lib") + details_tag = soup.find("section", {"aria-labelledby": "classDetails"}) + crn = details_tag.find("span", {"id": "courseReferenceNumber"}).text.strip() + section_num = details_tag.find("span", {"id": "sectionNumber"}).text.strip() + subj_name = details_tag.find("span", {"id": "subject"}).text.strip() + course_num = details_tag.find("span", {"id": "courseDisplay"}).text.strip() + title = details_tag.find("span", {"id": "courseTitle"}).text.strip() + # Only courses with a credit range have a span with id "credit-hours-discretion", + # otherwise the credit hours span follows a span with text "Credit Hours:". 
+ credit_min, credit_max = None, None + if credit_hours_tag := details_tag.find("span", {"id": "credit-hours-discretion"}): + credit_min, credit_max = credit_hours_tag.text.strip().split(" TO ") + else: + credit_hours_tag = details_tag.find("span", text="Credit Hours:") + credit_min = credit_hours_tag.find_next_sibling("span").text.strip() + return { + "courseReferenceNumber": crn, + "subjectName": subj_name, + "courseNumber": course_num, + "title": title, + "sectionNumber": section_num, + "creditMin": int(credit_min) if credit_min is not None else None, + "creditMax": int(credit_max) if credit_max is not None else None, + } + + async def get_class_description( session: aiohttp.ClientSession, term: str, crn: str ) -> str: From b2a143d52e07f9834514a8e94656a544f5dcaa16 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Thu, 25 Dec 2025 22:20:04 -0500 Subject: [PATCH 11/24] Add get_class_enrollment() See last commit description. --- sis_scraper/sis_api.py | 51 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 1cfbd01..17048b9 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -349,6 +349,57 @@ async def get_class_description( return text +async def get_class_enrollment( + session: aiohttp.ClientSession, term: str, crn: str +) -> dict[str, Any]: + """ + Fetches and parses data from the "Enrollment/Waitlist" tab of a class details page. + + Returned data format is as follows: + ``` + { + "enrollmentActual": 28, + "enrollmentMaximum": 30, + "enrollmentSeatsAvailable": 2, + "waitlistActual": 0, + "waitlistMaximum": 10, + "waitlistSeatsAvailable": 10 + } + ``` + """ + url = _BASE_URL + "searchResults/getEnrollmentInfo" + params = {"term": term, "courseReferenceNumber": crn} + raw_data = await retry_get(session, url, params) + raw_data = html_unescape(raw_data) + soup = bs4.BeautifulSoup(raw_data, "html5lib") + enrollment_tag = soup.find("section", {"aria-labelledby": "enrollmentInfo"}) + # There are no relevant classes or ids on the spans, so we have to rely on the text + # content of the preceding tags. + enrollment_data = {} + span_tags = enrollment_tag.find_all("span") + # Dynamically create dictionary keys based on span text + for span_tag in span_tags: + span_text = span_tag.text.strip() + # Skip numeric span texts + if span_text.isdigit(): + continue + words = span_text.split() + # Skip empty span texts + if not words: + continue + first_word = words[0] + # Lowercase the entire first word if it's all uppercase, otherwise just lowercase + # the first character. + if first_word.isupper(): + first_word = first_word.lower() + else: + first_word = first_word[0].lower() + first_word[1:] + # Construct the dictionary key + dict_key = first_word + "".join(word for word in words[1:]) + enrollment_data[dict_key] = int(span_tag.find_next_sibling("span").text.strip()) + return enrollment_data + + async def get_class_attributes( session: aiohttp.ClientSession, term: str, crn: str ) -> list[str]: From aad101099d422b92ff0dae574c6217929f786330 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 03:43:36 -0500 Subject: [PATCH 12/24] Fix credit parsing in get_class_details() Previously, the credit values would end up as the name of the next tag in the HTML data (which was often "Grade Modifiers:"). This has been fixed by replacing find_next_sibling() with the next_sibling attribute. 
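To illustrate the difference on a simplified version of the markup (the HTML below is illustrative, not the exact SIS output):

```
import bs4

# The credit value is a bare text node sitting between two spans
html = '<span>Credit Hours:</span>4<span>Grade Modifiers:</span>'
soup = bs4.BeautifulSoup(html, "html5lib")
label = soup.find("span", string="Credit Hours:")

# find_next_sibling("span") skips over the text node and lands on the
# next tag, which is how the credits came back as "Grade Modifiers:"
print(label.find_next_sibling("span").text)  # Grade Modifiers:

# next_sibling returns the bare text node itself
print(str(label.next_sibling).strip())  # 4
```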
--- sis_scraper/sis_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 17048b9..9e4dd7c 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -309,7 +309,7 @@ async def get_class_details( credit_min, credit_max = credit_hours_tag.text.strip().split(" TO ") else: credit_hours_tag = details_tag.find("span", text="Credit Hours:") - credit_min = credit_hours_tag.find_next_sibling("span").text.strip() + credit_min = credit_hours_tag.next_sibling.text.strip() return { "courseReferenceNumber": crn, "subjectName": subj_name, From 2b176a88c6cb8b17634cae3826b9859ea74a0800 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 03:44:51 -0500 Subject: [PATCH 13/24] Enforce static dict keys in get_class_enrollment() The previous approach was an experiment that I tried for fun; after thinking about it, it's not great for data consistency over time: if SIS renamed one of these fields, the dynamically built keys would silently change in the output JSON rather than failing loudly in the scraper. --- sis_scraper/sis_api.py | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 9e4dd7c..0ded57b 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -359,11 +359,11 @@ async def get_class_enrollment( ``` { "enrollmentActual": 28, - "enrollmentMaximum": 30, - "enrollmentSeatsAvailable": 2, + "enrollmentCapacity": 30, + "enrollmentAvailable": 2, "waitlistActual": 0, - "waitlistMaximum": 10, - "waitlistSeatsAvailable": 10 + "waitlistCapacity": 10, + "waitlistAvailable": 10 } ``` """ @@ -377,26 +377,20 @@ async def get_class_enrollment( # content of the preceding tags. enrollment_data = {} span_tags = enrollment_tag.find_all("span") - # Dynamically create dictionary keys based on span text - for span_tag in span_tags: - span_text = span_tag.text.strip() - # Skip numeric span texts - if span_text.isdigit(): - continue - words = span_text.split() - # Skip empty span texts - if not words: - continue - first_word = words[0] - # Lowercase the entire first word if it's all uppercase, otherwise just lowercase - # the first character.
- if first_word.isupper(): - first_word = first_word.lower() - else: - first_word = first_word[0].lower() + first_word[1:] - # Construct the dictionary key - dict_key = first_word + "".join(word for word in words[1:]) - enrollment_data[dict_key] = int(span_tag.find_next_sibling("span").text.strip()) + for i, tag in enumerate(span_tags): + text = tag.text.strip() + if text == "Enrollment Actual:": + enrollment_data["enrollmentActual"] = int(span_tags[i + 1].text.strip()) + elif text == "Enrollment Maximum:": + enrollment_data["enrollmentCapacity"] = int(span_tags[i + 1].text.strip()) + elif text == "Enrollment Seats Available:": + enrollment_data["enrollmentAvailable"] = int(span_tags[i + 1].text.strip()) + elif text == "Waitlist Capacity:": + enrollment_data["waitlistCapacity"] = int(span_tags[i + 1].text.strip()) + elif text == "Waitlist Actual:": + enrollment_data["waitlistActual"] = int(span_tags[i + 1].text.strip()) + elif text == "Waitlist Seats Available:": + enrollment_data["waitlistAvailable"] = int(span_tags[i + 1].text.strip()) return enrollment_data From c0334207563cc9cc11837deb075b5cebe5719e3b Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 03:46:07 -0500 Subject: [PATCH 14/24] Refactor class faculty meeting processing Renamed get_class_meetings to get_class_faculty_meetings and updated its return format to include both faculty and meeting details. Refactored the processing logic to associate faculty with meetings, assign unique meeting IDs, and provide a more structured output. Improved error handling and logging for missing data. --- sis_scraper/sis_api.py | 161 ++++++++++++++++++++++++++--------------- 1 file changed, 103 insertions(+), 58 deletions(-) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 0ded57b..b2e7a62 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -741,7 +741,7 @@ async def get_class_crosslists( return crosslists -async def get_class_meetings( +async def get_class_faculty_meetings( session: aiohttp.ClientSession, term: str, crn: str, @@ -751,59 +751,63 @@ async def get_class_meetings( page. Returned data format is as follows: + ``` [ - { - "beginTime": "0800", - "endTime": "0950", - "creditHours": 4, - "campusCode": "T", - "campusDescription": "Troy", - "buildingCode": "SAGE", - "buildingDescription": "Russell Sage Laboratory", - "category": "L", - "room": "303", - "startDate": "01/15/2024", - "endDate": "05/01/2024", - "days": ["M", "W", "F"] - }, + "faculty": [ + { + "bannerId": "123456", + "displayName": "Last, First", + "emailAddress": "example@rpi.edu", + "allMeetings": [1, 2, ...], + "primaryMeetings": [1, ...] + }, + ... + ], + "meetings": [ + { + "id": 1, + "beginTime": "0800", + "endTime": "0950", + "creditHours": 4, + "campusCode": "T", + "campusDescription": "Troy", + "buildingCode": "SAGE", + "buildingDescription": "Russell Sage Laboratory", + "category": "L", + "room": "303", + "startDate": "01/15/2024", + "endDate": "05/01/2024", + "days": ["M", "W", "F"] + }, + ... + ] ... 
] + ``` """ url = _BASE_URL + "searchResults/getFacultyMeetingTimes" params = {"term": term, "courseReferenceNumber": crn} raw_data = await retry_get(session, url, params) json_data = json.loads(raw_data) json_data = html_unescape(json_data) - sis_meetings_list = json_data["fmt"] - meetings_list = _process_class_meetings(sis_meetings_list) + sis_faculty_meetings_list = json_data["fmt"] + meetings_list = _process_class_faculty_meetings( + sis_faculty_meetings_list, term, crn + ) return meetings_list -def _process_class_meetings( - sis_meetings_list: list[dict[str, Any]], +def _process_class_faculty_meetings( + sis_faculty_meetings_list: list[dict[str, Any]], + term: str, + crn: str, ) -> list[dict[str, Any]]: """ Processes raw class meeting data from SIS into a more usable format. - Returned data format is as follows: - [ - { - "beginTime": "0800", - "endTime": "0950", - "creditHours": 4, - "campusCode": "T", - "campusDescription": "Troy", - "buildingCode": "SAGE", - "buildingDescription": "Russell Sage Laboratory", - "category": "L", - "room": "303", - "startDate": "01/15/2024", - "endDate": "05/01/2024", - "days": ["M", "W", "F"] - }, - ... - ] + See get_class_faculty_meetings() for returned data format. """ + faculty_dict = {} meetings_list = [] day_codes = { "sunday": "U", @@ -814,24 +818,65 @@ def _process_class_meetings( "friday": "F", "saturday": "S", } - for meeting in sis_meetings_list: - sis_meeting_info = meeting["meetingTime"] - meeting_info = { - "beginTime": sis_meeting_info["beginTime"], - "endTime": sis_meeting_info["endTime"], - "creditHours": sis_meeting_info["creditHourSession"], - "campusCode": sis_meeting_info["campus"], - "campusDescription": sis_meeting_info["campusDescription"], - "buildingCode": sis_meeting_info["building"], - "buildingDescription": sis_meeting_info["buildingDescription"], - "category": sis_meeting_info["category"], - "room": sis_meeting_info["room"], - "startDate": sis_meeting_info["startDate"], - "endDate": sis_meeting_info["endDate"], - "days": [], - } - for day in day_codes: - if sis_meeting_info[day]: - meeting_info["days"].append(day_codes[day]) - meetings_list.append(meeting_info) - return meetings_list + for i, faculty_meeting in enumerate(sis_faculty_meetings_list, start=1): + sis_meeting_info = faculty_meeting["meetingTime"] + sis_faculty_list = faculty_meeting["faculty"] + if sis_meeting_info is None and sis_faculty_list is None: + logger.warning( + "Found faculty-meeting entry with no meeting info or faculty list for " + f"CRN {crn} in term {term}" + ) + continue + if sis_meeting_info is not None: + meeting_info = { + "id": i, + "beginTime": sis_meeting_info["beginTime"], + "endTime": sis_meeting_info["endTime"], + "creditHours": sis_meeting_info["creditHourSession"], + "campusCode": sis_meeting_info["campus"], + "campusDescription": sis_meeting_info["campusDescription"], + "buildingCode": sis_meeting_info["building"], + "buildingDescription": sis_meeting_info["buildingDescription"], + "category": sis_meeting_info["category"], + "room": sis_meeting_info["room"], + "startDate": sis_meeting_info["startDate"], + "endDate": sis_meeting_info["endDate"], + "days": [ + code for day, code in day_codes.items() if sis_meeting_info[day] + ], + } + meetings_list.append(meeting_info) + else: + logger.warning( + f"Found faculty-meeting entry with no meeting info for CRN {crn} " + f"in term {term}" + ) + if sis_faculty_list is not None: + for faculty in sis_faculty_list: + banner_id = faculty["bannerId"] + # Initialize faculty entry if it doesn't exist 
+ faculty_entry = faculty_dict.setdefault( + banner_id, + { + "displayName": faculty["displayName"], + "emailAddress": faculty["emailAddress"], + "allMeetings": [], + "primaryMeetings": [], + }, + ) + if sis_meeting_info: + # Add meeting ID to faculty's meeting lists + faculty_entry["allMeetings"].append(i) + if faculty["primaryIndicator"]: + faculty_entry["primaryMeetings"].append(i) + else: + logger.warning( + f"Found faculty-meeting entry with no faculty list for CRN {crn} " + f"in term {term}" + ) + # Convert faculty dictionary into a list + faculty_list = list(faculty_dict.values()) + return { + "faculty": faculty_list, + "meetings": meetings_list, + } From 3f5321d4559df3364fbd3ce70e97a4245d34fd78 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 03:46:47 -0500 Subject: [PATCH 15/24] Change some error logs to fatal logs In these cases, fatal makes more sense as the scraper is designed to not recover after these errors. --- sis_scraper/sis_scraper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 36e4c6f..22c9009 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -464,7 +464,7 @@ async def main( """ if output_data_dir is None: - logger.error("No data output directory specified") + logger.fatal("No data output directory specified") return False # Convert paths to Path objects if given as strings @@ -544,7 +544,7 @@ async def main( f"at {subject_code_name_map_path}" ) except Exception as e: - logger.error(f"Error loading code mapping files: {e}") + logger.fatal(f"Error loading code mapping files: {e}") import traceback traceback.print_exc() @@ -592,7 +592,7 @@ async def main( num_terms_processed += 1 except Exception as e: - logger.error(f"Error in SIS scraper: {e}") + logger.fatal(f"Error in SIS scraper: {e}") import traceback traceback.print_exc() From 5bbcfd6e46ba7c55928e9119b2a0718daa847b40 Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 03:50:51 -0500 Subject: [PATCH 16/24] Create global TCPConnector for aiohttp sessions Replaces per-call TCPConnector creation with a single shared aiohttp.TCPConnector instance, passed to all relevant functions. This reduces resource usage and allows for better connection pooling across parallel tasks. --- sis_scraper/sis_scraper.py | 55 +++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 22c9009..f0cbbb8 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -232,7 +232,7 @@ async def get_subj_course_data( restriction_code_name_map: dict[str, dict[str, str]] = None, attribute_code_name_map: dict[str, str] = None, semaphore: asyncio.Semaphore = asyncio.Semaphore(1), - limit_per_host: int = 5, + tcp_connector: aiohttp.TCPConnector = None, timeout: int = 30, ) -> dict[str, dict[str, Any]]: """ @@ -257,14 +257,10 @@ async def get_subj_course_data( @return: Dictionary of course data keyed by course code. 
""" async with semaphore: - # Limit simultaneous connections to SIS server per session - connector = aiohttp.TCPConnector( - ttl_dns_cache=500, limit_per_host=limit_per_host - ) timeout_obj = aiohttp.ClientTimeout(total=timeout) async with aiohttp.ClientSession( - connector=connector, timeout=timeout_obj + connector=tcp_connector, timeout=timeout_obj ) as session: try: # Reset search state on server before fetching class data @@ -303,7 +299,7 @@ async def get_term_course_data( restriction_code_name_map: dict[str, dict[str, str]] = None, attribute_code_name_map: dict[str, str] = None, semaphore: asyncio.Semaphore = asyncio.Semaphore(10), - limit_per_host: int = 5, + tcp_connector: aiohttp.TCPConnector = None, timeout: int = 30, ) -> None: """ @@ -374,7 +370,7 @@ async def get_term_course_data( restriction_code_name_map=restriction_code_name_map, attribute_code_name_map=attribute_code_name_map, semaphore=semaphore, - limit_per_host=limit_per_host, + tcp_connector=tcp_connector, timeout=timeout, ) ) @@ -562,25 +558,30 @@ async def main( tasks: list[asyncio.Task] = [] num_terms_processed = 0 try: - # Process terms in parallel - async with asyncio.TaskGroup() as tg: - for year in range(start_year, end_year + 1): - for season in seasons: - term = get_term_code(year, season) - if term == "": - continue - output_path = Path(output_data_dir) / f"{term}.json" - task = tg.create_task( - get_term_course_data( - term, - output_path=output_path, - subject_code_name_map=subject_code_name_map, - instructor_rcsid_name_map=instructor_rcsid_name_map, - restriction_code_name_map=restriction_code_name_map, - attribute_code_name_map=attribute_code_name_map, - semaphore=semaphore, - limit_per_host=limit_per_host, - timeout=timeout, + # Global TCP connector for all sessions + async with aiohttp.TCPConnector( + ttl_dns_cache=500, limit_per_host=limit_per_host + ) as tcp_connector: + # Process terms in parallel + async with asyncio.TaskGroup() as tg: + for year in range(start_year, end_year + 1): + for season in seasons: + term = get_term_code(year, season) + if term == "": + continue + output_path = Path(output_data_dir) / f"{term}.json" + task = tg.create_task( + get_term_course_data( + term, + output_path=output_path, + subject_code_name_map=subject_code_name_map, + instructor_rcsid_name_map=instructor_rcsid_name_map, + restriction_code_name_map=restriction_code_name_map, + attribute_code_name_map=attribute_code_name_map, + semaphore=semaphore, + tcp_connector=tcp_connector, + timeout=timeout, + ) ) ) tasks.append(task) From 019d3313cb90e7226600af88fe323e1792c2188f Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 03:51:43 -0500 Subject: [PATCH 17/24] Add support for hidden CRN detection and parsing Refactored process_class_details to support fetching class details by CRN and term when SIS class entry is not available. The course data structure is now keyed by subject description during processing and converted to subject code before output. Added logic to detect and process hidden crosslisted CRNs not present in the main class search. Improved parallelization and data consistency throughout the scraping process. 
--- sis_scraper/sis_scraper.py | 247 ++++++++++++++++++++++++++----------- 1 file changed, 178 insertions(+), 69 deletions(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index f0cbbb8..436b525 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -14,10 +14,12 @@ get_class_corequisites, get_class_crosslists, get_class_description, + get_class_details, + get_class_enrollment, + get_class_faculty_meetings, get_class_prerequisites, get_class_restrictions, get_term_subjects, - _process_class_meetings, reset_class_search, ) @@ -54,8 +56,11 @@ def get_term_code(year: str | int, season: str) -> str: async def process_class_details( session: aiohttp.ClientSession, - course_data: dict[str, Any], - sis_class_entry: dict[str, Any], + term_course_data: dict[str, Any], + term_crn_set: set[str], + sis_class_entry: dict[str, Any] | None = None, + term: str | None = None, + crn: str | None = None, instructor_rcsid_name_map: dict[str, str] = None, attribute_code_name_map: dict[str, str] = None, restriction_code_name_map: dict[str, dict[str, str]] = None, @@ -67,9 +72,10 @@ async def process_class_details( Takes as input class data fetched from SIS's class search endpoint. @param session: aiohttp client session to use for requests. - @param course_data: Dictionary to populate with course data. + @param term_course_data: Term course data dictionary to populate with class entries. @param sis_class_entry: Class data fetched from SIS's class search endpoint. + @param term_crn_set: Set of all CRNs processed in the term. @param known_rcsid_set: Optional set to populate with known instructor RCSIDs. @param attribute_code_name_map: Optional map to populate with attribute @@ -78,39 +84,50 @@ async def process_class_details( codes to names.
@return: None """ - course_num = sis_class_entry["courseNumber"] - term = sis_class_entry["term"] - crn = sis_class_entry["courseReferenceNumber"] - sis_meetings_list = sis_class_entry["meetingsFaculty"] + if sis_class_entry is None and (term is None or crn is None): + logger.error( + "Either sis_class_entry or both term and crn must be provided " + "to process_class_details" + ) + return - # Initialize course entry if not already present - if course_num not in course_data: - course_data[course_num] = [] + # Extract basic class details from SIS class entry if provided + if sis_class_entry is not None: + subject_desc = sis_class_entry["subjectDescription"] + course_num = sis_class_entry["courseNumber"] + term = sis_class_entry["term"] + crn = sis_class_entry["courseReferenceNumber"] + + # Add CRN to term CRN set + if crn in term_crn_set: + logger.warning(f"Duplicate CRN {crn} found in term {term}") + else: + term_crn_set.add(crn) # Initialize empty class entry class_entry = { - "courseReferenceNumber": sis_class_entry["courseReferenceNumber"], - "sectionNumber": sis_class_entry["sequenceNumber"], - "title": sis_class_entry["courseTitle"], + "courseReferenceNumber": crn, + "sectionNumber": "", + "title": "", "description": "", "attributes": [], "restrictions": {}, "prerequisites": [], "corequisites": [], "crosslists": [], - "creditMin": sis_class_entry["creditHourLow"], - "creditMax": sis_class_entry["creditHourHigh"], - "seatsCapacity": sis_class_entry["maximumEnrollment"], - "seatsRegistered": sis_class_entry["enrollment"], - "seatsAvailable": sis_class_entry["seatsAvailable"], - "waitlistCapacity": sis_class_entry["waitCapacity"], - "waitlistRegistered": sis_class_entry["waitCount"], - "waitlistAvailable": sis_class_entry["waitAvailable"], + "creditMin": -1, + "creditMax": -1, + "seatsCapacity": -1, + "seatsRegistered": -1, + "seatsAvailable": -1, + "waitlistCapacity": -1, + "waitlistRegistered": -1, + "waitlistAvailable": -1, "faculty": [], - "meetingInfo": _process_class_meetings(sis_meetings_list), + "meetingInfo": [], } - # Fetch class details not included in main class details + # Fetch class details not included in SIS class search async with asyncio.TaskGroup() as tg: description_task = tg.create_task(get_class_description(session, term, crn)) attributes_task = tg.create_task(get_class_attributes(session, term, crn)) @@ -118,6 +135,13 @@ async def process_class_details( prerequisites_task = tg.create_task(get_class_prerequisites(session, term, crn)) corequisites_task = tg.create_task(get_class_corequisites(session, term, crn)) crosslists_task = tg.create_task(get_class_crosslists(session, term, crn)) + faculty_meetings_task = tg.create_task( + get_class_faculty_meetings(session, term, crn) + ) + # Fetch full class details if not provided from SIS class search + if sis_class_entry is None: + details_task = tg.create_task(get_class_details(session, term, crn)) + enrollment_task = tg.create_task(get_class_enrollment(session, term, crn)) # Wait for tasks to complete and get results description_data = description_task.result() @@ -126,6 +150,15 @@ async def process_class_details( prerequisites_data = prerequisites_task.result() corequisites_data = corequisites_task.result() crosslists_data = crosslists_task.result() + faculty_meetings_data = faculty_meetings_task.result() + if sis_class_entry is None: + details_data = details_task.result() + enrollment_data = enrollment_task.result() + + # Extract subject and course number from full details if SIS class entry not provided + if 
sis_class_entry is None: + subject_desc = details_data["subjectName"] + course_num = details_data["courseNumber"] # Fill class entry with fetched details class_entry["description"] = description_data @@ -134,27 +167,42 @@ async def process_class_details( class_entry["prerequisites"] = prerequisites_data class_entry["corequisites"] = corequisites_data class_entry["crosslists"] = crosslists_data + class_entry["faculty"] = faculty_meetings_data["faculty"] + class_entry["meetingInfo"] = faculty_meetings_data["meetings"] + # Fill class entry with SIS class search data if provided + if sis_class_entry is not None: + class_entry["sectionNumber"] = sis_class_entry["sequenceNumber"] + class_entry["title"] = sis_class_entry["courseTitle"] + class_entry["creditMin"] = sis_class_entry["creditHourLow"] + class_entry["creditMax"] = sis_class_entry["creditHourHigh"] + class_entry["seatsCapacity"] = sis_class_entry["maximumEnrollment"] + class_entry["seatsRegistered"] = sis_class_entry["enrollment"] + class_entry["seatsAvailable"] = sis_class_entry["seatsAvailable"] + class_entry["waitlistCapacity"] = sis_class_entry["waitCapacity"] + class_entry["waitlistRegistered"] = sis_class_entry["waitCount"] + class_entry["waitlistAvailable"] = sis_class_entry["waitAvailable"] + else: + class_entry["sectionNumber"] = details_data["sectionNumber"] + class_entry["title"] = details_data["title"] + class_entry["creditMin"] = details_data["creditMin"] + class_entry["creditMax"] = details_data["creditMax"] + class_entry["seatsCapacity"] = enrollment_data["enrollmentCapacity"] + class_entry["seatsRegistered"] = enrollment_data["enrollmentActual"] + class_entry["seatsAvailable"] = enrollment_data["enrollmentAvailable"] + class_entry["waitlistCapacity"] = enrollment_data["waitlistCapacity"] + class_entry["waitlistRegistered"] = enrollment_data["waitlistActual"] + class_entry["waitlistAvailable"] = enrollment_data["waitlistAvailable"] + + # Get appropriate subject course data dictionary from term course data + subj_course_data = term_course_data[subject_desc]["courses"] + # Initialize course entry if not already present + if course_num not in subj_course_data: + subj_course_data[course_num] = [] - # Process instructor RCSIDs and names - class_faculty = class_entry["faculty"] - for instructor in sis_class_entry["faculty"]: - instructor_name = instructor["displayName"] - email_address = instructor["emailAddress"] - # Add faculty entry to class faculty list - class_faculty.append( - { - "bannerId": instructor["bannerId"], - "displayName": instructor_name, - "emailAddress": email_address, - "primaryFaculty": instructor["primaryIndicator"], - } - ) - if "emailAddress" not in instructor: - logger.warning( - f"Missing instructor email address field for CRN {crn} " - f"in term {term}: {instructor_name}" - ) - continue + # Process faculty RCSIDs and names + for faculty in class_entry["faculty"]: + faculty_name = faculty["displayName"] + email_address = faculty["emailAddress"] # Add faculty RCSID to known RCSID map if provided if ( email_address is not None @@ -162,10 +210,10 @@ async def process_class_details( and instructor_rcsid_name_map is not None ): rcsid = email_address.split("@")[0].lower() - instructor_rcsid_name_map[rcsid] = instructor_name + instructor_rcsid_name_map[rcsid] = faculty_name - # Append class entry to course data - course_data[course_num].append(class_entry) + # Add class entry to subject course data + subj_course_data[course_num].append(class_entry) # Add to attribute code-to-name map # Attributes are known to 
be in the format "Attribute Name CODE" @@ -227,7 +275,10 @@ async def process_class_details( async def get_subj_course_data( term: str, - subject: str, + subject_code: str, + subject_desc: str, + term_course_data: dict[str, dict[str, Any]], + term_crn_set: set[str], instructor_rcsid_name_map: dict[str, str] = None, restriction_code_name_map: dict[str, dict[str, str]] = None, attribute_code_name_map: dict[str, str] = None, @@ -243,6 +294,9 @@ async def get_subj_course_data( @param term: Term code to fetch data for. - @param subject: Subject code to fetch data for. + @param subject_code: Subject code to fetch data for. + @param subject_desc: Subject description for the subject code. + @param term_course_data: Term course data dictionary to populate. + @param term_crn_set: Set of all CRNs processed in the term. @param instructor_rcsid_name_map: Optional map to populate with instructor RCSIDs to names. @param restriction_code_name_map: Optional map to populate with restriction @@ -265,29 +317,40 @@ try: # Reset search state on server before fetching class data await reset_class_search(session, term) - class_data = await class_search(session, term, subject) - subj_class_data = {} + sis_class_data = await class_search(session, term, subject_code) + if len(sis_class_data) == 0: + logger.info( + f"No classes found for subject {subject_code} in term {term}" + ) + return {} + # Process class entries from the class search in parallel async with asyncio.TaskGroup() as tg: - for class_entry in class_data: + for sis_class_entry in sis_class_data: tg.create_task( process_class_details( session, - subj_class_data, - class_entry, + term_course_data, + term_crn_set, + sis_class_entry, instructor_rcsid_name_map=instructor_rcsid_name_map, restriction_code_name_map=restriction_code_name_map, attribute_code_name_map=attribute_code_name_map, ) ) + # Get subject course data from term course data + subj_course_data = term_course_data[subject_desc]["courses"] # Sort class entries by section number - for course_num in subj_class_data: - subj_class_data[course_num] = sorted( - subj_class_data[course_num], key=lambda x: x["sectionNumber"] + for course_num in subj_course_data: + subj_course_data[course_num] = sorted( + subj_course_data[course_num], + key=lambda class_entry: class_entry["sectionNumber"], ) # Return data sorted by course code - return dict(sorted(subj_class_data.items())) + return dict(sorted(subj_course_data.items())) except aiohttp.ClientError as e: - logger.error(f"Error processing subject {subject} in term {term}: {e}") + logger.error( + f"Error processing subject {subject_code} in term {term}: {e}" + ) return {} @@ -352,20 +415,27 @@ async def get_term_course_data( # Stores all course data for the term term_course_data = {} + # Stores all CRNs for the term + term_crn_set = set() + # Process subjects in parallel, each with its own session tasks: list[asyncio.Task] = [] try: async with asyncio.TaskGroup() as tg: for subject in subjects: subject_code = subject["code"] - term_course_data[subject_code] = { - "subjectName": subject["description"], + subject_desc = subject["description"] + term_course_data[subject_desc] = { + "subjectCode": subject_code, "courses": {}, } task = tg.create_task( get_subj_course_data( term, subject_code, + subject_desc, + term_course_data, + term_crn_set, instructor_rcsid_name_map=instructor_rcsid_name_map, restriction_code_name_map=restriction_code_name_map, attribute_code_name_map=attribute_code_name_map, ) ) @@ -385,12 +455,52 @@ # Wait for all tasks to complete and gather results for i, subject in enumerate(subjects): course_data = tasks[i].result() - term_course_data[subject["code"]]["courses"] = 
course_data + term_course_data[subject["description"]]["courses"] = course_data if len(term_course_data) == 0: return False - # Write all data for term to JSON file + # Check all crosslist CRNs in the term course data for any hidden classes not shown in + # the main class search and fetch their details. + async with semaphore: + async with aiohttp.ClientSession( + connector=tcp_connector, timeout=timeout_obj + ) as session: + hidden_crns = { + crosslist["courseReferenceNumber"] + for subject in term_course_data.values() + for course in subject["courses"].values() + for class_entry in course + for crosslist in class_entry["crosslists"] + if crosslist["courseReferenceNumber"] not in term_crn_set + } + if len(hidden_crns) > 0: + async with asyncio.TaskGroup() as tg: + for crn in hidden_crns: + tg.create_task( + process_class_details( + session, + term_course_data, + term_crn_set, + term=term, + crn=crn, + ) + ) + logger.info( + f"Processing hidden class with CRN {crn} in term {term}" + ) + + # Convert term course data to be keyed by subject code instead of description + term_course_data_by_code = {} + for subject_desc, data in term_course_data.items(): + subject_code = data["subjectCode"] + term_course_data_by_code[subject_code] = data + # Replace subject code field with subject description + del term_course_data_by_code[subject_code]["subjectCode"] + term_course_data_by_code[subject_code]["subjectDescription"] = subject_desc + term_course_data = term_course_data_by_code + + # Write all term data to JSON file if isinstance(output_path, str): output_path = Path(output_path) try: @@ -583,14 +693,13 @@ async def main( timeout=timeout, ) ) - ) - tasks.append(task) + tasks.append(task) - # Wait for all tasks to complete - for task in tasks: - success = task.result() - if success: - num_terms_processed += 1 + # Wait for all tasks to complete + for task in tasks: + success = task.result() + if success: + num_terms_processed += 1 except Exception as e: logger.fatal(f"Error in SIS scraper: {e}") From cc96208fed77eff708510d1243562afc151b0e2c Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 04:00:48 -0500 Subject: [PATCH 18/24] Improve error logging with full tracebacks Enhanced error logging in sis_scraper.py by including full exception tracebacks in log messages instead of printing them separately. This provides more detailed context for debugging and consolidates error information in the logs. 
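A minimal sketch of the logging pattern this commit applies, shown outside the scraper for clarity (`risky_scrape` is a hypothetical stand-in, not a function from this repo):

```
import logging
import traceback

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def risky_scrape() -> None:
    # Hypothetical stand-in for a scraper call that can fail
    raise ValueError("SIS returned malformed data")


try:
    risky_scrape()
except Exception as e:
    # Embed the full traceback in the log record itself instead of
    # printing it separately with traceback.print_exc()
    logger.error(f"Error in SIS scraper: {e}\n{traceback.format_exc()}")
```

`logger.exception(...)` inside an `except` block would attach the same traceback via `exc_info`; the explicit `traceback.format_exc()` used here keeps everything in a single formatted message.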
--- sis_scraper/sis_scraper.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 436b525..65f319e 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -390,7 +390,9 @@ async def get_term_course_data( """ timeout_obj = aiohttp.ClientTimeout(total=timeout) try: - async with aiohttp.ClientSession(timeout=timeout_obj) as session: + async with aiohttp.ClientSession( + connector=tcp_connector, timeout=timeout_obj + ) as session: subjects = await get_term_subjects(session, term) except aiohttp.ClientError as e: logger.error(f"Error fetching subjects for term {term}: {e}") @@ -446,10 +448,12 @@ ) tasks.append(task) except Exception as e: - logger.error(f"Error processing subjects for term {term}: {e}") import traceback - traceback.print_exc() + logger.error( + f"Error processing subjects for term {term}: {e}" + f"\n{traceback.format_exc()}" + ) return False @@ -650,10 +654,11 @@ f"at {subject_code_name_map_path}" ) except Exception as e: - logger.fatal(f"Error loading code mapping files: {e}") import traceback - traceback.print_exc() + logger.fatal( + f"Error loading code mapping files: {e}" f"\n{traceback.format_exc()}" + ) return False @@ -702,10 +707,9 @@ num_terms_processed += 1 except Exception as e: - logger.fatal(f"Error in SIS scraper: {e}") import traceback - traceback.print_exc() + logger.fatal(f"Error in SIS scraper: {e}\n{traceback.format_exc()}") return False # Write code maps to JSON files if code mapping paths are provided @@ -750,10 +754,9 @@ with subject_code_name_map_path.open("w", encoding="utf-8") as f: json.dump(subject_code_name_map, f, indent=4, ensure_ascii=False) except Exception as e: - logger.error(f"Error writing code mapping files: {e}") import traceback - traceback.print_exc() + logger.error(f"Error writing code mapping files: {e}\n{traceback.format_exc()}") return False end_time = time.time() From 05c4bd83a8dddf6b4e64d865ef7014014734c33d Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 04:04:32 -0500 Subject: [PATCH 19/24] Fix closed session errors in SIS scraper I didn't know that sessions take ownership of the TCPConnector given to them by default and automatically close it at the end of the session. This caused every other running session to crash due to the connector being closed. I fixed this by adding the connector_owner=False kwarg to every session instantiation. 
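A minimal sketch of the shared-connector pattern this fix enables (the URL is a placeholder): with `connector_owner=False`, closing one session no longer tears down the connector that other sessions still depend on:

```
import asyncio

import aiohttp


async def fetch(connector: aiohttp.TCPConnector, url: str) -> int:
    # connector_owner=False stops this session from closing the shared
    # connector when the session itself closes
    async with aiohttp.ClientSession(
        connector=connector, connector_owner=False
    ) as session:
        async with session.get(url) as resp:
            return resp.status


async def main() -> None:
    # One connector shared by several concurrent sessions
    async with aiohttp.TCPConnector(limit_per_host=5) as connector:
        statuses = await asyncio.gather(
            *(fetch(connector, "https://example.com") for _ in range(3))
        )
        print(statuses)


asyncio.run(main())
```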
--- sis_scraper/sis_scraper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 65f319e..26ad5b2 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -312,7 +312,7 @@ async def get_subj_course_data( timeout_obj = aiohttp.ClientTimeout(total=timeout) async with aiohttp.ClientSession( - connector=tcp_connector, timeout=timeout_obj + connector=tcp_connector, connector_owner=False, timeout=timeout_obj ) as session: try: # Reset search state on server before fetching class data @@ -391,7 +391,7 @@ async def get_term_course_data( timeout_obj = aiohttp.ClientTimeout(total=timeout) try: async with aiohttp.ClientSession( - connector=tcp_connector, timeout=timeout_obj + connector=tcp_connector, connector_owner=False, timeout=timeout_obj ) as session: subjects = await get_term_subjects(session, term) except aiohttp.ClientError as e: @@ -468,7 +468,7 @@ async def get_term_course_data( # the main class search and fetch their details. async with semaphore: async with aiohttp.ClientSession( - connector=tcp_connector, timeout=timeout_obj + connector=tcp_connector, connector_owner=False, timeout=timeout_obj ) as session: hidden_crns = { crosslist["courseReferenceNumber"] From 09c2dbb1e30250f39de30c4653439fa3f34a4cbe Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 04:08:24 -0500 Subject: [PATCH 20/24] Raise max simultaneous connections to 20 Renamed the 'semaphore_val' parameter to 'max_concurrent_sessions' for clarity and increased 'limit_per_host' from 5 to 20 to allow more simultaneous connections per host. Updated related docstrings and variable usage accordingly. --- sis_scraper/sis_scraper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 26ad5b2..23908f5 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -528,8 +528,8 @@ async def main( instructor_rcsid_name_map_path: Path | str | None = None, restriction_code_name_map_path: Path | str | None = None, subject_code_name_map_path: Path | str | None = None, - semaphore_val: int = 10, - limit_per_host: int = 5, + max_concurrent_sessions: int = 10, + limit_per_host: int = 20, timeout: int = 30, ) -> bool: """ @@ -565,7 +565,7 @@ async def main( mapping JSON file. @param subject_code_name_map_path: Path to load/save subject code mapping JSON file. - @param semaphore_val: Maximum number of concurrent client sessions to + @param max_concurrent_sessions: Maximum number of concurrent client sessions to spawn. @param limit_per_host: Maximum number of simultaneous connections a session can make to the SIS server. @@ -662,7 +662,7 @@ async def main( return False # Limit concurrent client sessions and simultaneous connections - semaphore = asyncio.Semaphore(semaphore_val) + semaphore = asyncio.Semaphore(max_concurrent_sessions) logger.info("Starting SIS scraper with settings:") logger.info(f" Years: {start_year} - {end_year}") From d4b79d39215bb77557c2b964c89883882e109b6a Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 04:23:48 -0500 Subject: [PATCH 21/24] Improve retry logging with URL and params Enhanced the retry warning log to include the request URL, parameters, and exception details for better debugging of failed requests. 
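The one-line lambda in the diff below is dense; an equivalent named `before_sleep` callback (a sketch that assumes the wrapped function receives `(session, url, params)` positionally, as `retry_get` does) can be easier to read:

```
import logging

from tenacity import RetryCallState

logger = logging.getLogger(__name__)


def log_retry(retry_state: RetryCallState) -> None:
    # retry_state.args holds the positional args of the wrapped call,
    # i.e. (session, url, params) for retry_get
    url = retry_state.args[1]
    params = retry_state.args[2]
    # Only the exception text falls back to "Unknown", never the
    # whole log message
    exc = "Unknown"
    if retry_state.outcome and retry_state.outcome.exception():
        exc = retry_state.outcome.exception()
    logger.warning(
        f"Retrying failed request (attempt {retry_state.attempt_number}) "
        f"for URL: {url} with params: {params} | Exception: {exc}"
    )
```

Passing `before_sleep=log_retry` to the tenacity decorator would behave like the lambda in the diff that follows.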
--- sis_scraper/sis_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index b2e7a62..a1c3f51 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -69,7 +69,10 @@ def html_unescape(obj: Any) -> Any: wait=wait_random_exponential(multiplier=1.5) + wait_random(min=0, max=2), retry=retry_if_exception_type((asyncio.TimeoutError, aiohttp.ClientError)), before_sleep=lambda retry_state: logger.warning( - f"Retrying failed request (attempt {retry_state.attempt_number})" + f"Retrying failed request (attempt {retry_state.attempt_number}) " + f"for URL: {getattr(retry_state.args[1], 'url', retry_state.args[1])} " + f"with params: {retry_state.args[2]} | " + f"Exception: {retry_state.outcome.exception() if retry_state.outcome and retry_state.outcome.exception() else 'Unknown'}" ), ) async def retry_get( From e24a2a4bcbb1ea79592e031e11f353b57902feda Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 05:42:35 -0500 Subject: [PATCH 22/24] Increase concurrency and update TCPConnector settings Raised max_concurrent_sessions to 25 and limit_per_host to 75 for improved parallelism. Added keepalive_timeout and force_close options to aiohttp.TCPConnector for better connection management. --- sis_scraper/sis_scraper.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 23908f5..7e9493e 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -528,8 +528,8 @@ async def main( instructor_rcsid_name_map_path: Path | str | None = None, restriction_code_name_map_path: Path | str | None = None, subject_code_name_map_path: Path | str | None = None, - max_concurrent_sessions: int = 10, - limit_per_host: int = 20, + max_concurrent_sessions: int = 25, + limit_per_host: int = 75, timeout: int = 30, ) -> bool: """ @@ -675,7 +675,10 @@ async def main( try: # Global TCP connector for all sessions async with aiohttp.TCPConnector( - ttl_dns_cache=500, limit_per_host=limit_per_host + ttl_dns_cache=500, + limit_per_host=limit_per_host, + keepalive_timeout=60, + force_close=False, ) as tcp_connector: # Process terms in parallel async with asyncio.TaskGroup() as tg: From b6cfeecae88eba9a3dd1126edda5dae566708e1f Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 16:11:06 -0500 Subject: [PATCH 23/24] Reorder "subjectDescription" field to be above course data Putting the subject description right underneath the subject code in the output JSON is much nicer to read. It was also the original plan. 
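A quick illustration of why the unpacking in the change below works (the values are made up): Python dicts preserve insertion order, so a key named before `**data` becomes the first key in the serialized JSON:

```
data = {"subjectCode": "CSCI", "courses": {"1100": []}}

# Naming "subjectDescription" before **data makes it the first key
reordered = {"subjectDescription": "Computer Science", **data}
del reordered["subjectCode"]

print(list(reordered))  # ['subjectDescription', 'courses']
```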
--- sis_scraper/sis_scraper.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 7e9493e..a9e62f0 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -498,10 +498,12 @@ async def get_term_course_data( term_course_data_by_code = {} for subject_desc, data in term_course_data.items(): subject_code = data["subjectCode"] - term_course_data_by_code[subject_code] = data - # Replace subject code field with subject description + term_course_data_by_code[subject_code] = { + "subjectDescription": subject_desc, + **data, + } + # Remove redundant subject code entry del term_course_data_by_code[subject_code]["subjectCode"] - term_course_data_by_code[subject_code]["subjectDescription"] = subject_desc term_course_data = term_course_data_by_code # Write all term data to JSON file From 80c5fab6d816ea188626e9f4f30d63abefbdb5ee Mon Sep 17 00:00:00 2001 From: Raymond Chen <42894676+ramonechen@users.noreply.github.com> Date: Fri, 26 Dec 2025 20:25:32 -0500 Subject: [PATCH 24/24] Refactor postprocess.py for new JSON structure Replaces individual codify functions and mapping logic with a new CodeMapper class that manages code-name mappings for subjects, attributes, restrictions, and instructors. Updates the main post-processing flow to use process_term and CodeMapper, improving maintainability, normalization, and consistency of code mapping and generation. Mapping files are now updated and saved after processing. --- sis_scraper/postprocess.py | 540 +++++++++++++++++++------------------ 1 file changed, 272 insertions(+), 268 deletions(-) diff --git a/sis_scraper/postprocess.py b/sis_scraper/postprocess.py index 10a0a6a..90bec58 100644 --- a/sis_scraper/postprocess.py +++ b/sis_scraper/postprocess.py @@ -7,197 +7,240 @@ logger = logging.getLogger(__name__) -def codify_course_code(course_code: str, subject_code_name_map: dict[str, str]) -> str: - """ - Codifies a course code from its full subject name and number to its subject code and - number. For example, "Computer Science 1010" becomes "CSCI 1010". - - @param course_code: The course code in the format `[Subject Name] [Course Number]`. - @param subject_code_name_map: A mapping of subject codes to subject full names. - @return: The codified course code in the format `[Subject Code] [Course Number]`, \ - or the original course code if the format is invalid. - """ - course_pattern = r"(.+) (\d{4})" - match = re.match(course_pattern, course_code) - if match is None or len(match.groups()) != 2: - logger.warning(f"Unexpected course code format: {course_code}") - return course_code - - subject_name = match.group(1) - course_number = match.group(2) - # Translate subject_name (full name) back to its code using subject_code_name_map - # subject_code_name_map: {code: name} - # We need to find the code whose value matches subject_name - code = next( - (k for k, v in subject_code_name_map.items() if v == subject_name), subject_name - ) - return f"{code} {course_number}" - - -def codify_attribute(attribute: str) -> str: - """ - Codifies an attribute from its full name and code to just its code. For example, - "Writing Intensive WI" becomes "WI". - - @param attribute: The attribute in the format `[Attribute Name] [Attribute Code]`. - @return: The codified attribute code, or the original attribute if the format is \ - invalid. 
- """ - attribute_pattern = r"(.+) (.+)" - match = re.match(attribute_pattern, attribute) - if match is None or len(match.groups()) != 2: - logger.warning(f"Unexpected attribute format: {attribute}") - return attribute - attribute_code = match.group(2) - return attribute_code - - -def codify_restriction(restriction: str) -> str: - """ - Codifies a restriction from its full name and code to just its code. For example, - "Graduate (GR)" becomes "GR". - - @param restriction: The restriction in the format \ - `[Restriction Name] ([Restriction Code])`. - @return: The codified restriction code, or the original restriction if the format \ - is invalid. - """ - restriction_pattern = r"(.+)\s*\((.+)\)" - match = re.match(restriction_pattern, restriction) - if match is None or len(match.groups()) != 2: - logger.warning(f"Unexpected restriction format: {restriction}") - return restriction - restriction_code = match.group(2) - return restriction_code - - -def generate_rcsid( - instructor_name: str, - instructor_rcsid_name_map: dict[str, str], - generated_instructor_rcsid_name_map: dict[str, str], -) -> str: - """ - Accepts an instructor name in the format `Last, First` and generates an RCSID. - Assumes the instructor name does not have an associated RCSID in the SIS data. - - @param instructor_name: The instructor name in the format `Last, First`. - @param instructor_rcsid_name_map: A mapping of existing instructor RCSIDs to names. - @param generated_instructor_rcsid_name_map: A mapping to store newly generated \ - instructor RCSIDs to names. - @return: The generated RCSID for the instructor. - """ - instructor_name_pattern = r"(.+), (.+)" - match = re.match(instructor_name_pattern, instructor_name) - if match is None or len(match.groups()) != 2: - logger.warning(f"Unexpected instructor name format: {instructor_name}") - return instructor_name - # An RCSID is composed of up to the first 5 letters of the last name, followed by - # the first name initial, as well as a number if needed to ensure uniqueness. - # For example, "Doe, John" would become "doej", or "doej2" if "doej" is taken. - last_name = match.group(1) - last_name_component = "" - # Extract up to first 5 alphabetic characters from last name - for char in last_name: - if char.isalpha(): - last_name_component += char.lower() - if len(last_name_component) == 5: - break - first_name = match.group(2) - # Extract first alphabetic character from first name - first_name_initial = "" - for char in first_name: - if char.isalpha(): - first_name_initial += char.lower() - break - rcsid = f"{last_name_component}{first_name_initial}" - # Ensure uniqueness - counter = 1 - while rcsid in instructor_rcsid_name_map: - rcsid = f"{last_name_component}{first_name_initial}{counter}" - counter += 1 - # The generated RCSID may already exist in the generated map, this is normal - generated_instructor_rcsid_name_map[rcsid] = instructor_name - return rcsid - - -def post_process( - term_course_data: dict[str, Any], - subject_code_name_map: dict[str, str], - instructor_rcsid_name_map: dict[str, str], - generated_instructor_rcsid_name_map: dict[str, str], -) -> None: - """ - Post-process the term course data by codifying course codes, attributes, - restrictions, and instructor RCSIDs. - - @param term_course_data: The term course data to post-process. - @param subject_code_name_map: A mapping of subject codes to subject full names. - @param instructor_rcsid_name_map: A mapping of existing instructor RCSIDs to names. 
- @param generated_instructor_rcsid_name_map: A mapping to store newly generated \ - instructor RCSIDs to names. - @return: None - """ - for _, subject_data in term_course_data.items(): - subject_courses = subject_data["courses"] - for _, course_data in subject_courses.items(): - course_detail = course_data["course_detail"] - course_corequisites = course_detail["corequisite"] - # course_prerequisites = course_detail["prerequisite"] - course_crosslists = course_detail["crosslist"] - course_attributes = course_detail["attributes"] - course_restriction_types = course_detail["restrictions"] - course_sections = course_detail["sections"] - - # Corequisites - for i, corequisite in enumerate(course_corequisites): - course_corequisites[i] = codify_course_code( - corequisite, subject_code_name_map - ) - - # Prerequisites - # Will implement when prerequisite parsing is done - # for i, prerequisite in enumerate(course_prerequisites): - # pass - - # Crosslists - for i, crosslist in enumerate(course_crosslists): - course_crosslists[i] = codify_course_code( - crosslist, subject_code_name_map - ) - - # Attributes - for i, attribute in enumerate(course_attributes): - course_attributes[i] = codify_attribute(attribute) - - # Restrictions - for restriction_type in course_restriction_types: - # Skip special approvals - if restriction_type == "special_approval": - continue - restriction_type_list = course_restriction_types[restriction_type] - for i, restriction in enumerate(restriction_type_list): - restriction_type_list[i] = codify_restriction(restriction) - - # Instructors - for section in course_sections: - instructor_list = section["instructor"] - instructor_pattern = r"(.+), (.+) \((.+)\)" - for i, instructor in enumerate(instructor_list): - match = re.match(instructor_pattern, instructor) - if match is None or len(match.groups()) != 3: - logger.warning( - f"Unexpected instructor name and RCSID format: {instructor}" - ) - continue - instructor_name = f"{match.group(1)}, {match.group(2)}" - instructor_rcsid = match.group(3) - if instructor_rcsid == "Unknown RCSID": - instructor_rcsid = generate_rcsid( - instructor_name, - instructor_rcsid_name_map, - generated_instructor_rcsid_name_map, - ) - instructor_list[i] = instructor_rcsid +class CodeMapper: + def __init__( + self, + attribute_path: Path | str, + instructor_path: Path | str, + restriction_path: Path | str, + subject_path: Path | str, + ) -> None: + self.attribute_path = Path(attribute_path) + self.instructor_path = Path(instructor_path) + self.restriction_path = Path(restriction_path) + self.subject_path = Path(subject_path) + + self.attributes = self._load_json(self.attribute_path) + self.instructors = self._load_json(self.instructor_path) + self.restrictions = self._load_json(self.restriction_path) + self._normalize_restrictions() + self.subjects = self._load_json(self.subject_path) + + # Reverse map for subject name to code lookup + self.subject_name_to_code = {v: k for k, v in self.subjects.items()} + + # Reverse map for instructor name to RCSID lookup + self.instructor_name_to_rcsid = {v: k for k, v in self.instructors.items()} + + def _normalize_restrictions(self) -> None: + normalized = {} + for r_type, codes in self.restrictions.items(): + target_type = r_type + if r_type.startswith("not_"): + target_type = r_type[4:] + if target_type not in normalized: + normalized[target_type] = {} + for code, name in codes.items(): + normalized[target_type][code] = name.strip() + self.restrictions = normalized + + def _load_json(self, path: Path | str) -> 
dict: + path = Path(path) + if path.exists() and not path.is_dir(): + try: + with path.open("r", encoding="utf-8") as f: + return json.load(f) + except json.JSONDecodeError as e: + logger.error(f"Error decoding JSON from {path}: {e}") + return {} + + def save(self) -> None: + self._save_json(self.attribute_path, self.attributes) + self._save_json(self.instructor_path, self.instructors) + self._save_json(self.restriction_path, self.restrictions) + self._save_json(self.subject_path, self.subjects) + + def _save_json(self, path: Path, data: dict) -> None: + # Sort keys for consistent output + sorted_data = dict(sorted(data.items())) + # For nested dicts (restrictions), sort inner keys too + if path == self.restriction_path: + sorted_data = {k: dict(sorted(v.items())) for k, v in sorted_data.items()} + + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as f: + json.dump(sorted_data, f, indent=4, ensure_ascii=False) + + def add_subject(self, code: str, name: str) -> None: + if code in self.subjects and self.subjects[code] != name: + logger.warning( + f"Conflicting subject name for code {code}: " + f"'{self.subjects[code]}' vs '{name}'" + ) + # Update code to name mapping regardless of whether a conflict exists + self.subjects[code] = name + self.subject_name_to_code[name] = code + + def add_attribute(self, code: str, name: str) -> None: + if code in self.attributes and self.attributes[code] != name: + logger.warning( + f"Conflicting attribute name for code {code}: " + f"'{self.attributes[code]}' vs '{name}'" + ) + # Update code to name mapping regardless of whether a conflict exists + self.attributes[code] = name + + def add_restriction(self, r_type: str, code: str, name: str) -> None: + if r_type.startswith("not_"): + r_type = r_type[4:] + if r_type not in self.restrictions: + self.restrictions[r_type] = {} + if ( + code in self.restrictions[r_type] + and self.restrictions[r_type][code] != name + ): + logger.warning( + f"Conflicting restriction name for type {r_type} code {code}: " + f"'{self.restrictions[r_type][code]}' vs '{name}'" + ) + # Update code to name mapping regardless of whether a conflict exists + self.restrictions[r_type][code] = name.strip() + + def add_instructor(self, rcsid: str, name: str) -> None: + if rcsid in self.instructors and self.instructors[rcsid] != name: + logger.warning( + f"Conflicting instructor name for RCSID {rcsid}: " + f"'{self.instructors[rcsid]}' vs '{name}'" + ) + # Update both forward and reverse mappings regardless of whether a + # conflict exists + self.instructors[rcsid] = name + self.instructor_name_to_rcsid[name] = rcsid + + def get_subject_code(self, name: str) -> str | None: + if name in self.subject_name_to_code: + return self.subject_name_to_code[name] + return None + + def get_or_generate_rcsid(self, name: str) -> str: + # Check if name already maps to an RCSID (reverse lookup) + if name in self.instructor_name_to_rcsid: + return self.instructor_name_to_rcsid[name] + # Otherwise, generate a new RCSID + return self._generate_rcsid(name) + + def _generate_rcsid(self, instructor_name: str) -> str: + instructor_name_pattern = r"(.+), (.+)" + match = re.match(instructor_name_pattern, instructor_name) + if match is None or len(match.groups()) != 2: + logger.warning(f"Unexpected instructor name format: {instructor_name}") + # Fallback: remove spaces and lowercase + return re.sub(r"\s+", "", instructor_name).lower()[:8] + last_name = match.group(1) + last_name_component = "" + # Extract up to first 5 alphabetic characters from last name + for char in last_name: + if 
char.isalpha(): + last_name_component += char.lower() + if len(last_name_component) == 5: + break + first_name = match.group(2) + # Extract first alphabetic character from first name + first_name_initial = "" + for char in first_name: + if char.isalpha(): + first_name_initial += char.lower() + break + rcsid = f"{last_name_component}{first_name_initial}" + # Ensure uniqueness against existing instructors + counter = 1 + original_rcsid = rcsid + while rcsid in self.instructors: + # The name was not found in the reverse map, so this is a new + # instructor; append a counter until the RCSID is unique + rcsid = f"{original_rcsid}{counter}" + counter += 1 + return rcsid + + +def process_term(term: str, term_data: dict[str, Any], mapper: CodeMapper): + for subject_code, subject_data in term_data.items(): + # Update Subject Map + if "subjectDescription" in subject_data: + mapper.add_subject(subject_code, subject_data["subjectDescription"]) + + if "courses" not in subject_data: + continue + + for _, class_list in subject_data["courses"].items(): + for class_entry in class_list: + # Attributes + if "attributes" in class_entry: + new_attributes = [] + for attr in class_entry["attributes"]: + # Parse "Name CODE"; the code is the last space-separated token + match = re.match(r"(.+) (.+)", attr) + if match: + name, code = match.groups() + mapper.add_attribute(code, name.strip()) + new_attributes.append(code) + else: + logger.warning( + f"Unexpected attribute format: '{attr}' " + f"for CRN {class_entry['courseReferenceNumber']} " + f"in term {term}" + ) + new_attributes.append(attr) + class_entry["attributes"] = new_attributes + + # Restrictions + if "restrictions" in class_entry: + for r_type, r_list in class_entry["restrictions"].items(): + if r_type == "special_approval": + continue + new_r_list = [] + for restriction in r_list: + # Parse "Name (Code)" + match = re.match(r"(.+)\s*\((.+)\)", restriction) + if match: + name, code = match.groups() + mapper.add_restriction(r_type, code, name.strip()) + new_r_list.append(code) + else: + new_r_list.append(restriction) + class_entry["restrictions"][r_type] = new_r_list + + # Faculty + if "faculty" in class_entry: + new_faculty = [] + for faculty in class_entry["faculty"]: + name = faculty["displayName"] + email = faculty["emailAddress"] + rcsid = None + if email: + rcsid = email.split("@")[0].lower() + if not rcsid and name: + rcsid = mapper.get_or_generate_rcsid(name) + if rcsid and name: + mapper.add_instructor(rcsid, name) + if rcsid: + new_faculty.append(rcsid) + elif name: + new_faculty.append(name) + else: + new_faculty.append(str(faculty)) + + class_entry["faculty"] = new_faculty + + # Crosslists & Corequisites + for field in ["crosslists", "corequisites"]: + if field in class_entry: + new_list = [] + for item in class_entry[field]: + subj_name = item["subjectName"] + course_num = item["courseNumber"] + subj_code = mapper.get_subject_code(subj_name) + # Fall back to the subject name if no code is known + new_list.append(f"{subj_code or subj_name} {course_num}") + class_entry[field] = new_list def main( @@ -210,70 +253,28 @@ ) -> bool: """ Runs post-processing on the raw output data from the SIS scraper. This includes - codifying course codes, attributes, restrictions, and instructor RCSIDs. - - @param output_data_dir: Directory containing raw output data from the SIS scraper. - @param processed_output_data_dir: Directory to write processed output data to. - @param attribute_code_name_map_path: Path to the attribute code-name mapping file. 
- @param instructor_rcsid_name_map_path: Path to the instructor RCSID-name mapping - file. - @param restriction_code_name_map_path: Path to the restriction code-name mapping - file. - @param subject_code_name_map_path: Path to the subject code-name mapping file. - @return: True if post-processing was successful, False otherwise. + codifying course codes, attributes, restrictions, and instructor RCSIDs, + and updating the code mappings. """ - # Validate input directories - if not all( - ( - output_data_dir, - processed_output_data_dir, - attribute_code_name_map_path, - instructor_rcsid_name_map_path, - restriction_code_name_map_path, - subject_code_name_map_path, - ) - ): - logger.error("One or more required directories are not specified.") - return False - - # Convert to Path objects if necessary - if isinstance(output_data_dir, str): - output_data_dir = Path(output_data_dir) - if isinstance(processed_output_data_dir, str): - processed_output_data_dir = Path(processed_output_data_dir) - if isinstance(attribute_code_name_map_path, str): - attribute_code_name_map_path = Path(attribute_code_name_map_path) - if isinstance(instructor_rcsid_name_map_path, str): - instructor_rcsid_name_map_path = Path(instructor_rcsid_name_map_path) - if isinstance(restriction_code_name_map_path, str): - restriction_code_name_map_path = Path(restriction_code_name_map_path) - if isinstance(subject_code_name_map_path, str): - subject_code_name_map_path = Path(subject_code_name_map_path) - - # Validate input directories - if not output_data_dir.exists() or not output_data_dir.is_dir(): + # Convert to Path objects + output_data_dir = Path(output_data_dir) + processed_output_data_dir = Path(processed_output_data_dir) + attribute_code_name_map_path = Path(attribute_code_name_map_path) + instructor_rcsid_name_map_path = Path(instructor_rcsid_name_map_path) + restriction_code_name_map_path = Path(restriction_code_name_map_path) + subject_code_name_map_path = Path(subject_code_name_map_path) + + if not output_data_dir.exists(): logger.error(f"Output data directory {output_data_dir} does not exist.") return False - # Validate mapping files - for map_path in [ + # Initialize code mapper + mapper = CodeMapper( attribute_code_name_map_path, instructor_rcsid_name_map_path, restriction_code_name_map_path, subject_code_name_map_path, - ]: - if not map_path.exists() or map_path.is_dir(): - logger.error(f"Mapping file {map_path} does not exist or is a directory.") - return False - - # Load code mappings - with instructor_rcsid_name_map_path.open("r", encoding="utf-8") as f: - instructor_rcsid_name_map = json.load(f) - with subject_code_name_map_path.open("r", encoding="utf-8") as f: - subject_code_name_map = json.load(f) - - # Initialize generated instructor RCSID map - generated_instructor_rcsid_name_map = {} + ) processed_output_data_dir.mkdir(exist_ok=True, parents=True) @@ -282,32 +283,35 @@ def main( with term_file.open("r", encoding="utf-8") as f: term_course_data = json.load(f) - post_process( - term_course_data, - subject_code_name_map, - instructor_rcsid_name_map, - generated_instructor_rcsid_name_map, - ) + process_term(term_file.stem, term_course_data, mapper) # Write processed data processed_file_path = processed_output_data_dir / term_file.name - logger.info(f"Writing processed data to {processed_file_path}") with processed_file_path.open("w", encoding="utf-8") as f: + logger.info(f"Writing processed data to {processed_file_path}") json.dump(term_course_data, f, indent=4, ensure_ascii=False) - # Write generated 
instructor RCSID map - if len(generated_instructor_rcsid_name_map) > 0: - generated_map_path = ( - instructor_rcsid_name_map_path.parent - / "generated_instructor_rcsid_name_map.json" - ) - logger.info( - f"Writing {len(generated_instructor_rcsid_name_map)} generated " - f"instructor RCSID mappings to {generated_map_path}" - ) - with generated_map_path.open("w", encoding="utf-8") as f: - json.dump( - generated_instructor_rcsid_name_map, f, indent=4, ensure_ascii=False - ) + # Save updated mappings + num_attribute_codes = len(mapper.attributes) + num_instructor_rcsids = len(mapper.instructors) + num_restriction_codes = sum(len(codes) for codes in mapper.restrictions.values()) + num_subject_codes = len(mapper.subjects) + logger.info( + f"Saving {num_attribute_codes} attribute codes to " + + str(attribute_code_name_map_path) + ) + logger.info( + f"Saving {num_instructor_rcsids} instructor RCSIDs to " + + str(instructor_rcsid_name_map_path) + ) + logger.info( + f"Saving {num_restriction_codes} restriction codes to " + + str(restriction_code_name_map_path) + ) + logger.info( + f"Saving {num_subject_codes} subject codes to " + + str(subject_code_name_map_path) + ) + mapper.save() return True
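For reference, a hedged usage sketch of the refactored entry point; every path below is hypothetical and would need to match the repo's actual data layout:

```
from pathlib import Path

from postprocess import main  # sis_scraper/postprocess.py

# Hypothetical paths; missing mapping files are treated as empty maps
ok = main(
    output_data_dir=Path("data/raw"),
    processed_output_data_dir=Path("data/processed"),
    attribute_code_name_map_path=Path("maps/attribute_code_name_map.json"),
    instructor_rcsid_name_map_path=Path("maps/instructor_rcsid_name_map.json"),
    restriction_code_name_map_path=Path("maps/restriction_code_name_map.json"),
    subject_code_name_map_path=Path("maps/subject_code_name_map.json"),
)
print("post-processing succeeded" if ok else "post-processing failed")
```

The `CodeMapper` loads whatever mappings already exist at those paths, updates them as `process_term` walks each term file, and `mapper.save()` writes the sorted maps back out.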