diff --git a/sis_scraper/postprocess.py b/sis_scraper/postprocess.py index 10a0a6a..90bec58 100644 --- a/sis_scraper/postprocess.py +++ b/sis_scraper/postprocess.py @@ -7,197 +7,240 @@ logger = logging.getLogger(__name__) -def codify_course_code(course_code: str, subject_code_name_map: dict[str, str]) -> str: - """ - Codifies a course code from its full subject name and number to its subject code and - number. For example, "Computer Science 1010" becomes "CSCI 1010". - - @param course_code: The course code in the format `[Subject Name] [Course Number]`. - @param subject_code_name_map: A mapping of subject codes to subject full names. - @return: The codified course code in the format `[Subject Code] [Course Number]`, \ - or the original course code if the format is invalid. - """ - course_pattern = r"(.+) (\d{4})" - match = re.match(course_pattern, course_code) - if match is None or len(match.groups()) != 2: - logger.warning(f"Unexpected course code format: {course_code}") - return course_code - - subject_name = match.group(1) - course_number = match.group(2) - # Translate subject_name (full name) back to its code using subject_code_name_map - # subject_code_name_map: {code: name} - # We need to find the code whose value matches subject_name - code = next( - (k for k, v in subject_code_name_map.items() if v == subject_name), subject_name - ) - return f"{code} {course_number}" - - -def codify_attribute(attribute: str) -> str: - """ - Codifies an attribute from its full name and code to just its code. For example, - "Writing Intensive WI" becomes "WI". - - @param attribute: The attribute in the format `[Attribute Name] [Attribute Code]`. - @return: The codified attribute code, or the original attribute if the format is \ - invalid. - """ - attribute_pattern = r"(.+) (.+)" - match = re.match(attribute_pattern, attribute) - if match is None or len(match.groups()) != 2: - logger.warning(f"Unexpected attribute format: {attribute}") - return attribute - attribute_code = match.group(2) - return attribute_code - - -def codify_restriction(restriction: str) -> str: - """ - Codifies a restriction from its full name and code to just its code. For example, - "Graduate (GR)" becomes "GR". - - @param restriction: The restriction in the format \ - `[Restriction Name] ([Restriction Code])`. - @return: The codified restriction code, or the original restriction if the format \ - is invalid. - """ - restriction_pattern = r"(.+)\s*\((.+)\)" - match = re.match(restriction_pattern, restriction) - if match is None or len(match.groups()) != 2: - logger.warning(f"Unexpected restriction format: {restriction}") - return restriction - restriction_code = match.group(2) - return restriction_code - - -def generate_rcsid( - instructor_name: str, - instructor_rcsid_name_map: dict[str, str], - generated_instructor_rcsid_name_map: dict[str, str], -) -> str: - """ - Accepts an instructor name in the format `Last, First` and generates an RCSID. - Assumes the instructor name does not have an associated RCSID in the SIS data. - - @param instructor_name: The instructor name in the format `Last, First`. - @param instructor_rcsid_name_map: A mapping of existing instructor RCSIDs to names. - @param generated_instructor_rcsid_name_map: A mapping to store newly generated \ - instructor RCSIDs to names. - @return: The generated RCSID for the instructor. - """ - instructor_name_pattern = r"(.+), (.+)" - match = re.match(instructor_name_pattern, instructor_name) - if match is None or len(match.groups()) != 2: - logger.warning(f"Unexpected instructor name format: {instructor_name}") - return instructor_name - # An RCSID is composed of up to the first 5 letters of the last name, followed by - # the first name initial, as well as a number if needed to ensure uniqueness. - # For example, "Doe, John" would become "doej", or "doej2" if "doej" is taken. - last_name = match.group(1) - last_name_component = "" - # Extract up to first 5 alphabetic characters from last name - for char in last_name: - if char.isalpha(): - last_name_component += char.lower() - if len(last_name_component) == 5: - break - first_name = match.group(2) - # Extract first alphabetic character from first name - first_name_initial = "" - for char in first_name: - if char.isalpha(): - first_name_initial += char.lower() - break - rcsid = f"{last_name_component}{first_name_initial}" - # Ensure uniqueness - counter = 1 - while rcsid in instructor_rcsid_name_map: - rcsid = f"{last_name_component}{first_name_initial}{counter}" - counter += 1 - # The generated RCSID may already exist in the generated map, this is normal - generated_instructor_rcsid_name_map[rcsid] = instructor_name - return rcsid - - -def post_process( - term_course_data: dict[str, Any], - subject_code_name_map: dict[str, str], - instructor_rcsid_name_map: dict[str, str], - generated_instructor_rcsid_name_map: dict[str, str], -) -> None: - """ - Post-process the term course data by codifying course codes, attributes, - restrictions, and instructor RCSIDs. - - @param term_course_data: The term course data to post-process. - @param subject_code_name_map: A mapping of subject codes to subject full names. - @param instructor_rcsid_name_map: A mapping of existing instructor RCSIDs to names. - @param generated_instructor_rcsid_name_map: A mapping to store newly generated \ - instructor RCSIDs to names. - @return: None - """ - for _, subject_data in term_course_data.items(): - subject_courses = subject_data["courses"] - for _, course_data in subject_courses.items(): - course_detail = course_data["course_detail"] - course_corequisites = course_detail["corequisite"] - # course_prerequisites = course_detail["prerequisite"] - course_crosslists = course_detail["crosslist"] - course_attributes = course_detail["attributes"] - course_restriction_types = course_detail["restrictions"] - course_sections = course_detail["sections"] - - # Corequisites - for i, corequisite in enumerate(course_corequisites): - course_corequisites[i] = codify_course_code( - corequisite, subject_code_name_map - ) - - # Prerequisites - # Will implement when prerequisite parsing is done - # for i, prerequisite in enumerate(course_prerequisites): - # pass - - # Crosslists - for i, crosslist in enumerate(course_crosslists): - course_crosslists[i] = codify_course_code( - crosslist, subject_code_name_map - ) - - # Attributes - for i, attribute in enumerate(course_attributes): - course_attributes[i] = codify_attribute(attribute) - - # Restrictions - for restriction_type in course_restriction_types: - # Skip special approvals - if restriction_type == "special_approval": - continue - restriction_type_list = course_restriction_types[restriction_type] - for i, restriction in enumerate(restriction_type_list): - restriction_type_list[i] = codify_restriction(restriction) - - # Instructors - for section in course_sections: - instructor_list = section["instructor"] - instructor_pattern = r"(.+), (.+) \((.+)\)" - for i, instructor in enumerate(instructor_list): - match = re.match(instructor_pattern, instructor) - if match is None or len(match.groups()) != 3: - logger.warning( - f"Unexpected instructor name and RCSID format: {instructor}" - ) - continue - instructor_name = f"{match.group(1)}, {match.group(2)}" - instructor_rcsid = match.group(3) - if instructor_rcsid == "Unknown RCSID": - instructor_rcsid = generate_rcsid( - instructor_name, - instructor_rcsid_name_map, - generated_instructor_rcsid_name_map, - ) - instructor_list[i] = instructor_rcsid +class CodeMapper: + def __init__( + self, + attribute_path: Path | str, + instructor_path: Path | str, + restriction_path: Path | str, + subject_path: Path | str, + ) -> None: + self.attribute_path = Path(attribute_path) + self.instructor_path = Path(instructor_path) + self.restriction_path = Path(restriction_path) + self.subject_path = Path(subject_path) + + self.attributes = self._load_json(self.attribute_path) + self.instructors = self._load_json(self.instructor_path) + self.restrictions = self._load_json(self.restriction_path) + self._normalize_restrictions() + self.subjects = self._load_json(self.subject_path) + + # Reverse map for subject name to code lookup + self.subject_name_to_code = {v: k for k, v in self.subjects.items()} + + # Reverse map for instructor name to RCSID lookup + self.instructor_name_to_rcsid = {v: k for k, v in self.instructors.items()} + + def _normalize_restrictions(self) -> None: + normalized = {} + for r_type, codes in self.restrictions.items(): + target_type = r_type + if r_type.startswith("not_"): + target_type = r_type[4:] + if target_type not in normalized: + normalized[target_type] = {} + for code, name in codes.items(): + normalized[target_type][code] = name.strip() + self.restrictions = normalized + + def _load_json(self, path: Path | str) -> dict: + path = Path(path) + if path.exists() and not path.is_dir(): + try: + with path.open("r", encoding="utf-8") as f: + return json.load(f) + except json.JSONDecodeError as e: + logger.error(f"Error decoding JSON from {path}: {e}") + return {} + + def save(self) -> None: + self._save_json(self.attribute_path, self.attributes) + self._save_json(self.instructor_path, self.instructors) + self._save_json(self.restriction_path, self.restrictions) + self._save_json(self.subject_path, self.subjects) + + def _save_json(self, path: Path, data: dict) -> None: + # Sort keys for consistent output + sorted_data = dict(sorted(data.items())) + # For nested dicts (restrictions), sort inner keys too + if path == self.restriction_path: + sorted_data = {k: dict(sorted(v.items())) for k, v in sorted_data.items()} + + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as f: + json.dump(sorted_data, f, indent=4, ensure_ascii=False) + + def add_subject(self, code: str, name: str) -> None: + if code in self.subjects and self.subjects[code] != name: + logging.warning( + f"Conflicting subject name for code {code}: " + f"'{self.subjects[code]}' vs '{name}'" + ) + # Update code to name mapping regardless of whether a conflict exists + self.subjects[code] = name + self.subject_name_to_code[name] = code + + def add_attribute(self, code: str, name: str) -> None: + if code in self.attributes and self.attributes[code] != name: + logging.warning( + f"Conflicting attribute name for code {code}: " + f"'{self.attributes[code]}' vs '{name}'" + ) + # Update code to name mapping regardless of whether a conflict exists + self.attributes[code] = name + + def add_restriction(self, r_type: str, code: str, name: str) -> None: + if r_type.startswith("not_"): + r_type = r_type[4:] + if r_type not in self.restrictions: + self.restrictions[r_type] = {} + if ( + code in self.restrictions[r_type] + and self.restrictions[r_type][code] != name + ): + logging.warning( + f"Conflicting restriction name for type {r_type} code {code}: " + f"'{self.restrictions[r_type][code]}' vs '{name}'" + ) + # Update code to name mapping regardless of whether a conflict exists + self.restrictions[r_type][code] = name.strip() + + def add_instructor(self, rcsid: str, name: str) -> None: + if rcsid in self.instructors and self.instructors[rcsid] != name: + logging.warning( + f"Conflicting instructor name for RCSID {rcsid}: " + f"'{self.instructors[rcsid]}' vs '{name}'" + ) + # Update RCSID to name mapping regardless of whether a conflict exists + self.instructors[rcsid] = name + + def get_subject_code(self, name: str) -> str | None: + if name in self.subject_name_to_code: + return self.subject_name_to_code[name] + return None + + def get_or_generate_rcsid(self, name: str) -> str: + # Check if name already maps to an RCSID (reverse lookup) + if name in self.instructor_name_to_rcsid: + return self.instructor_name_to_rcsid[name] + # Otherwise, generate a new RCSID + return self._generate_rcsid(name) + + def _generate_rcsid(self, instructor_name: str) -> str: + instructor_name_pattern = r"(.+), (.+)" + match = re.match(instructor_name_pattern, instructor_name) + if match is None or len(match.groups()) != 2: + logger.warning(f"Unexpected instructor name format: {instructor_name}") + # Fallback: remove spaces and lowercase + return re.sub(r"\s+", "", instructor_name).lower()[:8] + last_name = match.group(1) + last_name_component = "" + # Extract up to first 5 alphabetic characters from last name + for char in last_name: + if char.isalpha(): + last_name_component += char.lower() + if len(last_name_component) == 5: + break + first_name = match.group(2) + # Extract first alphabetic character from first name + first_name_initial = "" + for char in first_name: + if char.isalpha(): + first_name_initial += char.lower() + break + rcsid = f"{last_name_component}{first_name_initial}" + # Ensure uniqueness against existing instructors + counter = 1 + original_rcsid = rcsid + while rcsid in self.instructors: + # If the name matches, we can reuse this RCSID (handled in get_or_generate_rcsid) + # But here we are generating a NEW one because we didn't find the name. + # So we must ensure uniqueness. + rcsid = f"{original_rcsid}{counter}" + counter += 1 + return rcsid + + +def process_term(term: str, term_data: dict[str, Any], mapper: CodeMapper): + for subject_code, subject_data in term_data.items(): + # Update Subject Map + if "subjectDescription" in subject_data: + mapper.add_subject(subject_code, subject_data["subjectDescription"]) + + if "courses" not in subject_data: + continue + + for _, class_list in subject_data["courses"].items(): + for class_entry in class_list: + # Attributes + if "attributes" in class_entry: + new_attributes = [] + for attr in class_entry["attributes"]: + # Parse "Name Code" (two spaces) + match = re.match(r"(.+) (.+)", attr) + if match: + name, code = match.groups() + mapper.add_attribute(code, name.strip()) + new_attributes.append(code) + else: + logger.warning( + f"Unexpected attribute format: '{attr}' " + f"for CRN {class_entry['courseReferenceNumber']} " + f"in term {term}" + ) + new_attributes.append(attr) + class_entry["attributes"] = new_attributes + + # Restrictions + if "restrictions" in class_entry: + for r_type, r_list in class_entry["restrictions"].items(): + if r_type == "special_approval": + continue + new_r_list = [] + for restriction in r_list: + # Parse "Name (Code)" + match = re.match(r"(.+)\s*\((.+)\)", restriction) + if match: + name, code = match.groups() + mapper.add_restriction(r_type, code, name.strip()) + new_r_list.append(code) + else: + new_r_list.append(restriction) + class_entry["restrictions"][r_type] = new_r_list + + # Faculty + if "faculty" in class_entry: + new_faculty = [] + for faculty in class_entry["faculty"]: + name = faculty["displayName"] + email = faculty["emailAddress"] + rcsid = None + if email: + rcsid = email.split("@")[0] + if not rcsid and name: + rcsid = mapper.get_or_generate_rcsid(name) + if rcsid and name: + mapper.add_instructor(rcsid, name) + if rcsid: + new_faculty.append(rcsid) + elif name: + new_faculty.append(name) + else: + new_faculty.append(str(faculty)) + + class_entry["faculty"] = new_faculty + + # Crosslists & Corequisites + for field in ["crosslists", "corequisites"]: + if field in class_entry: + new_list = [] + for item in class_entry[field]: + subj_name = item["subjectName"] + course_num = item["courseNumber"] + subj_code = mapper.get_subject_code(subj_name) + new_list.append(f"{subj_code} {course_num}") + class_entry[field] = new_list def main( @@ -210,70 +253,28 @@ def main( ) -> bool: """ Runs post-processing on the raw output data from the SIS scraper. This includes - codifying course codes, attributes, restrictions, and instructor RCSIDs. - - @param output_data_dir: Directory containing raw output data from the SIS scraper. - @param processed_output_data_dir: Directory to write processed output data to. - @param attribute_code_name_map_path: Path to the attribute code-name mapping file. - @param instructor_rcsid_name_map_path: Path to the instructor RCSID-name mapping - file. - @param restriction_code_name_map_path: Path to the restriction code-name mapping - file. - @param subject_code_name_map_path: Path to the subject code-name mapping file. - @return: True if post-processing was successful, False otherwise. + codifying course codes, attributes, restrictions, and instructor RCSIDs, + and updating the code mappings. """ - # Validate input directories - if not all( - ( - output_data_dir, - processed_output_data_dir, - attribute_code_name_map_path, - instructor_rcsid_name_map_path, - restriction_code_name_map_path, - subject_code_name_map_path, - ) - ): - logger.error("One or more required directories are not specified.") - return False - - # Convert to Path objects if necessary - if isinstance(output_data_dir, str): - output_data_dir = Path(output_data_dir) - if isinstance(processed_output_data_dir, str): - processed_output_data_dir = Path(processed_output_data_dir) - if isinstance(attribute_code_name_map_path, str): - attribute_code_name_map_path = Path(attribute_code_name_map_path) - if isinstance(instructor_rcsid_name_map_path, str): - instructor_rcsid_name_map_path = Path(instructor_rcsid_name_map_path) - if isinstance(restriction_code_name_map_path, str): - restriction_code_name_map_path = Path(restriction_code_name_map_path) - if isinstance(subject_code_name_map_path, str): - subject_code_name_map_path = Path(subject_code_name_map_path) - - # Validate input directories - if not output_data_dir.exists() or not output_data_dir.is_dir(): + # Convert to Path objects + output_data_dir = Path(output_data_dir) + processed_output_data_dir = Path(processed_output_data_dir) + attribute_code_name_map_path = Path(attribute_code_name_map_path) + instructor_rcsid_name_map_path = Path(instructor_rcsid_name_map_path) + restriction_code_name_map_path = Path(restriction_code_name_map_path) + subject_code_name_map_path = Path(subject_code_name_map_path) + + if not output_data_dir.exists(): logger.error(f"Output data directory {output_data_dir} does not exist.") return False - # Validate mapping files - for map_path in [ + # Initialize code mapper + mapper = CodeMapper( attribute_code_name_map_path, instructor_rcsid_name_map_path, restriction_code_name_map_path, subject_code_name_map_path, - ]: - if not map_path.exists() or map_path.is_dir(): - logger.error(f"Mapping file {map_path} does not exist or is a directory.") - return False - - # Load code mappings - with instructor_rcsid_name_map_path.open("r", encoding="utf-8") as f: - instructor_rcsid_name_map = json.load(f) - with subject_code_name_map_path.open("r", encoding="utf-8") as f: - subject_code_name_map = json.load(f) - - # Initialize generated instructor RCSID map - generated_instructor_rcsid_name_map = {} + ) processed_output_data_dir.mkdir(exist_ok=True, parents=True) @@ -282,32 +283,35 @@ def main( with term_file.open("r", encoding="utf-8") as f: term_course_data = json.load(f) - post_process( - term_course_data, - subject_code_name_map, - instructor_rcsid_name_map, - generated_instructor_rcsid_name_map, - ) + process_term(term_file.stem, term_course_data, mapper) # Write processed data processed_file_path = processed_output_data_dir / term_file.name - logger.info(f"Writing processed data to {processed_file_path}") with processed_file_path.open("w", encoding="utf-8") as f: + logger.info(f"Writing processed data to {processed_file_path}") json.dump(term_course_data, f, indent=4, ensure_ascii=False) - # Write generated instructor RCSID map - if len(generated_instructor_rcsid_name_map) > 0: - generated_map_path = ( - instructor_rcsid_name_map_path.parent - / "generated_instructor_rcsid_name_map.json" - ) - logger.info( - f"Writing {len(generated_instructor_rcsid_name_map)} generated " - f"instructor RCSID mappings to {generated_map_path}" - ) - with generated_map_path.open("w", encoding="utf-8") as f: - json.dump( - generated_instructor_rcsid_name_map, f, indent=4, ensure_ascii=False - ) + # Save updated mappings + num_attribute_codes = len(mapper.attributes) + num_instructor_rcsids = len(mapper.instructors) + num_restriction_codes = sum(len(codes) for codes in mapper.restrictions.values()) + num_subject_codes = len(mapper.subjects) + logger.info( + f"Saving {num_attribute_codes} attribute codes to " + + str(attribute_code_name_map_path) + ) + logger.info( + f"Saving {num_instructor_rcsids} instructor RCSIDs to " + + str(instructor_rcsid_name_map_path) + ) + logger.info( + f"Saving {num_restriction_codes} restriction codes to " + + str(restriction_code_name_map_path) + ) + logger.info( + f"Saving {num_subject_codes} subject codes to " + + str(subject_code_name_map_path) + ) + mapper.save() return True diff --git a/sis_scraper/sis_api.py b/sis_scraper/sis_api.py index 0c157f6..a1c3f51 100644 --- a/sis_scraper/sis_api.py +++ b/sis_scraper/sis_api.py @@ -69,7 +69,12 @@ def html_unescape(obj: Any) -> Any: wait=wait_random_exponential(multiplier=1.5) + wait_random(min=0, max=2), retry=retry_if_exception_type((asyncio.TimeoutError, aiohttp.ClientError)), before_sleep=lambda retry_state: logger.warning( - f"Retrying failed request (attempt {retry_state.attempt_number})" + f"Retrying failed request (attempt {retry_state.attempt_number}) " + f"for URL: {getattr(retry_state.args[1], 'url', retry_state.args[1])} " + f"with params: {retry_state.args[2]} | " + f"Exception: {retry_state.outcome.exception()}" + if retry_state.outcome and retry_state.outcome.exception() + else "Unknown" ), ) async def retry_get( @@ -272,6 +277,55 @@ async def class_search( return course_data +async def get_class_details( + session: aiohttp.ClientSession, term: str, crn: str +) -> dict[str, Any]: + """ + Fetches and parses data from the "Details" tab of a class details page. + + Returned data format is as follows: + ``` + { + "courseReferenceNumber": "12345", + "subjectName": "Computer Science", + "courseNumber": "1100", + "title": "COMPUTER SCIENCE I", + "sectionNumber": "01", + "creditMin": 4, + "creditMax": None + } + ``` + """ + url = _BASE_URL + "searchResults/getClassDetails" + params = {"term": term, "courseReferenceNumber": crn} + raw_data = await retry_get(session, url, params) + raw_data = html_unescape(raw_data) + soup = bs4.BeautifulSoup(raw_data, "html5lib") + details_tag = soup.find("section", {"aria-labelledby": "classDetails"}) + crn = details_tag.find("span", {"id": "courseReferenceNumber"}).text.strip() + section_num = details_tag.find("span", {"id": "sectionNumber"}).text.strip() + subj_name = details_tag.find("span", {"id": "subject"}).text.strip() + course_num = details_tag.find("span", {"id": "courseDisplay"}).text.strip() + title = details_tag.find("span", {"id": "courseTitle"}).text.strip() + # Only courses with a credit range have a span with id "credit-hours-discretion", + # otherwise the credit hours span follows a span with text "Credit Hours:". + credit_min, credit_max = None, None + if credit_hours_tag := details_tag.find("span", {"id": "credit-hours-discretion"}): + credit_min, credit_max = credit_hours_tag.text.strip().split(" TO ") + else: + credit_hours_tag = details_tag.find("span", text="Credit Hours:") + credit_min = credit_hours_tag.next_sibling.text.strip() + return { + "courseReferenceNumber": crn, + "subjectName": subj_name, + "courseNumber": course_num, + "title": title, + "sectionNumber": section_num, + "creditMin": int(credit_min) if credit_min is not None else None, + "creditMax": int(credit_max) if credit_max is not None else None, + } + + async def get_class_description( session: aiohttp.ClientSession, term: str, crn: str ) -> str: @@ -300,6 +354,51 @@ async def get_class_description( return text +async def get_class_enrollment( + session: aiohttp.ClientSession, term: str, crn: str +) -> dict[str, Any]: + """ + Fetches and parses data from the "Enrollment/Waitlist" tab of a class details page. + + Returned data format is as follows: + ``` + { + "enrollmentActual": 28, + "enrollmentCapacity": 30, + "enrollmentAvailable": 2, + "waitlistActual": 0, + "waitlistCapacity": 10, + "waitlistAvailable": 10 + } + ``` + """ + url = _BASE_URL + "searchResults/getEnrollmentInfo" + params = {"term": term, "courseReferenceNumber": crn} + raw_data = await retry_get(session, url, params) + raw_data = html_unescape(raw_data) + soup = bs4.BeautifulSoup(raw_data, "html5lib") + enrollment_tag = soup.find("section", {"aria-labelledby": "enrollmentInfo"}) + # There are no relevant classes or ids on the spans, so we have to rely on the text + # content of the preceding tags. + enrollment_data = {} + span_tags = enrollment_tag.find_all("span") + for i, tag in enumerate(span_tags): + text = tag.text.strip() + if text == "Enrollment Actual:": + enrollment_data["enrollmentActual"] = int(span_tags[i + 1].text.strip()) + elif text == "Enrollment Maximum:": + enrollment_data["enrollmentCapacity"] = int(span_tags[i + 1].text.strip()) + elif text == "Enrollment Seats Available:": + enrollment_data["enrollmentAvailable"] = int(span_tags[i + 1].text.strip()) + elif text == "Waitlist Capacity:": + enrollment_data["waitlistCapacity"] = int(span_tags[i + 1].text.strip()) + elif text == "Waitlist Actual:": + enrollment_data["waitlistActual"] = int(span_tags[i + 1].text.strip()) + elif text == "Waitlist Seats Available:": + enrollment_data["waitlistAvailable"] = int(span_tags[i + 1].text.strip()) + return enrollment_data + + async def get_class_attributes( session: aiohttp.ClientSession, term: str, crn: str ) -> list[str]: @@ -309,9 +408,9 @@ async def get_class_attributes( Returned data format is as follows: ``` [ - "Attribute 1", - "Attribute 2", - "Attribute 3", + "Communication Intensive COMM", + "Data Intensive I DI1", + "Introductory Level Course FRSH", ... ] ``` @@ -336,12 +435,23 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn: Returned data format is as follows: ``` { - "major": ["Allowed Major 1", ...], - "not_major": ["Disallowed Major 1", ...], - "level": ["Allowed Level 1", ...], - "not_level": ["Disallowed Level 1", ...], - "classification": ["Allowed Classification 1", ...], - "not_classification": ["Disallowed Classification 1", ...] + "major": [ + "Architecture (ARCH)", + ... + ], + "not_major": [ + "Computer Science (CSCI)", + ... + ], + "classification": [ + "Freshman (FR)", + ... + ], + "not_classification": [ + "Senior (SR)", + ... + ], + ... } ``` """ @@ -350,7 +460,7 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn: raw_data = await retry_get(session, url, params) raw_data = html_unescape(raw_data) soup = bs4.BeautifulSoup(raw_data, "html5lib") - # Dynamically build restrictions_data dict structure from RESTRICTION_TYPE_MAP values + # Dynamically build dict structure from RESTRICTION_TYPE_MAP values restrictions_data = {} bases = set(_RESTRICTION_TYPE_MAP.values()) for base in sorted(bases): @@ -526,8 +636,11 @@ async def get_class_corequisites( Returned data format is as follows: ``` [ - "Computer Science 1100", - "Mathematics 1010", + { + "subjectName": "Computer Science", + "courseNumber": "1100", + "title": "COMPUTER SCIENCE I" + }, ... ] ``` @@ -546,7 +659,7 @@ async def get_class_corequisites( if not coreqs_thead or not coreqs_tbody: return [] thead_cols = [th.text.strip() for th in coreqs_thead.find_all("th")] - # Known corequisite columns are Subject, Course, and Title + # Known corequisite columns are Subject, Course Number, and Title if len(thead_cols) != 3: logger.warning( f"Unexpected number of corequisite columns for CRN {crn} in term {term}" @@ -561,9 +674,10 @@ async def get_class_corequisites( f"CRN {crn} in term {term}" ) continue - subject = cols[0] - course_num = cols[1] - coreqs.append(f"{subject} {course_num}") + subject, course_num, title = cols + coreqs.append( + {"subjectName": subject, "courseNumber": course_num, "title": title} + ) return coreqs @@ -571,7 +685,7 @@ async def get_class_crosslists( session: aiohttp.ClientSession, term: str, crn: str, -): +) -> list[dict[str, Any]]: """ Fetches and parses data from the "Cross Listed" tab of a class details page. @@ -579,8 +693,13 @@ async def get_class_crosslists( Returned data format is as follows: ``` [ - "Computer Science 1100", - "Mathematics 1010", + { + "courseReferenceNumber": "12345", + "subjectName": "Computer Science", + "courseNumber": "1100", + "title": "COMPUTER SCIENCE I", + "sectionNumber": "01" + }, ... ] ``` @@ -614,7 +733,155 @@ async def get_class_crosslists( f"CRN {crn} in term {term}" ) continue - subject = cols[1] - code = cols[2] - crosslists.append(f"{subject} {code}") + crn, subject, course_num, title, section_num = cols + crosslists.append( + { + "courseReferenceNumber": crn, + "subjectName": subject, + "courseNumber": course_num, + "title": title, + "sectionNumber": section_num, + } + ) return crosslists + + +async def get_class_faculty_meetings( + session: aiohttp.ClientSession, + term: str, + crn: str, +) -> list[dict[str, Any]]: + """ + Fetches and parses data from the "Instructor/Meeting Times" tab of a class details + page. + + Returned data format is as follows: + ``` + [ + "faculty": [ + { + "bannerId": "123456", + "displayName": "Last, First", + "emailAddress": "example@rpi.edu", + "allMeetings": [1, 2, ...], + "primaryMeetings": [1, ...] + }, + ... + ], + "meetings": [ + { + "id": 1, + "beginTime": "0800", + "endTime": "0950", + "creditHours": 4, + "campusCode": "T", + "campusDescription": "Troy", + "buildingCode": "SAGE", + "buildingDescription": "Russell Sage Laboratory", + "category": "L", + "room": "303", + "startDate": "01/15/2024", + "endDate": "05/01/2024", + "days": ["M", "W", "F"] + }, + ... + ] + ... + ] + ``` + """ + url = _BASE_URL + "searchResults/getFacultyMeetingTimes" + params = {"term": term, "courseReferenceNumber": crn} + raw_data = await retry_get(session, url, params) + json_data = json.loads(raw_data) + json_data = html_unescape(json_data) + sis_faculty_meetings_list = json_data["fmt"] + meetings_list = _process_class_faculty_meetings( + sis_faculty_meetings_list, term, crn + ) + return meetings_list + + +def _process_class_faculty_meetings( + sis_faculty_meetings_list: list[dict[str, Any]], + term: str, + crn: str, +) -> list[dict[str, Any]]: + """ + Processes raw class meeting data from SIS into a more usable format. + + See get_class_faculty_meetings() for returned data format. + """ + faculty_dict = {} + meetings_list = [] + day_codes = { + "sunday": "U", + "monday": "M", + "tuesday": "T", + "wednesday": "W", + "thursday": "R", + "friday": "F", + "saturday": "S", + } + for i, faculty_meeting in enumerate(sis_faculty_meetings_list, start=1): + sis_meeting_info = faculty_meeting["meetingTime"] + sis_faculty_list = faculty_meeting["faculty"] + if sis_meeting_info is None and sis_faculty_list is None: + logger.warning( + "Found faculty-meeting entry with no meeting info or faculty list for " + f"CRN {crn} in term {term}" + ) + continue + if sis_meeting_info is not None: + meeting_info = { + "id": i, + "beginTime": sis_meeting_info["beginTime"], + "endTime": sis_meeting_info["endTime"], + "creditHours": sis_meeting_info["creditHourSession"], + "campusCode": sis_meeting_info["campus"], + "campusDescription": sis_meeting_info["campusDescription"], + "buildingCode": sis_meeting_info["building"], + "buildingDescription": sis_meeting_info["buildingDescription"], + "category": sis_meeting_info["category"], + "room": sis_meeting_info["room"], + "startDate": sis_meeting_info["startDate"], + "endDate": sis_meeting_info["endDate"], + "days": [ + code for day, code in day_codes.items() if sis_meeting_info[day] + ], + } + meetings_list.append(meeting_info) + else: + logger.warning( + f"Found faculty-meeting entry with no meeting info for CRN {crn} " + f"in term {term}" + ) + if sis_faculty_list is not None: + for faculty in sis_faculty_list: + banner_id = faculty["bannerId"] + # Initialize faculty entry if it doesn't exist + faculty_entry = faculty_dict.setdefault( + banner_id, + { + "displayName": faculty["displayName"], + "emailAddress": faculty["emailAddress"], + "allMeetings": [], + "primaryMeetings": [], + }, + ) + if sis_meeting_info: + # Add meeting ID to faculty's meeting lists + faculty_entry["allMeetings"].append(i) + if faculty["primaryIndicator"]: + faculty_entry["primaryMeetings"].append(i) + else: + logger.warning( + f"Found faculty-meeting entry with no faculty list for CRN {crn} " + f"in term {term}" + ) + # Convert faculty dictionary into a list + faculty_list = list(faculty_dict.values()) + return { + "faculty": faculty_list, + "meetings": meetings_list, + } diff --git a/sis_scraper/sis_scraper.py b/sis_scraper/sis_scraper.py index 3323267..a9e62f0 100644 --- a/sis_scraper/sis_scraper.py +++ b/sis_scraper/sis_scraper.py @@ -14,6 +14,9 @@ get_class_corequisites, get_class_crosslists, get_class_description, + get_class_details, + get_class_enrollment, + get_class_faculty_meetings, get_class_prerequisites, get_class_restrictions, get_term_subjects, @@ -53,8 +56,11 @@ def get_term_code(year: str | int, season: str) -> str: async def process_class_details( session: aiohttp.ClientSession, - course_data: dict[str, Any], - class_entry: dict[str, Any], + term_course_data: dict[str, Any], + term_crn_set: set[str], + sis_class_entry: dict[str, Any] | None = None, + term: str | None = None, + crn: str | None = None, instructor_rcsid_name_map: dict[str, str] = None, attribute_code_name_map: dict[str, str] = None, restriction_code_name_map: dict[str, dict[str, str]] = None, @@ -66,8 +72,10 @@ async def process_class_details( Takes as input class data fetched from SIS's class search endpoint. @param session: aiohttp client session to use for requests. - @param course_data: Dictionary to populate with course data. - @param class_entry: Class data fetched from SIS's class search endpoint. + @param subj_class_data: Subject course data dictionary to populate with class entries. + @param sis_class_entry: Class data fetched from SIS's class search + endpoint. + @param term_crn_set: Set of all CRNs processed in the term. @param known_rcsid_set: Optional set to populate with known instructor RCSIDs. @param attribute_code_name_map: Optional map to populate with attribute @@ -76,189 +84,206 @@ async def process_class_details( codes to names. @return: None """ - # Example course code: CSCI 1100 - course_code = f"{class_entry['subject']} {class_entry['courseNumber']}" - term = class_entry["term"] - crn = class_entry["courseReferenceNumber"] - - # Fetch class details not included in main class details - # Only fetch if course not already in course data - if course_code not in course_data: - - # Initialize empty course entry - course_data[course_code] = { - "course_name": class_entry["courseTitle"], - "course_detail": { - "description": "", - "corequisite": [], - "prerequisite": [], - "crosslist": [], - "attributes": [], - "restrictions": [], - "credits": { - "min": float("inf"), - "max": 0, - }, - "sections": [], - }, - } - - async with asyncio.TaskGroup() as tg: - description_task = tg.create_task(get_class_description(session, term, crn)) - attributes_task = tg.create_task(get_class_attributes(session, term, crn)) - restrictions_task = tg.create_task( - get_class_restrictions(session, term, crn) - ) - prerequisites_task = tg.create_task( - get_class_prerequisites(session, term, crn) - ) - corequisites_task = tg.create_task( - get_class_corequisites(session, term, crn) - ) - crosslists_task = tg.create_task(get_class_crosslists(session, term, crn)) - - # Wait for tasks to complete and get results - description_data = description_task.result() - attributes_data = attributes_task.result() - restrictions_data = restrictions_task.result() - prerequisites_data = prerequisites_task.result() - # TODO: Filter out self-references from prerequisites - corequisites_data = corequisites_task.result() - corequisites_data = list( - filter( - lambda data: data.split()[-1] != class_entry["courseNumber"] - or " ".join(data.split()[:-1]) != class_entry["subjectDescription"], - corequisites_data, - ) - ) - crosslists_data = crosslists_task.result() - crosslists_data = list( - filter( - lambda data: data.split()[-1] != class_entry["courseNumber"] - or " ".join(data.split()[:-1]) != class_entry["subjectDescription"], - crosslists_data, - ) + if sis_class_entry is None and (term is None or crn is None): + logger.error( + "Either sis_class_entry or both term and crn must be provided " + "to process_class_details" ) + return + + # Extract basic class details from SIS class entry if provided + if sis_class_entry is not None: + subject_desc = sis_class_entry["subjectDescription"] + course_num = sis_class_entry["courseNumber"] + term = sis_class_entry["term"] + crn = sis_class_entry["courseReferenceNumber"] + + # Add CRN to term CRN set + if crn in term_crn_set: + logger.warning(f"Duplicate CRN {crn} found in term {term}") + else: + term_crn_set.add(crn) + + # Initialize empty class entry + class_entry = { + "courseReferenceNumber": crn, + "sectionNumber": "", + "title": "", + "description": "", + "attributes": [], + "restrictions": {}, + "prerequisites": [], + "corequisites": [], + "crosslists": [], + "creditMin": -1, + "creditMax": -1, + "seatsCapacity": -1, + "seatsRegistered": -1, + "seatsAvailable": -1, + "waitlistCapacity": -1, + "waitlistRegistered": -1, + "waitlistAvailable": -1, + "faculty": [], + "meetingInfo": [], + } - # Build attribute code to name map - # Attributes are known to be in the format "Attribute Name CODE" - # Note the double space between name and code - if attribute_code_name_map is not None: - for attribute in attributes_data: - attribute_split = attribute.split() - if len(attribute_split) < 2: - logger.warning( - f"Skipping unexpected attribute format for CRN {crn} " - f"in term {term}: {attribute}" - ) + # Fetch class details not included in SIS class search + async with asyncio.TaskGroup() as tg: + description_task = tg.create_task(get_class_description(session, term, crn)) + attributes_task = tg.create_task(get_class_attributes(session, term, crn)) + restrictions_task = tg.create_task(get_class_restrictions(session, term, crn)) + prerequisites_task = tg.create_task(get_class_prerequisites(session, term, crn)) + corequisites_task = tg.create_task(get_class_corequisites(session, term, crn)) + crosslists_task = tg.create_task(get_class_crosslists(session, term, crn)) + faculty_meetings_task = tg.create_task( + get_class_faculty_meetings(session, term, crn) + ) + # Fetch full class details if not provided from SIS class search + if sis_class_entry is None: + details_task = tg.create_task(get_class_details(session, term, crn)) + enrollment_task = tg.create_task(get_class_enrollment(session, term, crn)) + + # Wait for tasks to complete and get results + description_data = description_task.result() + attributes_data = attributes_task.result() + restrictions_data = restrictions_task.result() + prerequisites_data = prerequisites_task.result() + corequisites_data = corequisites_task.result() + crosslists_data = crosslists_task.result() + faculty_meetings_data = faculty_meetings_task.result() + if sis_class_entry is None: + details_data = details_task.result() + enrollment_data = enrollment_task.result() + + # Extract subject and course number from full details if SIS class entry not provided + if sis_class_entry is None: + subject_desc = details_data["subjectName"] + course_num = details_data["courseNumber"] + + # Fill class entry with fetched details + class_entry["description"] = description_data + class_entry["attributes"] = attributes_data + class_entry["restrictions"] = restrictions_data + class_entry["prerequisites"] = prerequisites_data + class_entry["corequisites"] = corequisites_data + class_entry["crosslists"] = crosslists_data + class_entry["faculty"] = faculty_meetings_data["faculty"] + class_entry["meetingInfo"] = faculty_meetings_data["meetings"] + # Fill class entry with SIS class search data if provided + if sis_class_entry is not None: + class_entry["sectionNumber"] = sis_class_entry["sequenceNumber"] + class_entry["title"] = sis_class_entry["courseTitle"] + class_entry["creditMin"] = sis_class_entry["creditHourLow"] + class_entry["creditMax"] = sis_class_entry["creditHourHigh"] + class_entry["seatsCapacity"] = sis_class_entry["maximumEnrollment"] + class_entry["seatsRegistered"] = sis_class_entry["enrollment"] + class_entry["seatsAvailable"] = sis_class_entry["seatsAvailable"] + class_entry["waitlistCapacity"] = sis_class_entry["waitCapacity"] + class_entry["waitlistRegistered"] = sis_class_entry["waitCount"] + class_entry["waitlistAvailable"] = sis_class_entry["waitAvailable"] + else: + class_entry["sectionNumber"] = details_data["sectionNumber"] + class_entry["title"] = details_data["title"] + class_entry["creditMin"] = details_data["creditMin"] + class_entry["creditMax"] = details_data["creditMax"] + class_entry["seatsCapacity"] = enrollment_data["enrollmentCapacity"] + class_entry["seatsRegistered"] = enrollment_data["enrollmentActual"] + class_entry["seatsAvailable"] = enrollment_data["enrollmentAvailable"] + class_entry["waitlistCapacity"] = enrollment_data["waitlistCapacity"] + class_entry["waitlistRegistered"] = enrollment_data["waitlistActual"] + class_entry["waitlistAvailable"] = enrollment_data["waitlistAvailable"] + + # Get appropriate subject course data dictionary from term course data + subj_course_data = term_course_data[subject_desc]["courses"] + # Initialize course entry if not already present + if course_num not in subj_course_data: + subj_course_data[course_num] = [] + + # Process faculty RCSIDs and names + for faculty in class_entry["faculty"]: + faculty_name = faculty["displayName"] + email_address = faculty["emailAddress"] + # Add faculty RCSID to known RCSID map if provided + if ( + email_address is not None + and email_address.endswith("@rpi.edu") + and instructor_rcsid_name_map is not None + ): + rcsid = email_address.split("@")[0].lower() + instructor_rcsid_name_map[rcsid] = faculty_name + + # Add class entry to subject course data + subj_course_data[course_num].append(class_entry) + + # Add to attribute code-to-name map + # Attributes are known to be in the format "Attribute Name CODE" + # Note the double space between name and code + if attribute_code_name_map is not None: + for attribute in attributes_data: + attribute_split = attribute.split() + if len(attribute_split) < 2: + logger.warning( + f"Skipping unexpected attribute format for CRN {crn} " + f"in term {term}: {attribute}" + ) + continue + attribute_code = attribute_split[-1].strip() + attribute_name = " ".join(attribute_split[:-1]).strip() + if ( + attribute_code in attribute_code_name_map + and attribute_code_name_map[attribute_code] != attribute_name + ): + logger.warning( + f"Conflicting attribute names for {attribute_code} " + f"in term {term}: " + f"{attribute_code_name_map[attribute_code]} vs. {attribute_name}" + ) + attribute_code_name_map[attribute_code] = attribute_name + + # Add to restriction code-to-name map + # Restrictions are known to be in the format "Restriction Name (CODE)" except + # for special approvals, which are handled explicitly as a special case. + if restriction_code_name_map is not None: + restriction_pattern = r"(.*)\((.*)\)" + for restriction_type in restrictions_data: + restriction_type = restriction_type.lower().replace("not_", "") + if restriction_type not in restriction_code_name_map: + restriction_code_name_map[restriction_type] = {} + for restriction in restrictions_data[restriction_type]: + restriction_match = re.match(restriction_pattern, restriction) + if restriction_match is None or len(restriction_match.groups()) < 2: + # Skip unexpected restriction formats or special approvals continue - attribute_code = attribute_split[-1].strip() - attribute_name = " ".join(attribute_split[:-1]).strip() + restriction_name = restriction_match.group(1).strip() + restriction_code = restriction_match.group(2).strip() if ( - attribute_code in attribute_code_name_map - and attribute_code_name_map[attribute_code] != attribute_name + restriction_name in restriction_code_name_map[restriction_type] + and restriction_code_name_map[restriction_type][restriction_code] + != restriction_name ): logger.warning( - f"Conflicting attribute names for {attribute_code} " + f"Conflicting restriction names for {restriction_code} " f"in term {term}: " - f"{attribute_code_name_map[attribute_code]} vs. {attribute_name}" + f"{restriction_code_name_map[ + restriction_type + ][restriction_code]} vs. {restriction_name}" ) - attribute_code_name_map[attribute_code] = attribute_name - - # Build restriction code to name map - # Restrictions are known to be in the format "Restriction Name (CODE)" except - # for special approvals, which are handled explicitly as a special case. - if restriction_code_name_map is not None: - restriction_pattern = r"(.*)\((.*)\)" - for restriction_type in restrictions_data: - restriction_type = restriction_type.lower().replace("not_", "") - if restriction_type not in restriction_code_name_map: - restriction_code_name_map[restriction_type] = {} - for restriction in restrictions_data[restriction_type]: - restriction_match = re.match(restriction_pattern, restriction) - if restriction_match is None or len(restriction_match.groups()) < 2: - # Skip unexpected restriction formats or special approvals - continue - restriction_name = restriction_match.group(1).strip() - restriction_code = restriction_match.group(2).strip() - if ( - restriction_name in restriction_code_name_map[restriction_type] - and restriction_code_name_map[restriction_type][ - restriction_code - ] - != restriction_name - ): - logger.warning( - f"Conflicting restriction names for {restriction_code} " - f"in term {term}: " - f"{restriction_code_name_map[ - restriction_type - ][restriction_code]} vs. {restriction_name}" - ) - restriction_code_name_map[restriction_type][ - restriction_code - ] = restriction_name - - # Initialize course entry with details - course_details = course_data[course_code]["course_detail"] - course_details["description"] = description_data - course_details["attributes"] = attributes_data - course_details["restrictions"] = restrictions_data - course_details["prerequisite"] = prerequisites_data - course_details["corequisite"] = list(set(corequisites_data)) - course_details["crosslist"] = list(set(crosslists_data)) - - course_details = course_data[course_code]["course_detail"] - - course_credits = course_details["credits"] - course_credits["min"] = min( - course_credits["min"], class_entry["creditHourLow"] or 0 - ) - course_credits["max"] = max( - course_credits["max"], - class_entry["creditHourLow"] or 0, - class_entry["creditHourHigh"] or 0, - ) - - course_sections = course_details["sections"] - class_faculty = class_entry["faculty"] - class_faculty_rcsids = [] - for instructor in class_faculty: - instructor_name = instructor["displayName"] - rcsid = "Unknown RCSID" - if "emailAddress" in instructor: - email_address = instructor["emailAddress"] - if email_address is not None and email_address.endswith("@rpi.edu"): - rcsid = email_address.split("@")[0].lower() - # Add to known RCSID set if provided - if instructor_rcsid_name_map is not None: - instructor_rcsid_name_map[rcsid] = instructor["displayName"] - else: - logger.warning( - f"Missing instructor email address field for CRN {crn} " - f"in term {term}: {instructor_name}" - ) - class_faculty_rcsids.append(f"{instructor_name} ({rcsid})") - - course_sections.append( - { - "CRN": class_entry["courseReferenceNumber"], - "instructor": class_faculty_rcsids, - "capacity": class_entry["maximumEnrollment"], - "registered": class_entry["enrollment"], - "open": class_entry["seatsAvailable"], - } - ) + restriction_code_name_map[restriction_type][ + restriction_code + ] = restriction_name -async def get_course_data( +async def get_subj_course_data( term: str, - subject: str, + subject_code: str, + subject_desc: str, + term_course_data: dict[str, dict[str, Any]], + term_crn_set: set[str], instructor_rcsid_name_map: dict[str, str] = None, restriction_code_name_map: dict[str, dict[str, str]] = None, attribute_code_name_map: dict[str, str] = None, semaphore: asyncio.Semaphore = asyncio.Semaphore(1), - limit_per_host: int = 5, + tcp_connector: aiohttp.TCPConnector = None, timeout: int = 30, ) -> dict[str, dict[str, Any]]: """ @@ -267,15 +292,9 @@ async def get_course_data( This function spawns its own client session to avoid session state conflicts with other subjects that may be processing concurrently. - In the context of this scraper, a "class" refers to a section of a course, while a - "course" refers to the overarching course that may have multiple classes. - - The data returned from SIS is keyed by classes, not courses. This function - manipulates and aggregates this data such that the returned structure is keyed by - courses instead, with classes as a sub-field of each course. - @param term: Term code to fetch data for. @param subject: Subject code to fetch data for. + @param term_crn_set: Set of all CRNs processed in the term. @param instructor_rcsid_name_map: Optional map to populate with instructor RCSIDs to names. @param restriction_code_name_map: Optional map to populate with restriction @@ -290,36 +309,48 @@ async def get_course_data( @return: Dictionary of course data keyed by course code. """ async with semaphore: - # Limit simultaneous connections to SIS server per session - connector = aiohttp.TCPConnector( - ttl_dns_cache=500, limit_per_host=limit_per_host - ) timeout_obj = aiohttp.ClientTimeout(total=timeout) async with aiohttp.ClientSession( - connector=connector, timeout=timeout_obj + connector=tcp_connector, connector_owner=False, timeout=timeout_obj ) as session: try: # Reset search state on server before fetching class data await reset_class_search(session, term) - class_data = await class_search(session, term, subject) - course_data = {} + sis_class_data = await class_search(session, term, subject_code) + if len(sis_class_data) == 0: + logger.info( + f"No classes found for subject {subject_code} in term {term}" + ) + return {} + # Process class entries from the class search in parallel async with asyncio.TaskGroup() as tg: - for class_entry in class_data: + for sis_class_entry in sis_class_data: tg.create_task( process_class_details( session, - course_data, - class_entry, + term_course_data, + term_crn_set, + sis_class_entry, instructor_rcsid_name_map=instructor_rcsid_name_map, restriction_code_name_map=restriction_code_name_map, attribute_code_name_map=attribute_code_name_map, ) ) + # Get subject course data from term course data + subj_course_data = term_course_data[subject_desc]["courses"] + # Sort class entries by section number + for course_num in subj_course_data: + subj_course_data[course_num] = sorted( + subj_course_data[course_num], + key=lambda class_entry: class_entry["sectionNumber"], + ) # Return data sorted by course code - return dict(sorted(course_data.items())) + return dict(sorted(subj_course_data.items())) except aiohttp.ClientError as e: - logger.error(f"Error processing subject {subject} in term {term}: {e}") + logger.error( + f"Error processing subject {subject_code} in term {term}: {e}" + ) return {} @@ -331,7 +362,7 @@ async def get_term_course_data( restriction_code_name_map: dict[str, dict[str, str]] = None, attribute_code_name_map: dict[str, str] = None, semaphore: asyncio.Semaphore = asyncio.Semaphore(10), - limit_per_host: int = 5, + tcp_connector: aiohttp.TCPConnector = None, timeout: int = 30, ) -> None: """ @@ -359,7 +390,9 @@ async def get_term_course_data( """ timeout_obj = aiohttp.ClientTimeout(total=timeout) try: - async with aiohttp.ClientSession(timeout=timeout_obj) as session: + async with aiohttp.ClientSession( + connector=tcp_connector, connector_owner=False, timeout=timeout_obj + ) as session: subjects = await get_term_subjects(session, term) except aiohttp.ClientError as e: logger.error(f"Error fetching subjects for term {term}: {e}") @@ -384,42 +417,96 @@ async def get_term_course_data( # Stores all course data for the term term_course_data = {} + # Stores all CRNs for the term + term_crn_set = set() + # Process subjects in parallel, each with its own session tasks: list[asyncio.Task] = [] try: async with asyncio.TaskGroup() as tg: for subject in subjects: subject_code = subject["code"] - term_course_data[subject_code] = { - "subject_name": subject["description"], + subject_desc = subject["description"] + term_course_data[subject_desc] = { + "subjectCode": subject_code, "courses": {}, } task = tg.create_task( - get_course_data( + get_subj_course_data( term, subject_code, + subject_desc, + term_course_data, + term_crn_set, instructor_rcsid_name_map=instructor_rcsid_name_map, restriction_code_name_map=restriction_code_name_map, attribute_code_name_map=attribute_code_name_map, semaphore=semaphore, - limit_per_host=limit_per_host, + tcp_connector=tcp_connector, timeout=timeout, ) ) tasks.append(task) except Exception as e: - logger.error(f"Error processing subjects for term {term}: {e}") + import traceback + + logger.error( + f"Error processing subjects for term {term}: {e}" + f"\n{traceback.format_exc()}" + ) return False # Wait for all tasks to complete and gather results for i, subject in enumerate(subjects): course_data = tasks[i].result() - term_course_data[subject["code"]]["courses"] = course_data + term_course_data[subject["description"]]["courses"] = course_data if len(term_course_data) == 0: return False - # Write all data for term to JSON file + # Check all crosslist CRNs in the term course data for any hidden classes not shown in + # the main class search and fetch their details. + async with semaphore: + async with aiohttp.ClientSession( + connector=tcp_connector, connector_owner=False, timeout=timeout_obj + ) as session: + hidden_crns = { + crosslist["courseReferenceNumber"] + for subject in term_course_data.values() + for course in subject["courses"].values() + for class_entry in course + for crosslist in class_entry["crosslists"] + if crosslist["courseReferenceNumber"] not in term_crn_set + } + if len(hidden_crns) > 0: + async with asyncio.TaskGroup() as tg: + for crn in hidden_crns: + tg.create_task( + process_class_details( + session, + term_course_data, + term_crn_set, + term=term, + crn=crn, + ) + ) + logger.info( + f"Processing hidden class with CRN {crn} in term {term}" + ) + + # Convert term course data to be keyed by subject code instead of description + term_course_data_by_code = {} + for subject_desc, data in term_course_data.items(): + subject_code = data["subjectCode"] + term_course_data_by_code[subject_code] = { + "subjectDescription": subject_desc, + **data, + } + # Remove redundant subject code entry + del term_course_data_by_code[subject_code]["subjectCode"] + term_course_data = term_course_data_by_code + + # Write all term data to JSON file if isinstance(output_path, str): output_path = Path(output_path) try: @@ -443,8 +530,8 @@ async def main( instructor_rcsid_name_map_path: Path | str | None = None, restriction_code_name_map_path: Path | str | None = None, subject_code_name_map_path: Path | str | None = None, - semaphore_val: int = 10, - limit_per_host: int = 5, + max_concurrent_sessions: int = 25, + limit_per_host: int = 75, timeout: int = 30, ) -> bool: """ @@ -480,7 +567,7 @@ async def main( mapping JSON file. @param subject_code_name_map_path: Path to load/save subject code mapping JSON file. - @param semaphore_val: Maximum number of concurrent client sessions to + @param max_concurrent_sessions: Maximum number of concurrent client sessions to spawn. @param limit_per_host: Maximum number of simultaneous connections a session can make to the SIS server. @@ -489,7 +576,7 @@ async def main( """ if output_data_dir is None: - logger.error("No data output directory specified") + logger.fatal("No data output directory specified") return False # Convert paths to Path objects if given as strings @@ -569,14 +656,15 @@ async def main( f"at {subject_code_name_map_path}" ) except Exception as e: - logger.error(f"Error loading code mapping files: {e}") import traceback - traceback.print_exc() + logger.fatal( + f"Error loading code mapping files: {e}" f"\n{traceback.format_exc()}" + ) return False # Limit concurrent client sessions and simultaneous connections - semaphore = asyncio.Semaphore(semaphore_val) + semaphore = asyncio.Semaphore(max_concurrent_sessions) logger.info("Starting SIS scraper with settings:") logger.info(f" Years: {start_year} - {end_year}") @@ -587,40 +675,46 @@ async def main( tasks: list[asyncio.Task] = [] num_terms_processed = 0 try: - # Process terms in parallel - async with asyncio.TaskGroup() as tg: - for year in range(start_year, end_year + 1): - for season in seasons: - term = get_term_code(year, season) - if term == "": - continue - output_path = Path(output_data_dir) / f"{term}.json" - task = tg.create_task( - get_term_course_data( - term, - output_path=output_path, - subject_code_name_map=subject_code_name_map, - instructor_rcsid_name_map=instructor_rcsid_name_map, - restriction_code_name_map=restriction_code_name_map, - attribute_code_name_map=attribute_code_name_map, - semaphore=semaphore, - limit_per_host=limit_per_host, - timeout=timeout, + # Global TCP connector for all sessions + async with aiohttp.TCPConnector( + ttl_dns_cache=500, + limit_per_host=limit_per_host, + keepalive_timeout=60, + force_close=False, + ) as tcp_connector: + # Process terms in parallel + async with asyncio.TaskGroup() as tg: + for year in range(start_year, end_year + 1): + for season in seasons: + term = get_term_code(year, season) + if term == "": + continue + output_path = Path(output_data_dir) / f"{term}.json" + task = tg.create_task( + get_term_course_data( + term, + output_path=output_path, + subject_code_name_map=subject_code_name_map, + instructor_rcsid_name_map=instructor_rcsid_name_map, + restriction_code_name_map=restriction_code_name_map, + attribute_code_name_map=attribute_code_name_map, + semaphore=semaphore, + tcp_connector=tcp_connector, + timeout=timeout, + ) ) - ) - tasks.append(task) + tasks.append(task) - # Wait for all tasks to complete - for task in tasks: - success = task.result() - if success: - num_terms_processed += 1 + # Wait for all tasks to complete + for task in tasks: + success = task.result() + if success: + num_terms_processed += 1 except Exception as e: - logger.error(f"Error in SIS scraper: {e}") import traceback - traceback.print_exc() + logger.fatal(f"Error in SIS scraper: {e}\n{traceback.format_exc()}") return False # Write code maps to JSON files if code mapping paths are provided @@ -665,10 +759,9 @@ async def main( with subject_code_name_map_path.open("w", encoding="utf-8") as f: json.dump(subject_code_name_map, f, indent=4, ensure_ascii=False) except Exception as e: - logger.error(f"Error writing code mapping files: {e}") import traceback - traceback.print_exc() + logger.error(f"Error writing code mapping files: {e}\n{traceback.format_exc()}") return False end_time = time.time()