Skip to content

Commit 2682c1c

Browse files
committed
Remove extraneous data from course descriptions
With this change, the "When Offered" field of course data has also been removed, as there are no plans to use this data.
1 parent 4ac8448 commit 2682c1c

File tree

2 files changed: 39 additions and 55 deletions

2 files changed: 39 additions and 55 deletions

app/scrapers/sis_api.py

Lines changed: 11 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -224,49 +224,30 @@ async def class_search(
224224

225225
async def get_class_description(
226226
session: aiohttp.ClientSession, term: str, crn: str
227-
) -> dict[str, str]:
227+
) -> str:
228228
"""
229229
Fetches and parses data from the "Course Description" tab of a class details page.
230230
231-
Returned data format is as follows:
232-
{
233-
"description": "This course provides an introduction to ...",
234-
"when_offered": "Spring, Summer, and Fall"
235-
}
231+
Returns a string containing the course description, without any additional fields
232+
such as "When Offered", "Credit Hours", "Prerequisite", etc.
236233
"""
237234
url = "https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/getCourseDescription"
238235
params = {"term": term, "courseReferenceNumber": crn}
239236
async with session.get(url, params=params) as response:
240237
response.raise_for_status()
241238
raw_data = await response.text()
242239
raw_data = html_unescape(raw_data)
243-
description_data = {
244-
"description": "",
245-
"when_offered": "",
246-
}
247240
soup = bs4.BeautifulSoup(raw_data, "html5lib")
248241
description_tag = soup.find("section", {"aria-labelledby": "courseDescription"})
249-
description_text = [
250-
text.strip("\n").strip() for text in description_tag.text.split("\n")
242+
if description_tag is None:
243+
print(f"No description found for term and CRN: {term} - {crn}")
244+
return ""
245+
description_text_list = [
246+
text.strip() for text in description_tag.get_text(separator="\n").split("\n")
251247
]
252-
for text in description_text:
253-
print(text or "EMPTY")
254-
if text.startswith("When Offered:"):
255-
description_data["when_offered"] = text.replace("When Offered: ", "")
256-
# Skip useless fields that can be obtained elsewhere
257-
elif text.startswith("Credit Hours:"):
258-
continue
259-
elif text.startswith("Contact, Lecture or Lab Hours:"):
260-
continue
261-
elif text.startswith("Prerequisite:"):
262-
continue
263-
elif text.startswith("Corequisite:"):
264-
continue
265-
elif text.startswith("Cross Listed:"):
266-
continue
267-
else:
268-
description_data["description"] += text
269-
return description_data
248+
for text in description_text_list:
249+
if text != "":
250+
return text
270251

271252

272253
async def get_class_attributes(session: aiohttp.ClientSession, term: str, crn: str):

app/scrapers/sis_scraper.py

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,32 @@
1111
OUTPUT_DATA_DIR = "scraper_data"
1212

1313

14-
async def get_reverse_subject_map(
14+
def get_term_code(year: int, season: str) -> str:
15+
"""
16+
Converts a year and academic season into a term code used by SIS.
17+
"""
18+
if season is None:
19+
return ""
20+
season_lower = season.lower().strip()
21+
if season_lower == "fall":
22+
return f"{year}09"
23+
elif season_lower == "summer":
24+
return f"{year}05"
25+
elif season_lower == "spring":
26+
return f"{year}01"
27+
else:
28+
return ""
29+
30+
31+
async def get_subject_name_code_map(
1532
session: aiohttp.ClientSession,
1633
start_year: int = 1998,
1734
end_year: int = datetime.now().year,
1835
seasons: list[str] = None,
1936
) -> dict[str, str]:
2037
"""
2138
Fetches the list of subjects from the specified range of years and seasons, and
22-
returns a "reverse" mapping of subject names to subject codes.
39+
returns a mapping of subject names to subject codes.
2340
2441
Defaults to a range from 1998 to the current year, and Spring, Summer, and Fall
2542
seasons. SIS data begins in Summer 1998.
@@ -95,7 +112,7 @@ async def process_class_details(
95112
course_data[course_code] = {
96113
"course_name": class_entry["courseTitle"],
97114
"course_detail": {
98-
"description": description_data["description"],
115+
"description": description_data,
99116
"corequisite": corequisites_data,
100117
"prerequisite": prerequisites_data,
101118
"crosslist": crosslists_data,
@@ -105,7 +122,6 @@ async def process_class_details(
105122
"min": float("inf"),
106123
"max": 0,
107124
},
108-
"offered": description_data["when_offered"],
109125
"sections": [],
110126
},
111127
}
@@ -218,7 +234,6 @@ async def get_term_course_data(
218234
219235
Writes data as JSON after all subjects in the term have been processed.
220236
"""
221-
print(f"Fetching subject list for term: {term}")
222237
async with aiohttp.ClientSession() as session:
223238
subjects = await get_term_subjects(session, term)
224239
print(f"Processing {len(subjects)} subjects for term: {term}")
@@ -260,23 +275,6 @@ async def get_term_course_data(
260275
json.dump(all_course_data, f, indent=4, ensure_ascii=False)
261276

262277

263-
def get_term_code(year: int, season: str) -> str:
264-
"""
265-
Converts a year and academic season into a term code used by SIS.
266-
"""
267-
if season is None:
268-
return ""
269-
season_lower = season.lower().strip()
270-
if season_lower == "fall":
271-
return f"{year}09"
272-
elif season_lower == "summer":
273-
return f"{year}05"
274-
elif season_lower == "spring":
275-
return f"{year}01"
276-
else:
277-
return ""
278-
279-
280278
async def main(start_year: int, end_year: int, seasons: list[str] = None) -> bool:
281279
"""
282280
Runs the SIS scraper for the specified range of years and seasons.
@@ -304,10 +302,15 @@ async def main(start_year: int, end_year: int, seasons: list[str] = None) -> boo
304302
semaphore = asyncio.Semaphore(50)
305303
limit_per_host = 20
306304

307-
# Create master subject name to subject code mapping
308-
print("Fetching subject name to subject code mapping...")
305+
print(
306+
f"Starting SIS scraper with settings:\n"
307+
f"\tYears: {start_year} - {end_year}\n"
308+
f"\tSeasons: {', '.join(season.capitalize() for season in seasons)}"
309+
)
310+
311+
print("Fetching subject name to code mapping...")
309312
async with aiohttp.ClientSession() as session:
310-
subject_name_code_map = await get_reverse_subject_map(
313+
subject_name_code_map = await get_subject_name_code_map(
311314
session, seasons=seasons
312315
)
313316

0 commit comments

Comments
 (0)