Commit 8f6bdfa

Refine asynchronous logic sections
With a bit of help from Claude Sonnet 4, the scraper has been further optimized: data is now scraped from every subject within a term in parallel, each subject using its own client session to avoid session state conflicts in the SIS backend. There are still issues with the scraping logic, as too many assumptions are made about the scraped data. For example, after a couple of runs of the scraper, it turns out that not every faculty member has a non-null email address. This will require some investigation of the SIS data.
1 parent 753afc2 commit 8f6bdfa
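
Editor's note: the null-email finding above suggests treating faculty contact fields as optional everywhere they are read. A minimal sketch of that defensive read, assuming a hypothetical faculty dict from the class-search payload; the "emailAddress" field name and the email-to-RCS-ID derivation are assumptions, not confirmed SIS schema:

def faculty_rcs_id(faculty: dict) -> str | None:
    # Hypothetical helper: "emailAddress" is an assumed field name, and
    # deriving the RCS ID from the email's local part is an assumption.
    email = faculty.get("emailAddress")  # may be absent or null
    if not email:
        return None  # callers must tolerate faculty without an RCS ID
    return email.split("@", 1)[0]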

File tree

1 file changed: 65 additions, 32 deletions


app/scrapers/sis_scraper.py

@@ -229,8 +229,12 @@ async def process_class_details(
     course_details = course_data["course_detail"]
 
     course_credits = course_details["credits"]
-    course_credits["min"] = min(course_credits["min"], class_entry["creditHourLow"])
-    course_credits["max"] = max(course_credits["max"], class_entry["creditHourHigh"])
+    course_credits["min"] = min(
+        course_credits["min"], class_entry["creditHourLow"] or 0
+    )
+    course_credits["max"] = max(
+        course_credits["max"], class_entry["creditHourHigh"] or 0
+    )
 
     course_sections = course_details["sections"]
     # Use faculty RCS IDs instead of names
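
Editor's note: the `or 0` coalescing added in this hunk is what keeps a null creditHourLow/creditHourHigh from crashing the aggregation, since Python refuses to order an int against None. A quick illustration:

>>> min(3, None)
TypeError: '<' not supported between instances of 'NoneType' and 'int'
>>> min(3, None or 0)  # None coalesces to 0 before the comparison
0
>>> max(4, None or 0)  # real values pass through unchanged
4

One side effect to note: after coalescing, a null credit value and a genuine 0 are indistinguishable, which is harmless for min/max aggregation but would matter if nulls ever needed to be surfaced.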
@@ -251,30 +255,48 @@ async def process_class_details(
 
 
 async def get_course_data(
-    session: aiohttp.ClientSession, term: str, subject: str
-) -> list[dict]:
+    semaphore: asyncio.Semaphore, term: str, subject: str
+) -> dict:
     """
     Gets all course data for a given term and subject.
 
+    This function spawns its own client session to avoid session state conflicts with
+    other subjects that may be processing concurrently.
+
     In the context of this scraper, a "class" refers to a section of a course, while a
     "course" refers to the overarching course that may have multiple classes.
 
     The data returned from SIS is keyed by classes, not courses. This function
     manipulates and aggregates this data such that the returned structure is keyed by
     courses instead, with classes as a sub-field of each course.
-
-    Think of this as a main function that calls all other helper functions and aggregates
-    all the data into a single, manageable structure.
     """
-    class_data = await class_search(session, term, subject)
-    course_data = {}
-    async with asyncio.TaskGroup() as tg:
-        for entry in class_data:
-            tg.create_task(process_class_details(session, course_data, entry))
-    return course_data
 
-
-async def main():
+    async with semaphore:
+        # Limit connections per session
+        connector = aiohttp.TCPConnector(limit_per_host=5)
+        timeout = aiohttp.ClientTimeout(total=60)
+
+        async with aiohttp.ClientSession(
+            connector=connector, timeout=timeout
+        ) as session:
+            try:
+                # Reset search state on server before fetching class data
+                await reset_class_search(session, term)
+                class_data = await class_search(session, term, subject)
+                course_data = {}
+                async with asyncio.TaskGroup() as tg:
+                    for entry in class_data:
+                        tg.create_task(
+                            process_class_details(session, course_data, entry)
+                        )
+                print(f"Completed processing subject: {subject}")
+                return course_data
+            except aiohttp.ClientError as e:
+                print(f"Error processing subject {subject}: {e}")
+                return {}
+
+
+async def main() -> bool:
     """
     A JSESSIONID cookie is required before accessing any course data, which can be
     obtained on the first request to any SIS page. The cookie should automatically be
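
Editor's note: the new get_course_data follows the standard bounded-concurrency shape: a shared asyncio.Semaphore caps how many subjects hold a live session at once, while each task still builds its own aiohttp.ClientSession. A self-contained sketch of the same pattern; the fetch_subject name and placeholder URL are illustrative, not from the scraper:

import asyncio

import aiohttp


async def fetch_subject(semaphore: asyncio.Semaphore, subject: str) -> dict:
    async with semaphore:  # at most N tasks pass this point at once
        # One session per task, so no session state is shared across subjects
        async with aiohttp.ClientSession() as session:
            async with session.get(f"https://example.com/{subject}") as resp:
                return {subject: resp.status}


async def main() -> None:
    semaphore = asyncio.Semaphore(10)  # cap concurrent sessions at 10
    async with asyncio.TaskGroup() as tg:
        tasks = [tg.create_task(fetch_subject(semaphore, s)) for s in ("CSCI", "MATH")]
    print([t.result() for t in tasks])


asyncio.run(main())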
@@ -284,37 +306,48 @@ async def main():
     to fetch classes from a term and subject.
     """
 
-    # Helper function to reset search state and fetch course data for a subject
-    async def reset_and_get_course_data(
-        session: aiohttp.ClientSession, term: str, subject: str
-    ):
-        await reset_class_search(session, term)
-        return await get_course_data(session, term, subject)
-
     term = "202509"
     all_course_data = {}
 
     try:
+        # Limit concurrent client sessions
+        semaphore = asyncio.Semaphore(10)
+
+        print("Fetching subject list...")
         async with aiohttp.ClientSession() as session:
             subjects = await get_subjects(session, term)
-            tasks = []
+            print(f"Found {len(subjects)} subjects")
+
+        # Stores all course data for a term
+        all_course_data = {}
+
+        # Process subjects in parallel, each with its own session
+        tasks: list[asyncio.Task] = []
+        async with asyncio.TaskGroup() as tg:
             for subject in subjects:
-                all_course_data[subject["code"]] = {
+                subject_code = subject["code"]
+                all_course_data[subject_code] = {
                     "subject_name": subject["description"],
                     "courses": {},
                 }
-                tasks.append(
-                    asyncio.create_task(
-                        reset_and_get_course_data(session, term, subject["code"])
-                    )
-                )
-            task_results = await asyncio.gather(*tasks)
+                task = tg.create_task(get_course_data(semaphore, term, subject_code))
+                tasks.append(task)
+
         for i, subject in enumerate(subjects):
-            all_course_data[subject["code"]]["courses"] = task_results[i]
+            course_data = tasks[i].result()
+            all_course_data[subject["code"]]["courses"] = course_data
+
+        # Write all data for term to JSON file
         with open(f"{term}.json", "w") as f:
             json.dump(all_course_data, f, indent=4)
+
+        print(f"Successfully processed {len(all_course_data)} subjects")
+
     except Exception as e:
-        print(e.with_traceback())
+        print(f"Error in main: {e}")
+        import traceback
+
+        traceback.print_exc()
         return False
     return True
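Editor's note: one behavioral change hidden in the gather-to-TaskGroup switch is error handling. asyncio.gather lets sibling tasks keep running when one fails, while a TaskGroup cancels the remaining tasks and re-raises everything as an ExceptionGroup; the per-subject except aiohttp.ClientError that returns {} is what keeps one bad subject from cancelling the whole term. A toy demonstration of the difference, with gather shown using return_exceptions=True for a clean printout:

import asyncio


async def ok() -> str:
    await asyncio.sleep(0.1)
    return "ok"


async def boom() -> str:
    raise RuntimeError("boom")


async def main() -> None:
    # gather collects results and exceptions side by side; the healthy
    # coroutine still completes even though its sibling failed
    results = await asyncio.gather(ok(), boom(), return_exceptions=True)
    print(results)  # ['ok', RuntimeError('boom')]

    # TaskGroup cancels the surviving task on first failure and re-raises
    # inside an ExceptionGroup (Python 3.11+)
    try:
        async with asyncio.TaskGroup() as tg:
            tg.create_task(ok())
            tg.create_task(boom())
    except* RuntimeError as eg:
        print(f"caught: {eg.exceptions}")


asyncio.run(main())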