@@ -229,8 +229,12 @@ async def process_class_details(
229229 course_details = course_data ["course_detail" ]
230230
231231 course_credits = course_details ["credits" ]
232- course_credits ["min" ] = min (course_credits ["min" ], class_entry ["creditHourLow" ])
233- course_credits ["max" ] = max (course_credits ["max" ], class_entry ["creditHourHigh" ])
232+ course_credits ["min" ] = min (
233+ course_credits ["min" ], class_entry ["creditHourLow" ] or 0
234+ )
235+ course_credits ["max" ] = max (
236+ course_credits ["max" ], class_entry ["creditHourHigh" ] or 0
237+ )
234238
235239 course_sections = course_details ["sections" ]
236240 # Use faculty RCS IDs instead of names
@@ -251,30 +255,48 @@ async def process_class_details(
251255
252256
253257async def get_course_data (
254- session : aiohttp . ClientSession , term : str , subject : str
255- ) -> list [ dict ] :
258+ semaphore : asyncio . Semaphore , term : str , subject : str
259+ ) -> dict :
256260 """
257261 Gets all course data for a given term and subject.
258262
263+ This function spawns its own client session to avoid session state conflicts with
264+ other subjects that may be processing concurrently.
265+
259266 In the context of this scraper, a "class" refers to a section of a course, while a
260267 "course" refers to the overarching course that may have multiple classes.
261268
262269 The data returned from SIS is keyed by classes, not courses. This function
263270 manipulates and aggregates this data such that the returned structure is keyed by
264271 courses instead, with classes as a sub-field of each course.
265-
266- Think of this as a main function that calls all other helper functions and aggregates
267- all the data into a single, manageable structure.
268272 """
269- class_data = await class_search (session , term , subject )
270- course_data = {}
271- async with asyncio .TaskGroup () as tg :
272- for entry in class_data :
273- tg .create_task (process_class_details (session , course_data , entry ))
274- return course_data
275273
276-
277- async def main ():
274+ async with semaphore :
275+ # Limit connections per session
276+ connector = aiohttp .TCPConnector (limit_per_host = 5 )
277+ timeout = aiohttp .ClientTimeout (total = 60 )
278+
279+ async with aiohttp .ClientSession (
280+ connector = connector , timeout = timeout
281+ ) as session :
282+ try :
283+ # Reset search state on server before fetching class data
284+ await reset_class_search (session , term )
285+ class_data = await class_search (session , term , subject )
286+ course_data = {}
287+ async with asyncio .TaskGroup () as tg :
288+ for entry in class_data :
289+ tg .create_task (
290+ process_class_details (session , course_data , entry )
291+ )
292+ print (f"Completed processing subject: { subject } " )
293+ return course_data
294+ except aiohttp .ClientError as e :
295+ print (f"Error processing subject { subject } : { e } " )
296+ return {}
297+
298+
async def main() -> bool:
    """
    Entry point: scrape all course data for a term and write it to ``{term}.json``.

    A JSESSIONID cookie is required before accessing any course data, which can be
    obtained on the first request to any SIS page; aiohttp's session cookie jar is
    presumed to store and resend it automatically — confirm against SIS behavior.

    Returns:
        True if every subject was processed and the JSON file written,
        False if any error occurred.
    """
    term = "202509"
    # Stores all course data for the term, keyed by subject code.
    # (Fixed: this was previously initialized twice.)
    all_course_data: dict = {}

    try:
        # Limit concurrent client sessions so we don't hammer the server.
        semaphore = asyncio.Semaphore(10)

        print("Fetching subject list...")
        # This session is only needed to fetch the subject list; each subject
        # later spawns its own session inside get_course_data.
        async with aiohttp.ClientSession() as session:
            subjects = await get_subjects(session, term)
        print(f"Found {len(subjects)} subjects")

        # Process subjects in parallel, each with its own session.
        tasks: list[asyncio.Task] = []
        async with asyncio.TaskGroup() as tg:
            for subject in subjects:
                subject_code = subject["code"]
                all_course_data[subject_code] = {
                    "subject_name": subject["description"],
                    "courses": {},
                }
                tasks.append(
                    tg.create_task(get_course_data(semaphore, term, subject_code))
                )

        # The TaskGroup guarantees all tasks are finished here; pair each
        # result with its subject instead of indexing by position.
        for subject, task in zip(subjects, tasks):
            all_course_data[subject["code"]]["courses"] = task.result()

        # Write all data for the term to a JSON file.
        with open(f"{term}.json", "w") as f:
            json.dump(all_course_data, f, indent=4)

        print(f"Successfully processed {len(all_course_data)} subjects")
    except Exception as e:
        # Top-level boundary: report and signal failure rather than crash.
        print(f"Error in main: {e}")
        import traceback

        traceback.print_exc()
        return False
    return True
320353
0 commit comments