Skip to content

Commit 6bf8839

Browse files
committed
Outline scraper function prototypes
I'm following the Single Responsibility Principle (SRP) as I write this new SIS scraper. The course_search() function, previously named get_courses_in_subject(), now gives the caller more control over how results are sorted. This commit also includes a number of other miscellaneous changes, but they are all made for the sake of SRP.
1 parent 6ac2b22 commit 6bf8839

File tree

1 file changed

+75
-22
lines changed

1 file changed

+75
-22
lines changed

app/scrapers/sis_scraper.py

Lines changed: 75 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,26 @@
11
import asyncio
22
import json
33
import time
4+
from enum import Enum
45

56
import aiohttp
67
import bs4
78

89

10+
class CourseColumn(str, Enum):
11+
COURSE_TITLE = "courseTitle"
12+
SUBJECT_DESCRIPTION = "subjectDescription"
13+
COURSE_NUMBER = "courseNumber"
14+
SECTION = "sequenceNumber"
15+
CRN = "courseReferenceNumber"
16+
TERM = "term"
17+
18+
919
async def get_subjects(
1020
session: aiohttp.ClientSession, term: str
1121
) -> list[dict[str, str]]:
1222
"""
13-
Fetches the list of subjects for a given term from the SIS API.
23+
Fetches the list of subjects for a given term from SIS.
1424
1525
Returned data format is as follows:
1626
[
@@ -22,56 +32,98 @@ async def get_subjects(
2232
]
2333
"""
2434
url = "https://sis9.rpi.edu/StudentRegistrationSsb/ssb/classSearch/get_subject"
25-
async with session.get(
26-
url, params={"term": term, "offset": 1, "max": 100}
27-
) as response:
35+
params = {"term": term, "offset": 1, "max": 100}
36+
async with session.get(url, params=params) as response:
2837
response.raise_for_status()
2938
data = await response.json()
3039
return data
3140

3241

33-
async def reset_subject_search(session: aiohttp.ClientSession, term: str) -> None:
42+
async def reset_course_search(session: aiohttp.ClientSession, term: str) -> None:
3443
"""
35-
Resets the subject search state on the SIS server.
44+
Resets the term and subject search state on the SIS server.
3645
37-
Must be re-called before each new attempt to fetch courses from a different subject.
38-
Otherwise, the server will continue returning results from the last subject accessed.
46+
Must be called before each attempt to fetch courses from a subject in the given term.
47+
Otherwise, the server will continue returning the same results from the last subject
48+
accessed, or no data if attempting to access data from a different term.
3949
"""
4050
url = "https://sis9.rpi.edu/StudentRegistrationSsb/ssb/term/search?mode=search"
41-
async with session.get(url, params={"term": term}) as response:
51+
params = {"term": term}
52+
async with session.get(url, params=params) as response:
4253
response.raise_for_status()
4354

4455

45-
async def get_courses_in_subject(
46-
session: aiohttp.ClientSession, term: str, subject: str
56+
async def course_search(
57+
session: aiohttp.ClientSession,
58+
term: str,
59+
subject: str,
60+
max_size: int = 1000,
61+
sort_column: CourseColumn = CourseColumn.SUBJECT_DESCRIPTION,
62+
sort_asc: bool = True,
4763
) -> list[dict[str, str]]:
4864
"""
49-
Fetches the list of courses for a given subject and term from the SIS API.
65+
Fetches the list of courses for a given subject and term from SIS.
5066
51-
reset_subject_search() must be called once before each call to this function.
67+
The term and subject search state on the SIS server must be reset before each call
68+
to this function.
5269
"""
53-
url = "https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/searchResults"
70+
url = "https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/searchResults?pageOffset=0"
5471
params = {
5572
"txt_subject": subject,
5673
"txt_term": term,
57-
"pageOffset": 0,
58-
"pageMaxSize": 1000,
59-
"sortColumn": "subjectDescription",
60-
"sortDirection": "asc",
74+
"pageMaxSize": max_size,
75+
"sortColumn": sort_column,
76+
"sortDirection": "asc" if sort_asc else "desc",
6177
}
6278
async with session.get(url, params=params) as response:
6379
response.raise_for_status()
6480
data = await response.json()
6581
return data
6682

6783

84+
async def get_course_details(session: aiohttp.ClientSession, term: str, crn: str):
85+
url = (
86+
"https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/getClassDetails"
87+
)
88+
89+
90+
async def get_course_description(session: aiohttp.ClientSession, term: str, crn: str):
91+
url = "https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/getCourseDescription"
92+
93+
94+
async def get_course_attributes(session: aiohttp.ClientSession, term: str, crn: str):
95+
url = "https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/getSectionAttributes"
96+
97+
98+
async def get_course_restrictions(session: aiohttp.ClientSession, term: str, crn: str):
99+
url = (
100+
"https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/getRestrictions"
101+
)
102+
103+
104+
async def get_course_corequisites(session: aiohttp.ClientSession, term: str, crn: str):
105+
url = (
106+
"https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/getCorequisites"
107+
)
108+
109+
110+
async def get_course_prerequisites(session: aiohttp.ClientSession, term: str, crn: str):
111+
url = "https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/getSectionPrerequisites"
112+
113+
114+
async def get_course_crosslists(session: aiohttp.ClientSession, term: str, crn: str):
115+
url = (
116+
"https://sis9.rpi.edu/StudentRegistrationSsb/ssb/searchResults/getXlstSections"
117+
)
118+
119+
68120
async def main():
69121
"""
70122
A JSESSIONID cookie is required before accessing any course data, which can be
71123
obtained on the first request to any SIS page. The cookie should automatically be
72124
included in subsequent requests made with the same aiohttp session.
73125
74-
reset_subject_search() must be called before each new attempt to fetch courses from
126+
reset_course_search() must be called before each new attempt to fetch courses from
75127
a different subject.
76128
"""
77129
async with aiohttp.ClientSession(
@@ -93,9 +145,10 @@ async def main():
93145
) as session:
94146
term = "202409"
95147
subjects = await get_subjects(session, term)
96-
await reset_subject_search(session, term)
97-
data = await get_courses_in_subject(session, term, subjects[0]["code"])
98-
print(json.dumps(data, indent=4))
148+
await reset_course_search(session, term)
149+
data = await course_search(session, term, subjects[0]["code"])
150+
# print(json.dumps(data, indent=4))
151+
return True
99152

100153

101154
if __name__ == "__main__":

0 commit comments

Comments
 (0)