Skip to content

Commit 0a23d77

Browse files
committed
Fix course restriction parsing
The reason for this change is detailed in the comment added to the code. In short, SIS sometimes splits the data for a single restriction across multiple lines, which previously caused each line to be counted as a separate restriction.
1 parent cb7c8d2 commit 0a23d77

File tree

5 files changed

+96
-77
lines changed

5 files changed

+96
-77
lines changed
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
{
22
"Introductory Level Course": "FRSH",
33
"Writing Intensive": "WRIT",
4-
"Communication Intensive": "COMM",
54
"Culminating Exp/Capstone": "CULM",
5+
"Communication Intensive": "COMM",
66
"PDII Option for Engr Majors": "PDII",
77
"HASS Inquiry": "HINQ",
8-
"Extended term Study Abroad": "INTL",
9-
"Data Intensive I": "DI1",
108
"Hybrid:Online/In-Person Course": "HYBR",
11-
"Online Course": "ONLI",
129
"In-Person Course": "INPE",
10+
"Online Course": "ONLI",
11+
"Extended term Study Abroad": "INTL",
12+
"Data Intensive I": "DI1",
1313
"Data Intensive II": "DI2"
1414
}
Lines changed: 59 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,108 +1,108 @@
11
{
22
"Freshman": "FR",
3-
"Sophomore": "SO",
4-
"Senior": "SR",
53
"Junior": "JR",
6-
"Communication": "COMM",
7-
"& Comm": "EMAC",
8-
"Architecture": "ARCH",
9-
"Fifth-Year": "FY",
10-
"Information Technology": "ITEC",
11-
"Distance": "D",
12-
"Graduate Student": "GR",
4+
"Sophomore": "SO",
5+
"Graduate": "GR",
136
"Aeronautical Engineering": "AERO",
147
"Biomedical Engineering": "BMED",
158
"Chemical Engineering": "CHEG",
169
"Civil Engineering": "CIVL",
1710
"Computer & Systems Engineering": "CSYS",
11+
"Decision Sciences & Engr Syst": "DSES",
1812
"Electrical Engineering": "ELEC",
1913
"Undeclared Engineering": "ENGR",
2014
"Environmental Engineering": "ENVE",
21-
"Engineering Physics": "EPHY",
2215
"Electric Power Engr": "EPOW",
2316
"Engineering Science": "ESCI",
17+
"Information Technology": "ITEC",
2418
"Materials Engineering": "MATL",
2519
"Mechanical Engineering": "MECL",
26-
"Industrial & Management Engr": "MGTE",
27-
"Nuclear Engineering": "NUCL",
28-
"Building Sciences": "BLSC",
29-
"Mathematics": "MATH",
30-
"Hartford": "H",
31-
"Troy": "T",
32-
"Decision Sciences & Engr Syst": "DSES",
3320
"Manufacturing Systems Engr": "MFSE",
21+
"Industrial & Management Engr": "MGTE",
3422
"Engr Principles in Tech. Educ.": "MSTE",
23+
"Nuclear Engineering": "NUCL",
3524
"Nuclear Engineering & Science": "NUCS",
3625
"Transportation Engineering": "TRAN",
26+
"Distance": "D",
27+
"Engineering Physics": "EPHY",
3728
"Mechanics": "MECH",
38-
"Computer Science": "CSCI",
39-
"Biochemistry & Biophysics": "BCBP",
40-
"Bioinformatics & Molec Biology": "BFMB",
41-
"Biology": "BIOL",
42-
"School of Engineering": "E",
43-
"Management": "MGMT",
29+
"Senior": "SR",
30+
"Elect Media, Arts, & Comm": "EMAC",
4431
"Electronic Arts": "EART",
32+
"Graduate Student": "GR",
4533
"Undergraduate": "UG",
46-
"Graduate": "GR",
47-
"Arts & Soc Sci": "H",
34+
"Biology": "BIOL",
35+
"Humanities, Arts & Soc Sci": "H",
4836
"Master of Fine Arts": "MFA",
37+
"Computer Science": "CSCI",
38+
"Architecture": "ARCH",
39+
"Mathematics": "MATH",
4940
"Science & Technology Studies": "STS",
41+
"Troy": "T",
5042
"Doctor of Philosophy": "PHD",
5143
"Master of Bus. Admin.": "MBA",
52-
"School of Science": "S",
44+
"Communication": "COMM",
45+
"Building Sciences": "BLSC",
46+
"School of Engineering": "E",
5347
"Chemistry": "CHEM",
54-
"Economics": "ECON",
55-
"Applied Physics": "APHY",
56-
"Physics": "PHYS",
48+
"Management": "MGMT",
5749
"Games & Simulation Arts & Sci": "GSAS",
50+
"Hartford": "H",
5851
"Cognitive Science": "COGS",
59-
"School of Architecture": "A",
60-
"Lally School of Mgt & Tech": "M",
61-
"Undeclared": "UNGS",
62-
"Information Tech & Web Science": "ITWS",
63-
"Innovation and Society": "DSIS",
64-
"Business and Management": "BMGT",
65-
"Tech & Society": "STSO",
66-
"Sustainability Studies": "SUST",
67-
"Quant Finance & Risk Analytics": "QFRA",
68-
"Systems Engr and Tech Mgmt": "SETM",
52+
"Economics": "ECON",
6953
"Philosophy": "PHIL",
7054
"Psychology": "PSYC",
71-
"Undeclared Science": "USCI",
55+
"Science, Tech & Society": "STSO",
56+
"Information Tech & Web Science": "ITWS",
57+
"Design, Innovation and Society": "DSIS",
58+
"Business and Management": "BMLW",
59+
"Sustainability Studies": "SUST",
7260
"Applied Mathematics": "APMA",
73-
"Undeclared Major": "0000",
74-
"Operations Research & Stats": "ORST",
75-
"MS Management": "MS-MGMT",
76-
"General Engineering": "ENGR",
77-
"The Arts": "ARTS",
78-
"Applied Sciences": "ASCI",
79-
"Undecided Science": "USCI",
80-
"& Design": "COMD",
81-
"Music": "MUSC",
61+
"Undeclared": "UNGS",
62+
"Biochemistry & Biophysics": "BCBP",
63+
"Bioinformatics & Molec Biology": "BFMB",
64+
"Undeclared Science": "USCI",
65+
"Lally School of Mgt & Tech": "M",
66+
"Fifth-Year": "FY",
67+
"Applied Physics": "APHY",
68+
"Physics": "PHYS",
8269
"Biological Neuroscience": "BLNS",
70+
"Communication, Media, & Design": "COMD",
71+
"Music": "MUSC",
8372
"Computational Biology": "CBIO",
84-
"Psychological Science": "PSYS",
8573
"Environmental Science": "ENVS",
8674
"Geology": "GEOL",
87-
"Biotechnology & Health Econ": "BTHE",
8875
"Master of Science": "MS",
8976
"Business Analytics": "BSAN",
90-
"Management (LSE)": "MGLS",
91-
"Tech Commercialization Entrepr": "TCE",
77+
"Psychological Science": "PSYS",
78+
"Quant Finance & Risk Analytics": "QFRA",
79+
"Systems Engr and Tech Mgmt": "SETM",
9280
"Supply Chain Management": "SPCM",
81+
"Undeclared School": "U",
82+
"School of Science": "S",
83+
"Tech Commercialization Entrepr": "TCE",
84+
"School of Architecture": "A",
9385
"School of Humanities & Soc Sci": "HU",
86+
"Hydrogeology": "HGEO",
87+
"Lighting": "LGHT",
9488
"Communication & Rhetoric": "CMRT",
9589
"Human-Computer Interaction": "HCIN",
9690
"Technical Communication": "TCOM",
97-
"Hydrogeology": "HGEO",
98-
"Lighting": "LGHT",
99-
"Undeclared School": "U",
100-
"Interdisciplinary Science": "ISCI",
10191
"Aerospace Engineering": "ASPC",
92+
"Interdisciplinary Science": "ISCI",
93+
"Undergraduate Education Office": "UE",
10294
"Critical Game Design": "CGDS",
95+
"Biotechnology & Health Econ": "BTHE",
10396
"Architectural Sciences": "ARCS",
104-
"Master of Engineering": "ME",
105-
"Undergraduate Education Office": "UE",
10697
"Business Administration": "BADM",
98+
"Management (LSE)": "MGLS",
99+
"Undeclared Major": "0000",
100+
"Operations Research & Stats": "ORST",
101+
"MS Management": "MS-MGMT",
102+
"Master of Engineering": "ME",
103+
"General Engineering": "ENGR",
104+
"The Arts": "ARTS",
105+
"Applied Sciences": "ASCI",
106+
"Undecided Science": "USCI",
107107
"No School Designated": "00"
108108
}

app/scrapers/code_mappings/subject_name_code_map.json

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
{
22
"Administrative Courses": "ADMN",
3-
"Aerospace Studies": "USAF",
43
"Architecture": "ARCH",
54
"Arts": "ARTS",
65
"Astronomy": "ASTR",
@@ -23,32 +22,33 @@
2322
"Environmental Engineering": "ENVE",
2423
"General Institute Offerings": "GENL",
2524
"Information Technology": "ITEC",
26-
"Interdiscip. Environmental": "IENV",
27-
"Interdisciplinary H&SS": "IHSS",
2825
"Interdisciplinary Science": "ISCI",
29-
"Languages": "LANG",
3026
"Lighting": "LGHT",
31-
"Literature": "LITR",
3227
"Management": "MGMT",
33-
"Management - EXEC MBA": "EMBA",
3428
"Materials Science &Engineering": "MTLE",
35-
"Math Prgmg, Probab,& Math Stat": "MATP",
3629
"Mathematics": "MATH",
37-
"Military Science": "USAR",
38-
"Naval Science": "USNA",
30+
"Mech, Aero, Nucl Engr": "MANE",
3931
"Philosophy": "PHIL",
4032
"Physics": "PHYS",
4133
"Psychology": "PSYC",
4234
"Sci & Tech Studies - Humanity": "STSH",
4335
"Sci & Tech Studies - Soc Sci": "STSS",
36+
"Aerospace Studies": "USAF",
37+
"Interdiscip. Environmental": "IENV",
38+
"Interdisciplinary H&SS": "IHSS",
39+
"Languages": "LANG",
40+
"Literature": "LITR",
41+
"Management - EXEC MBA": "EMBA",
42+
"Math Prgmg, Probab,& Math Stat": "MATP",
43+
"Military Science": "USAR",
44+
"Naval Science": "USNA",
4445
"Writing": "WRIT",
45-
"Mech, Aero, Nucl Engr": "MANE",
4646
"Cognitive Science": "COGS",
4747
"Industrial and Systems Engr": "ISYE",
4848
"Information Technlgy & Web Sci": "ITWS",
4949
"Business (H)": "BUSN",
5050
"Games & Simulation Arts & Sci": "GSAS",
51-
"HASS Inquiry": "INQR",
5251
"Independent Learning Exper": "ILEA",
53-
"Science, Technology & Society": "STSO"
52+
"Science, Technology & Society": "STSO",
53+
"HASS Inquiry": "INQR"
5454
}

app/scrapers/sis_api.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn:
382382
key = f"not_{key_base}" if must_or_cannot == "Cannot" else key_base
383383
restriction_list = restrictions_data[key]
384384
i += 1
385+
next_content_string = ""
385386
while i < len(restrictions_content):
386387
next_content = restrictions_content[i]
387388
if next_content.string is None:
@@ -390,13 +391,27 @@ async def get_class_restrictions(session: aiohttp.ClientSession, term: str, crn:
390391
)
391392
i += 1
392393
continue
393-
next_content_string = next_content.string.strip()
394+
# SIS separates one restriction item into multiple <span> tags if it contains
395+
# commas, so a restriction item is only complete when parentheses are closed.
396+
#
397+
# For example, a restriction item "Communication, Media, & Design (COMD)"
398+
# would be split into three <span> tags:
399+
#
400+
# <span>Communication</span>
401+
# <span>Media</span>
402+
# <span> & Design (COMD)</span>
403+
if next_content_string == "":
404+
next_content_string = next_content.string.lstrip()
405+
else:
406+
next_content_string += f",{next_content.string}"
394407
# Stop if another restriction header is encountered
395408
if re.match(restriction_header_pattern, next_content_string) or re.match(
396409
special_approvals_pattern, next_content_string
397410
):
398411
break
399-
restriction_list.append(next_content_string)
412+
if re.match(r".*\(.*\)", next_content_string):
413+
restriction_list.append(next_content_string.strip())
414+
next_content_string = ""
400415
i += 1
401416
return restrictions_data
402417

app/scrapers/sis_scraper.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ async def process_class_details(
9393
crosslists_data = crosslists_task.result()
9494

9595
# Build attribute name to code map
96+
# Attributes are known to be in the format "Attribute Name  CODE"
97+
# Note the double space between name and code
9698
if attribute_name_code_map is not None:
9799
for attribute in attributes_data:
98100
attribute_split = attribute.split()
@@ -113,6 +115,8 @@ async def process_class_details(
113115
attribute_name_code_map[attribute_name] = attribute_code
114116

115117
# Build restriction name to code map
118+
# Restrictions are known to be in the format "Restriction Name (CODE)"
119+
# Note the parentheses around the code
116120
if restriction_name_code_map is not None:
117121
restriction_pattern = r"(.*)\((.*)\)"
118122
for restriction_type in restrictions_data:

0 commit comments

Comments
 (0)