Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/data-pipeline/websoc-scraper/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"@icssc/libwebsoc-next": "1.1.1",
"@packages/db": "workspace:*",
"@packages/stdlib": "workspace:*",
"@types/node": "22.9.3",
"cheerio": "1.0.0",
"cross-fetch": "4.0.0"
},
Expand Down
161 changes: 43 additions & 118 deletions apps/data-pipeline/websoc-scraper/src/lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import { and, asc, eq, gte, inArray, lte, sql } from "@packages/db/drizzle";
import type { WebsocSectionFinalExam } from "@packages/db/schema";
import {
calendarTerm,
course,
websocCourse,
websocDepartment,
websocInstructor,
Expand All @@ -36,7 +35,6 @@ import {
parseStartAndEndTimes,
sleep,
} from "@packages/stdlib";
import { load } from "cheerio";

/**
* WebSoc allows us to scrape up to 900 sections per chunk.
Expand All @@ -48,7 +46,7 @@ const SECTIONS_PER_CHUNK = 891;
* Section codes 98000-99999 are reserved for Study Abroad and Registrar testing.
* These are not associated with any department that is searchable directly through WebSoc.
*/
const LAST_SECTION_CODE = "97999";
const LAST_SECTION_CODE = 97999;

const geCategories = [
"GE-1A",
Expand Down Expand Up @@ -78,44 +76,12 @@ const geCategoryToFlag: Record<(typeof geCategories)[number], keyof CourseGEUpda

const geColumns = Object.values(geCategoryToFlag) as string[];

/**
 * Builds the sorted, de-duplicated list of department codes to scrape.
 *
 * The list is the union of:
 *  - the department codes parsed from the WebSoc search form, and
 *  - every distinct `course.department` value already stored in the database.
 *
 * @param db Database handle used to read the `course` table.
 * @returns Department codes in ascending (default string) order, without duplicates.
 */
export async function getDepts(db: ReturnType<typeof database>): Promise<string[]> {
  const response = await fetch("https://www.reg.uci.edu/perl/WebSoc");
  const html = await response.text();

  const $ = load(html);

  // Text content of the third <select> inside the second <form> on the page.
  // NOTE(review): presumably this is the department dropdown, one option per line,
  // formatted like "CODE . . . . Full Name" — confirm against the live page.
  const optionsText = $("form").eq(1).find("select").eq(2).text();

  const deptsFromWebsoc = optionsText
    .replace(/\t/g, "")
    .replace(/ {4}/g, "")
    .split("\n")
    .flatMap((line) => {
      const [code, ...rest] = line
        .split(".")
        .filter((part) => part !== " ")
        .map((part) => part.trim());
      // `code` is undefined when the line consisted solely of a " " separator
      // (every piece was filtered out); previously this crashed on `x[0].length`.
      // Lines with a single piece (no dotted separator) were mapped to "ALL" and
      // dropped, as was any entry whose code is literally "ALL".
      if (!code || rest.length === 0 || code === "ALL") return [];
      return [code];
    });

  const rows = await db.select({ department: course.department }).from(course);
  const deptsFromDb = rows.map((row) => row.department);

  return Array.from(new Set([...deptsFromWebsoc, ...deptsFromDb])).toSorted();
}

async function getTermsToScrape(db: ReturnType<typeof database>) {
const now = new Date();
return db
.select({
name: calendarTerm.id,
lastScraped: websocMeta.lastScraped,
lastDeptScraped: websocMeta.lastDeptScraped,
})
.from(calendarTerm)
.leftJoin(websocMeta, eq(websocMeta.name, calendarTerm.id))
Expand Down Expand Up @@ -416,12 +382,7 @@ const courseUpdateSet = Object.fromEntries(
Object.entries(allCourseCols).filter(([key]) => !geColumns.includes(key)),
);

const doChunkUpsert = async (
db: ReturnType<typeof database>,
term: Term,
resp: WebsocResponse,
department: string | null,
) =>
const doChunkUpsert = async (db: ReturnType<typeof database>, term: Term, resp: WebsocResponse) =>
await db.transaction(async (tx) => {
const updatedAt = new Date();
const schools = await tx
Expand Down Expand Up @@ -696,8 +657,10 @@ const doChunkUpsert = async (
});
const websocMetaValues = {
name: termToName(term),
// Update this on every scrape so that, even if this term is only partially scraped,
// the next scraping attempt tries a different term first (if another candidate exists)
// before retrying this one.
lastScraped: updatedAt,
lastDeptScraped: department,
};
await tx
.insert(websocMeta)
Expand Down Expand Up @@ -811,121 +774,83 @@ async function scrapeGEsForTerm(db: ReturnType<typeof database>, term: Term) {
console.log(`Updated GE data for ${updates.size} courses`);
}

export async function scrapeTerm(
db: ReturnType<typeof database>,
term: Term,
departments: string[],
) {
export async function scrapeTerm(db: ReturnType<typeof database>, term: Term) {
const name = termToName(term);
console.log(`Scraping term ${name}`);
const sectionCodeBounds = await db
.execute(
sql<Array<{ section_code: string }>>`
.execute<{ section_code: number }>(
sql<{ section_code: number }>`
SELECT section_code FROM (
SELECT LPAD(section_code::TEXT, 5, '0') AS section_code,
SELECT section_code,
(ROW_NUMBER() OVER (ORDER BY section_code)) AS rownum
FROM ${websocSection} WHERE ${websocSection.year} = ${term.year} AND ${websocSection.quarter} = ${term.quarter}
)
WHERE MOD(rownum, ${SECTIONS_PER_CHUNK}) = 0 OR MOD(rownum, ${SECTIONS_PER_CHUNK}) = 1;
WHERE MOD(rownum, ${SECTIONS_PER_CHUNK}) = 0;
`,
)
.then((xs) => xs.map((x) => x.section_code));
if (departments.length) {
console.log(`Resuming scraping run at department ${departments[0]}.`);
for (const department of departments) {
console.log(`Scraping department ${department}`);
const resp = await request(term, {
department,
cancelledCourses: "Include",
}).then(normalizeResponse);
if (resp.schools.length) await doChunkUpsert(db, term, resp, department);
await sleep(1000);
}
} else if (!sectionCodeBounds.length) {
console.log("This term has never been scraped before. Falling back to department-wise scrape.");
for (const department of await getDepts(db)) {
console.log(`Scraping department ${department}`);
const resp = await request(term, {
department,
cancelledCourses: "Include",
}).then(normalizeResponse);
if (resp.schools.length) await doChunkUpsert(db, term, resp, department);
await sleep(1000);
}
} else {
console.log("Performing chunk-wise scrape.");
for (let i = 0; i < sectionCodeBounds.length; i += 2) {
const lower = sectionCodeBounds[i] as `${number}`;
const upper = (sectionCodeBounds[i + 1] ?? LAST_SECTION_CODE) as `${number}`;
await ingestChunk(db, term, lower, upper);
}

console.log("Performing chunk-wise scrape.");
let lastKnownCode = 0;
for (const bound of sectionCodeBounds) {
await ingestChunk(db, term, lastKnownCode + 1, bound);
lastKnownCode = bound;
}

if (lastKnownCode < LAST_SECTION_CODE) {
await ingestChunk(db, term, lastKnownCode + 1, LAST_SECTION_CODE);
}

await scrapeGEsForTerm(db, term);
const lastScraped = new Date();
const values = { name, lastScraped, lastDeptScraped: null };
await db.transaction(async (tx) => {
await tx
.insert(websocMeta)
.values(values)
.onConflictDoUpdate({ target: websocMeta.name, set: values });
});
const values = { name, lastScraped: new Date() };
await db
.insert(websocMeta)
.values(values)
.onConflictDoUpdate({ target: websocMeta.name, set: values });
}

async function ingestChunk(
db: ReturnType<typeof database>,
term: Term,
lower: `${number}`,
upper: `${number}`,
lower: number,
upper: number,
) {
const sectionCodes = `${lower}-${upper}`;
console.log(`Scraping chunk ${sectionCodes}`);
const codeRangePretty = `${lower.toString().padStart(5, "0")}-${upper.toString().padStart(5, "0")}`;
console.log(`Scraping chunk ${codeRangePretty}`);
try {
const resp = await request(term, {
sectionCodes,
sectionCodes: codeRangePretty,
cancelledCourses: "Include",
}).then(normalizeResponse);
if (resp.schools.length) await doChunkUpsert(db, term, resp, null);
if (resp.schools.length) await doChunkUpsert(db, term, resp);
await sleep(1000);
} catch (e) {
/*
assuming network, etc. conditions are fine, we have more than 900 sections here
this means we somehow overran our 1% tolerance
that's okay; we can be suboptimal this time so we get all the sections that exist.
we're going to recompute the chunks at the start of the next scrape,
so that one will run optimally, given no such failure occurs again

we're going to bisect this chunk and try the two halves separately; eventually,
we'll have <= 900 valid sections in a chunk and we'll be in the clear
*/
const lowerInt = Number.parseInt(lower, 10);
const upperInt = Number.parseInt(upper, 10);
const rangeLength = upperInt - lowerInt + 1;
// this isn't necessarily fatal; it's possible that we would have gotten more than 900 sections, which is disallowed
// let's just try again here; after the first scrape of this term, we'll get the code ranges right >95% of the time
const rangeLength = upper - lower + 1;
if (rangeLength < 900) {
// okay, no way this was a chunk overrun
throw e;
}

console.log(`Chunk ${sectionCodes} failed (probably too large); bisecting and trying again...`);
console.log(
`Chunk ${codeRangePretty} failed (probably too large); bisecting and trying again...`,
);

const middleInt = lowerInt + Math.floor((upperInt - lowerInt) / 2);
await ingestChunk(db, term, lower, middleInt.toString().padStart(5, "0") as `${number}`);
await ingestChunk(db, term, (middleInt + 1).toString().padStart(5, "0") as `${number}`, upper);
const middle = lower + Math.floor((upper - lower) / 2);
await ingestChunk(db, term, lower, middle);
await ingestChunk(db, term, middle + 1, upper);
}
}

export async function doScrape(db: ReturnType<typeof database>) {
console.log("websoc-scraper starting");
const termsInDatabase = await getTermsToScrape(db);
console.log(termsInDatabase);
const term = termsInDatabase.find((x) => x.lastDeptScraped !== null) ?? termsInDatabase[0];
const term = termsInDatabase[0];
if (term?.name) {
try {
const departments = await getDepts(db);
await scrapeTerm(
db,
nameToTerm(term.name),
term?.lastDeptScraped ? departments.slice(departments.indexOf(term.lastDeptScraped)) : [],
);
await scrapeTerm(db, nameToTerm(term.name));
} catch (e) {
console.error(e);
}
Expand Down
2 changes: 1 addition & 1 deletion apps/data-pipeline/websoc-scraper/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"extends": "../../../tsconfig.json",
"compilerOptions": {
"baseUrl": ".",
"types": ["./worker-configuration.d.ts"],
"types": ["./worker-configuration.d.ts", "node"],
"paths": {
"$lib": ["./src/lib.ts"]
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ALTER TABLE "websoc_meta" DROP COLUMN IF EXISTS "last_dept_scraped";
Loading
Loading