This repository has been archived by the owner on Oct 2, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.rb
81 lines (77 loc) · 2.83 KB
/
scrape.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
require 'json'
require 'mechanize'
require 'mongo'
require 'nokogiri'
require 'thread'
require 'yaml'
# ExploreCourses "print" view fetched by getCourses. The query string asks for
# page 0 with q=%25 (a URL-encoded "%"), the Active course-status filter on and
# descriptions on.
# NOTE(review): "%" is presumably a match-everything wildcard, and "catalog="
# appears twice in the query string -- confirm both are intentional.
EXPLORECOURSES_URL = 'http://explorecourses.stanford.edu/print?page=0&q=%25&catalog=&filter-coursestatus-Active=on&descriptions=on&collapse=&academicYear=&catalog='
# Builds the Mechanize agent used for scraping.
#
# 'Mac Safari' is the name of one of Mechanize's built-in user-agent aliases,
# not a User-Agent string. It is therefore applied through user_agent_alias=,
# which makes Mechanize send the full Safari User-Agent header; assigning it
# to user_agent would send the literal text "Mac Safari" instead.
#
# Returns the configured Mechanize instance.
def getMechanizeInstance
  agent = Mechanize.new
  agent.user_agent_alias = 'Mac Safari'
  agent
end
# Fetches the ExploreCourses listing and appends one Hash per
# "div.searchResult" element to +courses_array+.
#
# Each Hash uses String keys:
#   "number"      - text of span.courseNumber with a trailing ':' removed
#   "title"       - text of span.courseTitle
#   "description" - text of div.courseDescription
#   "attributes"  - Hash built by parseCourseAttributes from the first
#                   div.courseAttributes element
#   "instructors" - Array with the text of every link inside .courseAttributes
# The first four keys are only present when the matching element exists;
# "instructors" is always present (possibly empty).
#
# m             - agent responding to #get(url); the result must respond to #body.
# courses_array - Array that receives the course Hashes (mutated in place).
#
# Callers should read the results from +courses_array+.
def getCourses(m, courses_array)
  source = Nokogiri::HTML(m.get(EXPLORECOURSES_URL).body)
  source.css("div.searchResult").each do |course|
    c = {}

    number = course.at_css("span.courseNumber")
    c["number"] = number.content.chomp(':') if number

    title = course.at_css("span.courseTitle")
    c["title"] = title.content if title

    description = course.at_css("div.courseDescription")
    c["description"] = description.content if description

    attributes = course.at_css("div.courseAttributes")
    c["attributes"] = parseCourseAttributes(attributes.content) if attributes

    c["instructors"] = course.css(".courseAttributes a").map(&:content)
    courses_array << c
  end
end

# Parses the text of a course attribute block into a Hash keyed by the
# lower-cased attribute name.
#
# The text is a '|'-separated list of "Name: value, value" items:
#   "units"   => {"lower" => Integer, "upper" => Integer} (see parseUnits)
#   "grading" => Array of the options separated by the word "or"
#   other     => Array of the stripped values that follow the name
# When a name occurs more than once the last occurrence wins.
def parseCourseAttributes(text)
  parsed = {}
  text.strip.gsub(/\r\n/m, "").split('|').each do |item|
    details = item.strip.split(/[:,]/)
    # A blank segment (e.g. "a | | b") carries no name; skip it rather than
    # failing on nil.
    next if details.empty?

    key = details.first.downcase
    parsed[key] =
      case key
      when "units"
        parseUnits(details[1])
      when "grading"
        # Split on the standalone word "or" only. A plain split("or") also
        # cuts inside words such as "Satisfactory".
        details[1].to_s.split(/\s+or\s+/).map(&:strip)
      else
        details.drop(1).map(&:strip)
      end
  end
  parsed
end

# Converts a units value such as " 3-5" or " 4" into
# {"lower" => Integer, "upper" => Integer}. A single number is used for both
# bounds; a missing or empty value yields 0 for both.
def parseUnits(raw)
  bounds = raw.to_s.split("-").map { |bound| bound.strip.to_i }
  { "lower" => bounds.first.to_i, "upper" => bounds.last.to_i }
end
# Scrape the course listing once and dump the collected course hashes to
# courses_dump.yml as YAML.
agent = getMechanizeInstance
courses = []
getCourses(agent, courses)
File.open("courses_dump.yml", "w") { |out| out.write(YAML.dump(courses)) }