webscraper/scrape_fields.py at master · iu-cewit/webscraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# This script scrapes data from a website listing field categories, fields, and
# majors. It uses dryscrape to render the JavaScript-driven page and works in
# Python 2.7 only.

from web_list_scraper import *
import dryscrape
import re


def main():
    """Scrapes the majors-careers page and writes {major: category} rows
    to 'fields.csv'

    The page is rendered with get_soup_js, the majors tree is isolated with
    get_lists, and every major is mapped to the top-level category whose
    heading precedes it in the tree.

    none -> none"""
    page = 'https://bigfuture.collegeboard.org/majors-careers'
    all_soup = get_soup_js(page)

    # get just the part of the page with majors
    root_class = 'treeview major-categories-treeview tree-root'
    major_tree = get_lists(all_soup, root_class)

    # re-parse the extracted tree and prettify it; the code below relies on
    # each tag and each piece of text sitting on its own line
    major_soup = bs4.BeautifulSoup(str(major_tree))
    pretty = major_soup.prettify()
    pretty_list = [x.strip() for x in pretty.split('\n')]

    # the text of a link is the line right after its opening <a ...> tag;
    # pairing each line with its successor means a tag on the very last line
    # can never index past the end of the list
    majors = [following
              for line, following in zip(pretty_list, pretty_list[1:])
              if line.startswith('<a')]

    # all categories, branches, and majors: every line starting with a letter
    # (slicing with [:1] keeps blank lines from raising IndexError)
    all_text = [element for element in pretty_list if element[:1].isalpha()]

    categories = ['Arts and Humanities', 'Business', 'Health and Medicine',
                  'Multi-/Interdisciplinary Studies',
                  'Public and Social Services',
                  'Science, Math, and Technology', 'Social Sciences',
                  'Trades and Personal Services']

    fields = _map_majors_to_categories(majors, all_text, categories)
    write_dict(fields, ['Major', 'Category'], 'fields.csv')


def _map_majors_to_categories(majors, all_text, categories):
    """Returns {major: category} based on where each major sits in all_text

    A major belongs to the last category heading that appears at or before
    the major's first occurrence in all_text. Majors that appear before the
    first heading are filed under the first category. Majors that do not
    occur in all_text at all, and categories that are not found, are skipped.

    list of str, list of str, list of str -> dict"""
    import bisect

    # first position of every text line, computed once instead of rescanning
    # the list for every lookup
    first_seen = {}
    for position, text in enumerate(all_text):
        first_seen.setdefault(text, position)

    # category headings ordered by where they appear on the page, not by name
    starts = sorted((first_seen[cat], cat)
                    for cat in categories if cat in first_seen)
    if not starts:
        return {}
    start_positions = [position for position, _ in starts]

    fields = {}
    for major in majors:
        position = first_seen.get(major)
        if position is None:
            # link text that is not a plain text line cannot be placed
            continue
        # number of headings at or before this position, minus one, is the
        # heading that owns it; this also covers the last category, which
        # has no following heading to compare against
        slot = bisect.bisect_right(start_positions, position) - 1
        fields[major] = starts[max(slot, 0)][1]
    return fields


def write_dict(data_dict, fieldnames, filename):
    """Writes the contents of the dict to a csv file

    A header row made of fieldnames is written first, followed by one row
    per dict entry: the key goes under the first field name and the value
    under the second.

    dict, list of str, str -> none"""
    with open(filename, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames)
        writer.writeheader()
        for item in data_dict.items():
            # pair (key, value) with the caller's column names rather than
            # hard-coded ones, so the rows always match the header
            writer.writerow(dict(zip(fieldnames, item)))


def get_soup_js(url):
    """Loads url in a dryscrape session and returns the resulting page
    body parsed as soup

    str -> soup"""
    browser = dryscrape.Session()
    browser.visit(url)
    # body() is read only after visit() so the session has loaded the page
    return bs4.BeautifulSoup(browser.body())


def write_soup(soup, filename):
    """Writes the string form of soup to a text file, replacing any
    existing contents

    obj, str -> none"""
    with open(filename, 'w') as outfile:
        text = str(soup)
        outfile.write(text)


def get_index(item, list_of_items):
    """Returns the index number of the first occurrence of an item in a
    list, or None when the item is not in the list

    obj, list -> int"""
    # one pass with enumerate replaces the separate membership test and the
    # hand-maintained counter
    for i, obj in enumerate(list_of_items):
        if obj == item:
            return i
    return None


# run the scraper only when this file is executed as a script, not when it
# is imported
if __name__ == '__main__':
    main()