-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_music.py
102 lines (70 loc) · 2.84 KB
/
scrape_music.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from urllib.parse import urljoin
from urllib.request import Request, urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
def get_html_text(html):
    """Return the ``text`` attribute of *html*, or NaN when it has none.

    Used on the result of a ``find`` call, which may be ``None`` when the
    element is absent; in that case ``np.nan`` is returned so the value can
    sit in a data frame column as a missing entry.
    """
    # getattr with a default swallows only AttributeError, like the
    # explicit try/except it replaces.
    return getattr(html, "text", np.nan)
def parse_title(html_title):
    """Split a page title into ``(publication, year)``.

    The publication is everything before the first ``'`` in the title text
    (NaN when the title contains no apostrophe). The year is simply the
    last four characters of the title text, returned as a string.
    """
    text = html_title.text
    # partition returns an empty separator when no apostrophe is present.
    leading, apostrophe, _ = text.partition("'")
    publication = leading if apostrophe else np.nan
    # presumably titles end with the four-digit year -- not validated here
    return publication, text[-4:]
def scrape_page(soup):
    """Scrape one album-list page and convert it to a data frame.

    Args:
        soup: Parsed page exposing ``find`` and ``find_all`` (a BeautifulSoup
            document).

    Returns:
        A DataFrame with one row per ``albumListRow`` element and the columns
        ``albumListDate``, ``albumListGenre``, ``scoreText``, ``scoreValue``,
        ``albumRank``, ``albumArtist``, ``albumTitle``, ``listSize``,
        ``publication`` and ``year``. Returns ``None`` when the page title
        contains "so far" or when the page has no album rows.
    """
    title = soup.find("title")
    # Exclude non year-end lists
    if "so far" in title.text.lower():
        return None

    rows = soup.find_all(class_="albumListRow")
    if not rows:
        # With no rows the split/expand steps below yield no columns to
        # assign, so treat the page like any other skipped page.
        return None

    pub, year = parse_title(title)

    # One column per CSS class; a missing element becomes NaN.
    collect_terms = ["albumListTitle", "albumListDate", "albumListGenre", "scoreText", "scoreValue"]
    page_data = {
        term: [get_html_text(row.find(class_=term)) for row in rows]
        for term in collect_terms
    }
    page_df = pd.DataFrame.from_dict(page_data)

    # The list title is split at its first "." into the rank and the rest.
    page_df[["albumRank", "albumTitleArtist"]] = page_df.albumListTitle.str.split(".", n=1, expand=True)
    # The rest is split at its LAST "-" into artist and album title.
    # NOTE(review): an album title that itself contains "-" would be split
    # at that hyphen instead -- confirm against real data.
    page_df[["albumArtist", "albumTitle"]] = page_df.albumTitleArtist.str.rsplit("-", n=1, expand=True)
    page_df.drop(columns=["albumListTitle", "albumTitleArtist"], inplace=True)

    # Page-level values are repeated on every row.
    page_df['listSize'] = len(rows)
    page_df['publication'] = pub
    page_df['year'] = year
    return page_df
def get_year_links(year_soup):
    """Return the links to all year-end lists found in a year's index page.

    Args:
        year_soup: Parsed index page exposing ``find_all`` (a BeautifulSoup
            document).

    Returns:
        A list of absolute URLs, one per ``criticListBlockTitle`` element
        that contains an anchor with an ``href``. Elements without a usable
        anchor are skipped.
    """
    base_page = "https://www.albumoftheyear.org"
    links = []
    for block in year_soup.find_all(class_="criticListBlockTitle"):
        anchor = block.a
        href = anchor.attrs.get("href") if anchor is not None else None
        if not href:
            # A title block without a link cannot be followed; skip it
            # rather than aborting the whole year.
            continue
        # urljoin handles root-relative, relative and absolute hrefs alike.
        links.append(urljoin(base_page, href))
    return links
def scrape_year(year):
    """Scrape every list linked from a year's index page into one data frame.

    Args:
        year: Year substituted into the ``lists.php?y=`` index URL.

    Returns:
        The concatenation of the data frames produced by ``scrape_page`` for
        each linked list. Pages for which ``scrape_page`` returns ``None``
        are left out; if nothing remains an empty DataFrame is returned.
    """
    url = "https://www.albumoftheyear.org/lists.php?y={}".format(year)
    list_links = get_year_links(_fetch_soup(url))

    page_frames = []
    for link in list_links:
        page_df = scrape_page(_fetch_soup(link))
        if page_df is not None:
            page_frames.append(page_df)

    if not page_frames:
        # pd.concat raises on an empty or all-None list; an empty frame lets
        # callers keep concatenating across years.
        return pd.DataFrame()
    return pd.concat(page_frames)


def _fetch_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup document."""
    # presumably the site rejects urllib's default User-Agent -- verify
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response once its body has been parsed.
    with urlopen(req) as response:
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        return BeautifulSoup(response, "html.parser")
def scrape_years(year_range):
    """Perform scrape_year across multiple years.

    Args:
        year_range: Iterable of years; each is passed to ``scrape_year``.

    Returns:
        A pandas DataFrame concatenating the per-year results, or an empty
        DataFrame when ``year_range`` yields no years.
    """
    yearly_frames = []
    for year in year_range:
        yearly_frames.append(scrape_year(year))
        # Progress indicator: printed after each year has been scraped.
        print(year)

    if not yearly_frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(yearly_frames)
if __name__ == "__main__":
    # Scrape the years 2000 through 2017 (the end of range() is exclusive)
    # and write the combined table to a CSV file in the working directory.
    year_range = range(2000, 2018)
    all_lists_df = scrape_years(year_range)
    all_lists_df.to_csv("test_df.csv")