# ArtistScraper.py
'''
This script scrapes the Spotify API for all songs by all artists in a given
list. In this case, the artists are those listed on the Wikipedia page
"List of most-streamed artists on Spotify"
(https://en.wikipedia.org/wiki/List_of_most-streamed_artists_on_Spotify), taken
from both the most-monthly-listeners and the most-followed tables (as listed on
February 21, 2020).
'''
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time
import numpy as np
import re
def access_api(client_id, client_secret):
    """Create the Spotify client object used for every API request.

    Args:
        client_id: Client ID passed to ``SpotifyClientCredentials``.
        client_secret: Client secret passed to ``SpotifyClientCredentials``.

    Returns:
        A ``spotipy.Spotify`` instance configured with those credentials.
    """
    credentials = SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    )
    return spotipy.Spotify(client_credentials_manager=credentials)
def get_features_by_track_list(sp, track_id_list):
    """Fetch audio features for a list of track IDs.

    ``sp.audio_features`` is queried with at most 100 tracks per request.
    Longer lists are split into consecutive batches so that every track is
    processed instead of everything past the first 100 being dropped.

    Args:
        sp: Spotify client exposing ``audio_features``.
        track_id_list: Sequence of bare track IDs (without the
            ``spotify:track:`` prefix).

    Returns:
        A list holding the results of all batch requests, concatenated in
        input order. An empty input yields an empty list and no request.
    """
    batch_size = 100  # upper bound on tracks sent in a single request
    features = []
    for start in range(0, len(track_id_list), batch_size):
        batch = [
            'spotify:track:{}'.format(track_id)
            for track_id in track_id_list[start:start + batch_size]
        ]
        # Treat a missing response as "no features" so extend() cannot fail.
        features.extend(sp.audio_features(batch) or [])
    return features
def get_albums_by_artist(sp, artist_uri):
    """Collect the names and URIs of all albums listed for an artist.

    The album listing is paginated, so every page is followed through its
    ``next`` link rather than reading only the first page.

    Args:
        sp: Spotify client exposing ``artist_albums`` and ``next``.
        artist_uri: URI of the artist whose albums are requested.

    Returns:
        A tuple ``(album_names, album_uris)`` of two parallel lists, where
        position ``i`` in each list refers to the same album.
    """
    album_names = []
    album_uris = []
    page = sp.artist_albums(artist_uri)
    while page:
        for item in page['items']:
            album_names.append(item['name'])
            album_uris.append(item['uri'])
        # A falsy or absent 'next' entry marks the last page.
        page = sp.next(page) if page.get('next') else None
    return album_names, album_uris
def get_features_for_artist(sp, artist):
    """Gather track listings and audio features for every album of an artist.

    The artist is looked up by name and the first search hit is used. Each
    album is first filled with its tracks via ``album_songs`` and then
    enriched via ``audio_features``. After every fifth album the function
    sleeps for a random 2-5 seconds and prints progress information.

    Args:
        sp: Spotify client exposing ``search`` plus whatever the helper
            functions call on it.
        artist: Artist name used as the search query.

    Returns:
        A dict keyed by album URI whose values are dicts of per-track lists.
        An empty dict is returned when the search yields no artist.
    """
    result = sp.search(artist, type='artist')  # search query
    matches = result['artists']['items']
    if not matches:
        # Without this guard, indexing the first hit raises IndexError.
        print("No Spotify artist found for " + str(artist))
        return {}
    artist_uri = matches[0]['uri']

    album_names, album_uris = get_albums_by_artist(sp, artist_uri)

    spotify_albums = {}
    for album_count, uri in enumerate(album_uris):  # each album
        spotify_albums = album_songs(
            sp, uri, spotify_albums, artist, album_names, album_count
        )
        print("Processed " + str(album_names[album_count]))

    sleep_min = 2
    sleep_max = 5
    start_time = time.time()
    # Iterate over a snapshot of the keys because the name is rebound below.
    for request_count, album in enumerate(list(spotify_albums), start=1):
        spotify_albums = audio_features(sp, album, spotify_albums)
        if request_count % 5 == 0:
            # Pause for a random interval after every fifth album.
            print(str(request_count) + " ablums completed")
            time.sleep(np.random.uniform(sleep_min, sleep_max))
            print('Loop #: {}'.format(request_count))
            print('Elapsed Time: {} seconds'.format(time.time() - start_time))
    return spotify_albums
def feature_df(spotify_albums):
    """Flatten the per-album track data into a single DataFrame.

    Args:
        spotify_albums: Dict keyed by album whose values map a column name
            to a list of per-track values. Every column name must be one of
            the columns listed below, otherwise a ``KeyError`` is raised.

    Returns:
        A DataFrame with one row per distinct track name. Where a name
        occurs more than once, the row with the highest popularity is kept.
        Rows are ordered by their index in the concatenated data.
    """
    columns = (
        'album', 'artist', 'track_number', 'id', 'name', 'uri',
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
        'popularity',
    )
    collected = {column: [] for column in columns}

    for album_data in spotify_albums.values():
        for feature, values in album_data.items():
            collected[feature].extend(values)

    frame = pd.DataFrame.from_dict(collected)
    # Sorting by popularity first makes drop_duplicates keep the most
    # popular row for each track name.
    by_popularity = frame.sort_values('popularity', ascending=False)
    unique_names = by_popularity.drop_duplicates('name')
    return unique_names.sort_index()
def album_songs(sp, uri, spotify_albums, artist, album_names, album_count):
    """Record the track listing of one album inside ``spotify_albums``.

    Args:
        sp: Spotify client exposing ``album_tracks``.
        uri: Album URI; also used as the key in ``spotify_albums``.
        spotify_albums: Dict that is updated in place with the new album.
        artist: Artist name stored alongside every track.
        album_names: List of album names.
        album_count: Index into ``album_names`` for this album's name.

    Returns:
        The same ``spotify_albums`` dict, now holding an entry for ``uri``
        with the lists ``album``, ``artist``, ``track_number``, ``id``,
        ``name`` and ``uri`` (one element per track).
    """
    # Register the album with empty lists before querying its tracks.
    entry = {
        field: []
        for field in ('album', 'artist', 'track_number', 'id', 'name', 'uri')
    }
    spotify_albums[uri] = entry

    tracks = sp.album_tracks(uri)  # pull data on album tracks
    for item in tracks['items']:
        entry['album'].append(album_names[album_count])
        entry['artist'].append(artist)
        entry['track_number'].append(item['track_number'])
        entry['id'].append(item['id'])
        entry['name'].append(item['name'])
        entry['uri'].append(item['uri'])
    return spotify_albums
def audio_features(sp, album, spotify_albums):
    """Add audio features and popularity for every track of one album.

    For each track URI stored under ``spotify_albums[album]['uri']`` the
    audio features and the track record are requested, and their values are
    appended to per-feature lists on the album entry.

    Args:
        sp: Spotify client exposing ``audio_features`` and ``track``.
        album: Key of the album inside ``spotify_albums``.
        spotify_albums: Dict of album entries; updated in place.

    Returns:
        The same ``spotify_albums`` dict. The album entry gains one list per
        audio feature plus ``popularity``, each as long as the ``uri`` list.
        When no audio features are returned for a track, ``None`` is stored
        for each feature so the lists stay aligned.
    """
    feature_names = (
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
    )
    album_data = spotify_albums[album]

    # Add new key-values to store audio features
    for name in feature_names:
        album_data[name] = []
    album_data['popularity'] = []

    for track in album_data['uri']:
        # pull audio features per track
        features = sp.audio_features(track)
        # The response may be empty or hold None for a track; subscripting
        # it blindly would raise TypeError and abort the whole run.
        track_features = features[0] if features else None
        for name in feature_names:
            value = None if track_features is None else track_features[name]
            album_data[name].append(value)
        # popularity is stored on the track record, not in the features
        pop = sp.track(track)
        album_data['popularity'].append(pop['popularity'])
    return spotify_albums
def save_csv(df):
    """Write ``df`` to ``artist_output.csv`` without the index column.

    Args:
        df: Object providing ``to_csv`` (a pandas DataFrame in this script).
    """
    output_path = 'artist_output.csv'
    with open(output_path, mode='w', encoding="utf-8", newline='') as handle:
        df.to_csv(handle, index=False)
def main():
    """Scrape track features for all listed artists and save them as CSV.

    Credentials are read from the ``SPOTIPY_CLIENT_ID`` and
    ``SPOTIPY_CLIENT_SECRET`` environment variables so that no secret has to
    be written into the source file. Artist names are taken from the
    ``Artist`` column of the two input CSV files and de-duplicated, keeping
    first-seen order.

    Raises:
        RuntimeError: If either credential environment variable is unset.
    """
    import os  # local import: only this entry point reads the environment

    client_id = os.environ.get('SPOTIPY_CLIENT_ID')
    client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise RuntimeError(
            'Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before running.'
        )
    sp = access_api(client_id, client_secret)

    top_listened = pd.read_csv('top_monthly_listeners_feb_2020.csv')
    top_followed = pd.read_csv('most_followed_feb_2020.csv')
    top_artists = top_listened['Artist'].tolist()
    top_artists.extend(top_followed['Artist'].tolist())
    # dict.fromkeys removes duplicates while keeping a stable order, so
    # repeated runs process the artists in the same sequence.
    top_artists = list(dict.fromkeys(top_artists))

    frames = []
    for artist in top_artists:
        spotify_albums = get_features_for_artist(sp, artist)
        frames.append(feature_df(spotify_albums))

    # Concatenate once at the end; pd.concat rejects an empty list.
    df = pd.concat(frames) if frames else pd.DataFrame()
    save_csv(df)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()