Skip to content

Commit aaef105

Browse files
committed
[LastFm] add tag filtering to sortgenres.py and improve genre filtering
1 parent 71e29a0 commit aaef105

File tree

2 files changed

+55
-3
lines changed

2 files changed

+55
-3
lines changed

Diff for: LastFmMusicBrainz/lastfmcrawler.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,8 @@ def correct_tags(tags): # correct common spelling mistakes
262262
#tags = correct_tags(tags) # correct typos
263263

264264
#track_genres = [x for x in tags if x in genres] # search for tags with a genre
265-
track_genres= [x for x in tags] #use all tags
265+
266+
track_genres= [x for x in tags] #use all tags instead of filtering. Comment out this line if filtering is done during crawling
266267

267268
track_genres = mbcyag.filter_genre_results(track_genres) # filter out duplicates
268269

Diff for: LastFmMusicBrainz/sortgenres.py

+53-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env python3
22
#sorts the genres array in any csv with the genres column
3+
#can work on lastfmdata and spotifydata
34
#usage: python3 sortgenres.py lastfmdata.csv genres.csv genres_meta.csv lastfmdata_sorted.csv
45

56
import csv
@@ -14,6 +15,47 @@
1415
genres=[]
1516
metagenres=[]
1617

18+
# Translation table used by normalizeTag: separator characters become a
# space, apostrophes are deleted.
_TAG_NORMALIZE_TABLE = str.maketrans('-+/._', '     ', "'")


def normalizeTag(tagname):
    """Return *tagname* with separator characters unified to spaces.

    The characters '-', '+', '/', '.' and '_' are each replaced by a single
    space and apostrophes are removed, e.g.::

        rock-pop              -> rock pop
        90's                  -> 90s
        dance+and+electronica -> dance and electronica
        singer/songwriter     -> singer songwriter
        post.rock             -> post rock
        alternative_metal     -> alternative metal

    Runs of separators are not collapsed ('a--b' becomes 'a  b').
    """
    # One translate() pass replaces the six chained replace() calls.
    return tagname.translate(_TAG_NORMALIZE_TABLE)
26+
27+
# Tags that must be kept whole: splitting them into single words would only
# produce meaningless fragments such as '&'.
not_inflate_tags = ['rock & roll']


def inflate_tags(tags):
    """Expand each tag into its individual words followed by the full tag.

    Example: ``'alternative rock'`` -> ``'alternative', 'rock',
    'alternative rock'``.

    * ``bytes`` tags are decoded as UTF-8 before any other processing.
    * Tags listed in ``not_inflate_tags`` are passed through unchanged
      (they are neither split nor dropped).
    * The word ``'classic'`` is removed from the word list of a multi-word
      tag (``'classic rock'`` -> ``'rock', 'classic rock'``).
    * Trailing whitespace is stripped from every emitted entry.

    Returns a new list; the input list is not modified.
    """
    tags_new = []
    for t in tags:
        # Decode first so that the protection check below also applies to
        # tags that arrive as bytes.
        if isinstance(t, bytes):
            t = t.decode('UTF-8')
        if t in not_inflate_tags:
            # Keep protected tags as they are instead of discarding them.
            tags_new.append(t)
            continue
        words = t.split(' ')  # 'alternative rock' -> ['alternative', 'rock']
        if 'classic' in words and len(words) > 1:
            # 'classic' on its own is not a useful genre word.
            words.remove('classic')
        words.append(t)  # the full tag always follows its words
        tags_new.extend(w.rstrip() for w in words)
    return tags_new
44+
45+
46+
# Misspelled / non-canonical tag (or phrase inside a tag) -> canonical form.
_TAG_CORRECTIONS = {
    'electro': 'electronic',
    'electonic': 'electronic',
    'electronic dance': 'electro dance',
    'r&b': 'rnb',
    'electro swing': 'electroswing',
    'synth pop': 'synthpop',
    'ragga': 'reggae',
    'synthie pop': 'synthpop',
    'genre: deep house': 'deep house',
    'rhythm and blues': 'rhythm & blues',
    'hellektro': 'aggrotech',
    'pbrnb': 'alternative rnb',
    '1960s': '60s',
    '1970s': '70s',
    '1980s': '80s',
    '1990s': '90s',
    '00s': '2000s',
    'hiphop': 'hip hop',
    'triphop': 'trip hop',
    'edm': 'electronic dance music',
    'psy trance': 'psytrance',
    'eurotrance': 'euro trance',
    'euro pop': 'europop',
    'euro dance': 'eurodance',
    'rock n roll': 'rock and roll',
    'rock & roll': 'rock and roll',
    'electro pop': 'electropop',
    'pop trance': 'trance pop',
    'punk pop': 'pop punk',
}


def _compile_tag_corrections(corrections):
    """Build one regex matching any key of *corrections* as a whole word/phrase."""
    import re  # function-scope import keeps this section self-contained
    # Longest first, so 'electro swing' wins over its prefix 'electro'.
    longest_first = sorted(corrections, key=len, reverse=True)
    alternatives = '|'.join(re.escape(s) for s in longest_first)
    # The lookarounds stop a match from starting or ending inside a word,
    # so 'electro' is not found in 'electronic' nor '00s' in '2000s'.
    return re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')


_TAG_CORRECTION_RE = _compile_tag_corrections(_TAG_CORRECTIONS)


def correct_tags(tags):  # correct common spelling mistakes
    """Return a new list with common tag misspellings replaced.

    Every entry of ``_TAG_CORRECTIONS`` is matched as a whole word or phrase
    (also inside a longer tag, e.g. 'alternative hiphop' ->
    'alternative hip hop'). All corrections are applied in a single pass, so
    replacement text is never rescanned. This avoids corrupting tags that
    are already correct ('electronic' used to become 'electronicnic' and
    '2000s' became '202000s' with sequential substring replacement).
    """
    def _canonical(match):
        return _TAG_CORRECTIONS[match.group(0)]

    return [_TAG_CORRECTION_RE.sub(_canonical, t) for t in tags]
57+
58+
1759
with open(genresfile, 'r') as genresdata: #all genres
1860
csvgenres=csv.reader(genresdata)
1961
with open(metagenresfile, 'r') as metagenresdata: # all metagenres (subset of genres)
@@ -56,8 +98,17 @@
5698
c=c.replace('"','""') #replace " with double "", csv convention
5799
if ci==genrecolumn: #at genres column
58100

59-
songgenres=c.replace('[','').replace(']','').replace('\'','').split(',') #convert list-string to list
60-
songgenres=[x.lstrip() for x in songgenres] #remove whitespace on the left of each genre
101+
songgenres = c.replace('[','').replace(']','').replace('\'','').split(',') #convert list-string to list
102+
songgenres = [x.lstrip() for x in songgenres] #remove whitespace on the left of each genre
103+
104+
#Filter and normalize tags (if they are not already)
105+
tags = songgenres #handle songgenres as unfiltered tags
106+
tags = inflate_tags(tags) # inflate tags (also fix utf8 tags), example: 'alternative rock' -> 'alternative rock','alternative','rock'
107+
tags = [normalizeTag(t) for t in tags] # normalize tags. for example replace '-' by ' '
108+
tags = correct_tags(tags) # correct typos
109+
110+
track_genres = [x for x in tags if x in genres] # search for tags with a genre. Filters out non-genre tags
111+
61112

62113
#sort genres by genres-list
63114
songgenres_sorted=[]

0 commit comments

Comments
 (0)