|
1 | 1 | #!/usr/bin/env python3
|
2 | 2 | #sorts the genres array in any csv with the genres column
|
| 3 | +#can work on lastfmdata and spotifydata |
3 | 4 | #usage: python3 sortgenres.py lastfmdata.csv genres.csv genres_meta.csv lastfmdata_sorted.csv
|
4 | 5 |
|
5 | 6 | import csv
|
|
14 | 15 | genres=[]
|
15 | 16 | metagenres=[]
|
16 | 17 |
|
# Translation table used by normalizeTag: every separator character maps to a
# single space and the apostrophe is deleted.
_TAG_CHAR_TABLE = str.maketrans('-+/._', '     ', '\'')


def normalizeTag(tagname):
    """Return tagname with separator punctuation unified to spaces.

    The characters '-', '+', '/', '.' and '_' are each replaced by one space
    and apostrophes are removed:

        rock-pop              -> rock pop
        90's                  -> 90s
        dance+and+electronica -> dance and electronica
        singer/songwriter     -> singer songwriter
        post.rock             -> post rock
        alternative_metal     -> alternative metal
    """
    return tagname.translate(_TAG_CHAR_TABLE)
| 26 | + |
# Tags that must be kept as one unit instead of being split into single words.
not_inflate_tags = ['rock & roll']


def inflate_tags(tags):
    """Expand every multi-word tag into its words followed by the full tag.

    Example: 'alternative rock' -> 'alternative', 'rock', 'alternative rock'.

    Rules applied to each tag, in order:
      * bytes tags are decoded as UTF-8 first (some tags arrive as bytes,
        for example for Oasis - Wonderwall), so every later check works on str
      * trailing whitespace is stripped; tags that end up empty are dropped
      * tags listed in not_inflate_tags are passed through unchanged
      * for multi-word tags the word 'classic' is removed once from the word
        list ('classic rock' -> 'rock', 'classic rock')
      * single-word tags are emitted once, not twice

    Args:
        tags: iterable of str (or UTF-8 encoded bytes) tag names.

    Returns:
        A new list of str tags; the input is not modified.
    """
    tags_new = []
    for t in tags:
        # Decode before any comparison: a bytes tag never equals a str entry.
        if isinstance(t, bytes):
            t = t.decode('UTF-8')
        t = t.rstrip()
        if not t:
            continue  # nothing left to inflate
        if t in not_inflate_tags:
            # Protected tag: keep it, but do not split it into words.
            tags_new.append(t)
            continue
        # Drop empty fragments produced by consecutive spaces.
        words = [w for w in t.split(' ') if w]
        if len(words) > 1:
            if 'classic' in words:
                words.remove('classic')
            tags_new.extend(words)
        tags_new.append(t)
    return tags_new
| 44 | + |
| 45 | + |
# Common misspellings and variant spellings mapped to the canonical tag name.
_TAG_CORRECTIONS = {
    'electro': 'electronic',
    'electonic': 'electronic',
    'electronic dance': 'electro dance',
    'r&b': 'rnb',
    'electro swing': 'electroswing',
    'synth pop': 'synthpop',
    'ragga': 'reggae',
    'synthie pop': 'synthpop',
    'genre: deep house': 'deep house',
    'rhythm and blues': 'rhythm & blues',
    'hellektro': 'aggrotech',
    'pbrnb': 'alternative rnb',
    '1960s': '60s',
    '1970s': '70s',
    '1980s': '80s',
    '1990s': '90s',
    '00s': '2000s',
    'hiphop': 'hip hop',
    'triphop': 'trip hop',
    'edm': 'electronic dance music',
    'psy trance': 'psytrance',
    'eurotrance': 'euro trance',
    'euro pop': 'europop',
    'euro dance': 'eurodance',
    'rock n roll': 'rock and roll',
    'rock & roll': 'rock and roll',
    'electro pop': 'electropop',
    'pop trance': 'trance pop',
    'punk pop': 'pop punk',
}


def correct_tags(tags):  # correct common spelling mistakes
    """Return a new list with known tag misspellings replaced.

    Each entry of _TAG_CORRECTIONS is replaced only where it appears as a
    whole word or phrase, so already-correct tags are left intact
    ('electronic' stays 'electronic', '2000s' stays '2000s'). Longer phrases
    take precedence over shorter ones ('electro swing' -> 'electroswing',
    not 'electronic swing'), and the replacement text is never rescanned.

    Args:
        tags: iterable of str tag names.

    Returns:
        A list of corrected tags in the same order as the input.
    """
    import re  # local import keeps the block self-contained

    # Longest alternative first so multi-word rules win over their prefixes.
    alternatives = sorted(_TAG_CORRECTIONS, key=len, reverse=True)
    # Lookarounds instead of \b because some keys contain non-word
    # characters such as '&' and ':'.
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(a) for a in alternatives) + r')(?!\w)'
    )

    def _canonical(match):
        return _TAG_CORRECTIONS[match.group(0)]

    return [pattern.sub(_canonical, t) for t in tags]
| 57 | + |
| 58 | + |
17 | 59 | with open(genresfile, 'r') as genresdata: #all genres
|
18 | 60 | csvgenres=csv.reader(genresdata)
|
19 | 61 | with open(metagenresfile, 'r') as metagenresdata: # all metagenres (subset of genres)
|
|
56 | 98 | c=c.replace('"','""') #replace " with double "", csv convention
|
57 | 99 | if ci==genrecolumn: #at genres column
|
58 | 100 |
|
59 |
| - songgenres=c.replace('[','').replace(']','').replace('\'','').split(',') #convert list-string to list |
60 |
| - songgenres=[x.lstrip() for x in songgenres] #remove whitespace on the left of each genre |
| 101 | + songgenres = c.replace('[','').replace(']','').replace('\'','').split(',') #convert list-string to list |
| 102 | + songgenres = [x.lstrip() for x in songgenres] #remove whitespace on the left of each genre |
| 103 | + |
| 104 | + #Filter and normalize tags (if they are not already) |
| 105 | + tags = songgenres #handle songgenres as unfiltered tags |
| 106 | + tags = inflate_tags(tags) # inflate tags (also fix utf8 tags), example: 'alternative rock' -> 'alternative rock','alternative','rock' |
| 107 | + tags = [normalizeTag(t) for t in tags] # normalize tags. for example replace '-' by ' ' |
| 108 | + tags = correct_tags(tags) # correct typos |
| 109 | + |
| 110 | + track_genres = [x for x in tags if x in genres] # search for tags with a genre. Filters out non-genre tags |
| 111 | + |
61 | 112 |
|
62 | 113 | #sort genres by genres-list
|
63 | 114 | songgenres_sorted=[]
|
|
0 commit comments