'''RP_Feature_IO

2015 by Thomas Lidy

Load and save feature files created by RP_extract;
manipulate, merge, cut and align features, plus other helper functions.

Supported formats:
read and write: CSV, ARFF
read only currently: HDF5, NPZ (Numpy Pickle)

Note: this module is written for Python 2 and an older pandas API
(it uses print statements, DataFrame.as_matrix and .ix).
'''
import os
import sys
import pandas as pd
# == CSV ==
def check_duplicates(file_ids):
'''check for duplicates in file_ids from CSV or feature files'''
dup = set([x for x in file_ids if file_ids.count(x) > 1])
if len(dup) > 0:
print >> sys.stderr, "DUPLICATE(S):", "; ".join(list(dup))
raise ValueError("Duplicate entries in file ids!")
def read_csv_features1(filename,separate_ids=True,id_column=0):
    ''' Read_CSV_features1

    read features (and optionally one or more ids) from a CSV file

    Note: in write_feature_files we use unicsv to store the features.
    It will quote strings containing , and other characters only if needed (automatically).
    Here we use pandas to import the CSV as a pandas dataframe,
    because it handles quoted filenames (containing ,) well (by contrast to other CSV readers).

    Parameters:
    :param filename: filename path to CSV file to read from
    :param separate_ids: will split off the id column(s) from the features (containing e.g. an index or filename string)
    :param id_column: specify which is/are the id column(s) as integer or list, e.g. 0 (= first column) or [0,1] (= first two columns)
           negative integers identify the columns backwards, i.e. -1 is the last column, -2 the second last column, and so on
    :return: if separate_ids is True, it will return a tuple (ids, features)
             with ids usually containing identifiers as a list of strings and features the numeric data as a numpy array;
             if separate_ids is False, just the features array will be returned (containing everything read from the CSV)
    '''
import numpy as np
import pandas as pd
# we use pandas to import CSV as pandas dataframe,
# because it handles quoted filenames (containing ,) well (by contrast to other CSV readers)
dataframe = pd.read_csv(filename, sep=',',header=None)
# future option: this would be a way to set the file ids as index in the dataframe
# dataframe = pd.read_csv(filename, sep=',',header=None,index_col=0) # index_col=0 makes 1st column the rowname (index)
# convert to numpy matrix/array
feat = dataframe.as_matrix(columns=None)
if separate_ids:
ids = feat[:,id_column]
feat = np.delete(feat,id_column,1).astype(np.float) # delete id columns and return feature vectors as float type
return (ids,feat)
else:
return feat
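
# Usage sketch (hypothetical file name; assumes 'features.rh' was written by
# RP_extract's CSV writer with the audio file name in the first column):
#   ids, feat = read_csv_features1('features.rh')
#   # ids: array of file name strings; feat: float matrix with one row per file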
def read_csv_features(filenamestub,ext,separate_ids=True,id_column=0,single_id_list=False,verbose=True):
    ''' Read_CSV_features:

    read pre-analyzed features from multiple CSV files (with feature name extensions)

    Parameters:
    # filenamestub: full path to feature file name WITHOUT .extension
    # ext: a list of file .extensions (e.g. 'rh','ssd','rp') to be read in
    # separate_ids: if False, it will return a single matrix containing the id column
    #               if True, it will return a tuple: (ids, features) separating the id column from the features
    # id_column: which of the CSV columns contains the ids (default = 0, i.e. first column)
    # single_id_list: if separate_ids and single_id_list are True, this will return a single id list instead of a dictionary
    #
    # returns: if separate_ids == False:
                   a Python dict containing one entry per feature extension (ext), which is
                   a NumPy matrix containing all data including ids
               if separate_ids == True:
                   a tuple of (ids, features) where both are Python dicts containing one entry per feature extension (ext)
                   ids: all the ids as a NumPy array containing strings
                        (if single_id_list == True, ids will be reduced to a single Python list, because the arrays are supposed to be identical)
                   features: NumPy matrix containing all numeric feature data
    '''
# initialize empty dicts
feat = {}
ids = {}
for e in ext:
filename = filenamestub + "." + e
if separate_ids:
ids[e], feat[e] = read_csv_features1(filename,separate_ids,id_column)
else:
feat[e] = read_csv_features1(filename,separate_ids,id_column)
if verbose: print "Read:", e + ":\t", feat[e].shape[0], "audio file vectors,", feat[e].shape[1], "dimensions"
if separate_ids:
# check for ID consistency
ext = ids.keys()
for e in ext[1:]:
if not len(ids[ext[0]]) == len(ids[e]):
raise ValueError("Feature files have different number of entries! " +
", ".join([e + ': ' + str(len(ids[e])) for e in ext]) )
if not all(ids[ext[0]] == ids[e]):
raise ValueError("Ids not matching across feature files!")
# once consistent, check for duplicates
check_duplicates(ids[ext[0]].tolist())
if single_id_list:
# from the ids dict, we take only the first entry and convert the NumPy array to a list
ids = ids.values()[0].tolist()
return(ids,feat)
else:
return feat
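
# Usage sketch (hypothetical file name stub; assumes 'GTZAN.rh' and 'GTZAN.ssd'
# exist and list the same audio files in the same order):
#   ids, feat = read_csv_features('GTZAN', ('rh','ssd'), single_id_list=True)
#   # ids: plain list of file names; feat['rh'], feat['ssd']: row-aligned matrices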
def read_multiple_feature_files(list_of_filenames, common_path = '', feature_types =('rh','ssd','rp'), verbose=True):
    '''Reads multiple feature input files and appends them to make a single feature matrix per feature type
    and a single list of file ids.

    list_of_filenames: Python list with file names (without extension) of multiple feature files (absolute or relative path)
    common_path: will be prepended to each file name, unless it is '' (default), in which case file names are treated as absolute
    feature_types: feature types (i.e. file extensions) to be read in
    returns: tuple of (ids, features), where ids is a list of the file names belonging to the feature vectors,
             and features is a dict with one entry per feature type, each a NumPy array containing feature vectors row-wise
    '''
import numpy as np
if isinstance(list_of_filenames, str): # make list if only 1 string is passed
list_of_filenames = [list_of_filenames]
ids_out = []
feat_out = {}
for filen in list_of_filenames:
        ids, feat = read_csv_features(os.path.join(common_path, filen), feature_types, verbose=verbose, single_id_list=True)
ids_out.extend(ids)
for e in feat.keys(): # for each element in dict add to the feat_out dict
if e not in feat_out.keys(): # first file: create array
feat_out[e] = feat[e]
else: # others: append
feat_out[e] = np.append(feat_out[e],feat[e],axis=0)
return ids_out, feat_out
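
# Usage sketch (made-up file name stubs): stacking two feature sets row-wise
#   ids, feat = read_multiple_feature_files(['part1','part2'], common_path='/data/feat',
#                                           feature_types=('rh','ssd'))
#   # feat['rh'] contains the rows of part1.rh followed by the rows of part2.rh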
# == ARFF ==
# load_arff
# read arff file and bring to Numpy array format
# returns a tuple (features,classes)
def load_arff(arff_file):
import scipy.io.arff as arff
import numpy as np
arffdata, metadata = arff.loadarff(arff_file)
# GET CLASS DATA # TODO: check if we even have a class attribute
classes = arffdata['class']
# GET FEATURE DATA
# strip off class column and get correct NP 2D array with data
features = arffdata[metadata.names()[:-1]] #everything but the last column # TODO: check which column is the class column and remove that one
features = features.view(np.float).reshape(arffdata.shape + (-1,)) #converts the record array to a normal numpy array
return (features,classes)
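
# Usage sketch (hypothetical file name; assumes the class is the last ARFF
# attribute, as written by save_arff below):
#   features, classes = load_arff('GTZAN.python.ssd.arff')
#   # features: (n_files x n_dims) float array; classes: array of class labels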
# save_arff
# save data in Weka ARFF format
# adds typical ARFF file headers (@relation, @attribute), optionally adds class label + writes data into ARFF format
# based on npz2arff arff writing code by Alexander Schindler
def save_arff(filename,dataframe,relation_name=None):
if relation_name is None:
relation_name = filename
out_file = open(filename, 'w')
out_file.write("@Relation {0}\n".format(relation_name))
for column in dataframe:
if column == "ID":
out_file.write("@Attribute ID string\n")
elif column == "class":
class_list = dataframe["class"].unique()
out_file.write("@Attribute class {{{0}}}\n".format(",".join(class_list)))
else:
# assume all other columns are numeric
out_file.write("@Attribute {0} numeric\n".format(column))
# now for the feature data
out_file.write("\n@Data\n")
dataframe.to_csv(out_file, header=None, index=None)
# NumPy variant:
#np.savetxt(out_file, a, delimiter=",")
out_file.close()
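
# For illustration, a dataframe with columns feat0, feat1 and class would produce
# roughly the following ARFF output (values made up):
#
#   @Relation myfeatures.arff
#   @Attribute feat0 numeric
#   @Attribute feat1 numeric
#   @Attribute class {pop,rock}
#
#   @Data
#   0.12,3.45,pop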
# == NPZ (Numpy Pickle) ==
# == HDF5 ==
def load_hdf5(hdf_filename):
store = pd.HDFStore(hdf_filename)
# .as_matrix(columns=None) converts to Numpy array (of undefined data column types)
data = store['data'].as_matrix(columns=None)
store.close()
return(data)
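
# Usage sketch (hypothetical file name; assumes the file was created via
# csv2hdf5 below, i.e. it contains a single 'data' table):
#   data = load_hdf5('features.h5')   # plain Numpy array, incl. id column if one was stored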
# == CONVERSION ==
# Csv2Arff
# convert feature files that are stored in CSV format to Weka ARFF format
# in_filenamestub, out_filenamestub: full file path and filename but without .rp, .rh etc. extension (will be added from feature types) for input and output feature files
# feature_types = ['rp','ssd','rh','mvd']
def csv2arff(in_filenamestub,out_filenamestub,feature_types,add_class=True):
from classes_io import classes_from_filename
ids, features = read_csv_features(in_filenamestub,feature_types)
for ext in feature_types:
# derive the class labels from the audio filenames (see function above)
# THIS VERSION splits by first / or \ (os.sep)
#classes = classes_from_filename(ids[ext]) if add_class else None
# THIS VERSION splits by first '.' as used e.g. in GTZAN collection: pop.00001.wav
classes = classes_from_filename(ids[ext],'.') if add_class else None
# CREATE DATAFRAME
# with ids
#df = to_dataframe(features[ext], None, ids[ext], classes)
# without ids
df = to_dataframe_for_arff(features[ext], classes=classes)
# WRITE ARFF
out_filename = out_filenamestub + "." + ext + ".arff"
print "Saving " + out_filename + " ..."
save_arff(out_filename,df)
print "Finished."
# convert NPZ (Numpy Pickle) to ARFF format
# adapted from Alexander Schindler's npz2arff.py # untested: TODO: test!
def npz2arff(in_file, out_file, relation_name, include_filenames=False):
import numpy as np
npz = np.load(in_file)
# load data
data = pd.DataFrame(npz["data"], columns=npz["attribute_names"])
if include_filenames:
data["ID"] = npz["filenames"]
data["class"] = npz["labels"]
npz.close()
    # reorder columns so that ID (if present) comes first and class comes last,
    # since save_arff writes attributes in dataframe column order
    ordered_indexes = data.columns.tolist()
    ordered_indexes.remove("class")
    if include_filenames:
        ordered_indexes.remove("ID")
        ordered_indexes.insert(0, "ID")
    ordered_indexes.append("class")
    data = data[ordered_indexes]
    save_arff(out_file,data,relation_name)
def csv2hdf5(csv_filename,hdf_filename,chunk_size=1000,verbose=True):
    ''' Csv2Hdf5

    converts a CSV file to HDF5 file format (using Pandas HDFStore)

    Parameters:
    csv_filename: input filename of file to convert
    hdf_filename: output HDF5 filename
    chunk_size: number of rows (i.e. audio files) to read chunk-wise from the CSV and store iteratively in HDF5 (default: 1000)
    verbose: if False, no output will be printed
    '''
import os
#import numpy as np
import pandas as pd
    # delete the output file if it already exists, otherwise HDFStore.append would always add to the old data
if os.path.exists(hdf_filename):
os.remove(hdf_filename)
store = pd.HDFStore(hdf_filename)
# read all at once:
#dataframe = pd.read_csv(csv_filename, sep=',',header=None)
cnt = 0
csv_reader = pd.read_csv(csv_filename, sep=',',header=None,chunksize=chunk_size)
for chunk in csv_reader:
store.append('data', chunk)
cnt += chunk.shape[0]
if verbose: print "processed", cnt, "rows"
store.close()
if verbose: print "Finished."
# == FEATURE MANIPULATION ==
def concatenate_features(feat, feature_types = ('ssd', 'rh')):
    ''' concatenate feature vectors of various feature types together
    (all entries in the feature dictionary must have the same number of instances, but can have different dimensions)

    feat: feature dictionary, containing np.arrays for various feature types (named 'ssd', 'rh', etc.)
    feature_types: tuple of strings with the names of feature types to concatenate (a 1-entry tuple or a plain string is
        allowed; in that case this feature type is returned alone, without concatenation)
    '''
import numpy as np
    # in case only 1 feature type instead of a tuple was passed
if isinstance(feature_types, str):
new_feat = feat[feature_types]
else:
# take the first feature type
new_feat = feat[feature_types[0]]
# and iteratively horizontally stack the remaining feature types
for e in feature_types[1:]:
new_feat = np.hstack((new_feat, feat[e]))
return new_feat
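
# Usage sketch: with e.g. feat['ssd'].shape == (n, 168) and feat['rh'].shape == (n, 60)
# (dimensions depend on the extraction settings), the result has shape (n, 228):
#   combined = concatenate_features(feat, ('ssd','rh'))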
def sorted_feature_subset(features, ids_orig, ids_select):
    '''
    selects a (sorted) subset of the original feature arrays

    features: a feature dictionary, containing multiple np.arrays, one for each feature type ('rh', 'ssd', etc.)
    ids_orig: original labels/ids for the features; MUST be same length AND order as the feature arrays
    ids_select: ids in a different order, or a subset of ids, to be selected from the original feature arrays
    returns: dict with a sorted subset of each of the original feature arrays
    '''
new_feat = {}
for e in features.keys():
dataframe = pd.DataFrame(features[e], index = ids_orig)
new_feat[e] = dataframe.ix[ids_select].values
return new_feat
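
# Usage sketch (made-up ids): select and reorder rows to match an external id list
#   sub = sorted_feature_subset(feat, ids, ['pop.00002.wav', 'pop.00001.wav'])
#   # sub['rh'] etc. now contain exactly those rows, in the order of the selection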
# == HELPER FUNCTIONS ==
def to_dataframe(feature_data, ids=None, attribute_labels=None):
'''converts np.array to Pandas dataframe with optional
ids (e.g. audio filenames) to be added as rownames (index)
and/or attribute names to be added as column labels
'''
dataframe = pd.DataFrame(feature_data, index = ids, columns=attribute_labels)
return dataframe
def to_dataframe_for_arff(feature_data, attribute_labels=None, ids=None, classes=None):
'''converts np.array + extra ids and/or classes to Pandas dataframe
ids (e.g. audio filenames) and classes can be provided optionally as list (will be excluded if omitted)
feature attribute labels also optionally as a list (will be generated if omitted)
'''
if attribute_labels is None:
attribute_labels = feature_data.dtype.names
    if attribute_labels is None:
# if nothing is passed and nothing is stored in np array we create the attribute names
fdim = feature_data.shape[1]
attribute_labels = [("feat" + str(x)) for x in range(fdim)]
if feature_data.dtype == object: # convert to float for proper PD output to arff
feature_data = feature_data.astype(float)
dataframe = pd.DataFrame(feature_data, columns=attribute_labels)
    if ids is not None:
        dataframe["ID"] = ids
    if classes is not None:
        dataframe["class"] = pd.Categorical(classes)
return dataframe
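
# Usage sketch (variables assumed to come from read_csv_features / classes_from_filename):
#   df = to_dataframe_for_arff(feat['rh'], ids=ids, classes=classes)
#   save_arff('GTZAN.rh.arff', df)   # the class column is written last, as ARFF expects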
def load_or_analyze_features(input_path, feature_types = ['rp','ssd','rh'], save_features = False, output_file = None):
"""convenient function that can either load or freshly extract features
depending if input_path is...
a) a path: will recursively look for .wav and .mp3 files in path and freshly extract features
b) a .txt file: will take a list of filenames (one per line) and freshly extract features
c) another file: will load features in multiple csv file feature format
TODO:
d) a .npz, .h5 or .hdf5 file: will load the features from that file
e) a .wav or .mp3 file: will freshly extract features from that file
TODO: The audio analysis parameters of this function call are only needed when features are to be freshly extracted.
:param input_path:
:return:
"""
from rp_extract_batch import extract_all_files_generic
# not possible because of omitted file extensions in read_csv_features below
# if not os.path.exists(input_path):
# raise NameError("File or path does not exist: " + input_path)
if save_features and output_file is None:
raise ValueError("output_file must be specified if save_features is set to True!")
# if we got a directory, we do analysis, if we got a file, we load it
if os.path.isdir(input_path) or input_path.lower().endswith(('.txt','.wav','.mp3','.aif')): # FRESH ANALYSIS from input path or .txt file
print "Performing feature extraction from ", input_path
# BATCH RP FEATURE EXTRACTION:
# if output_file is given, will save features, otherwise not
ids, feat = extract_all_files_generic(input_path,output_file,feature_types,audiofile_types=('.wav','.mp3','.aif'))
else:
# LOAD from Feature File
ids, feat = read_csv_features(input_path,feature_types)
# from the ids dict, we take only the first entry and convert numpy array to list
ids = ids.values()[0].tolist()
# TODO: read .npz, .h5 or .hdf5
return ids, feat
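
# Two hypothetical calls (paths are illustrative):
#   ids, feat = load_or_analyze_features('/data/music/GTZAN')          # fresh audio analysis
#   ids, feat = load_or_analyze_features('/data/feat/GTZAN.python')    # load stored features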
if __name__ == '__main__':
# test CSV to ARFF
in_path = '/data/music/GTZAN/vec'
out_path = './feat'
filenamestub = 'GTZAN.python'
feature_types = ['rp','ssd','rh','mvd']
in_filenamestub = in_path + os.sep + filenamestub
out_filenamestub = out_path + os.sep + filenamestub
csv2arff(in_filenamestub,out_filenamestub,feature_types)
# try to load ARFF
feat_type = 'ssd'
arff_file = out_filenamestub + '.' + feat_type + '.arff'
features, classes = load_arff(arff_file)
print "Reading ", arff_file
print "classes:" , classes.shape
print "feature dimensions:", features.shape