-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset_manupulation.py
339 lines (284 loc) · 11.1 KB
/
dataset_manupulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 3 17:36:52 2017
@author: buckler
"""
import os
try:
    from collections.abc import Iterable
except ImportError:  # interpreters that predate collections.abc
    from collections import Iterable

import numpy as np
class Note():
    """
    MIDI note numbers (0-127) grouped by pitch class.

    The ``d`` suffix means sharp (# / diesis), e.g. ``Cd`` is C#.
    Each attribute is an ascending list holding every MIDI number of that
    pitch class, i.e. ``k, k + 12, k + 24, ...`` up to the MIDI maximum 127.
    """
    # Built with range() so no octave can be forgotten by hand
    # (the hand-written Dd list used to stop at 111 and miss 123).
    C = list(range(0, 128, 12))
    Cd = list(range(1, 128, 12))
    D = list(range(2, 128, 12))
    Dd = list(range(3, 128, 12))
    E = list(range(4, 128, 12))
    F = list(range(5, 128, 12))
    Fd = list(range(6, 128, 12))
    G = list(range(7, 128, 12))
    Gd = list(range(8, 128, 12))
    A = list(range(9, 128, 12))
    Ad = list(range(10, 128, 12))
    B = list(range(11, 128, 12))
def _major_scale(root):
    """Return the seven Note lists of the major scale whose tonic is `root` semitones above C."""
    chromatic = (Note.C, Note.Cd, Note.D, Note.Dd, Note.E, Note.F,
                 Note.Fd, Note.G, Note.Gd, Note.A, Note.Ad, Note.B)
    # Major scale degrees expressed as semitone offsets from the tonic.
    return [chromatic[(root + offset) % 12] for offset in (0, 2, 4, 5, 7, 9, 11)]


class MajorKey():
    """
    Major keys, one attribute per tonic (``d`` suffix means sharp).

    Each attribute is a list of seven ``Note`` lists, ordered from the tonic
    upwards through the scale degrees.
    """
    C = _major_scale(0)
    Cd = _major_scale(1)
    D = _major_scale(2)
    Dd = _major_scale(3)
    E = _major_scale(4)
    F = _major_scale(5)
    Fd = _major_scale(6)
    G = _major_scale(7)
    Gd = _major_scale(8)
    A = _major_scale(9)
    Ad = _major_scale(10)
    B = _major_scale(11)
def parsenotes(strNoteList):
    """
    Translate note / major-key names into their MIDI number lists.

    Recognised names are the twelve note names ('A', 'Ad', 'B', 'C', 'Cd',
    'D', 'Dd', 'E', 'F', 'Fd', 'G', 'Gd') and the same names followed by
    'Maj' for major keys (e.g. 'DdMaj').

    :param strNoteList: iterable of names, example ['G', 'A', 'DdMaj']
    :return: list with one entry per recognised name, in input order: the
             ``Note`` list (list of int) for a note name, or the ``MajorKey``
             list (list of seven ``Note`` lists) for a key name.
             Unrecognised names are silently skipped.
    """
    note_names = ('A', 'Ad', 'B', 'C', 'Cd', 'D', 'Dd', 'E', 'F', 'Fd', 'G', 'Gd')
    # Build the lookup from the attribute names themselves so a name can
    # never be wired to the wrong list (as 'Dd' -> Note.D used to be).
    lookup = {name: getattr(Note, name) for name in note_names}
    for name in note_names:
        lookup[name + 'Maj'] = getattr(MajorKey, name)

    # Compare by value (dict lookup), not by identity: `n is 'A'` only works
    # when the interpreter happens to intern both strings.
    return [lookup[n] for n in strNoteList if isinstance(n, str) and n in lookup]
def _flatten_pitches(notes):
    """Yield every scalar found in an arbitrarily nested sequence of pitches."""
    for item in notes:
        if isinstance(item, np.ndarray):
            item = item.tolist()
        if isinstance(item, (list, tuple, set, frozenset)):
            for pitch in _flatten_pitches(item):
                yield pitch
        else:
            yield item


def _filter_accepts(allowed, candidate):
    """Return True when the filter is the 'all' wildcard or contains candidate."""
    if isinstance(allowed, str) and allowed == 'all':
        return True
    return candidate in allowed


def scanJson(jsonFile, instrument_family_strs='all', notes='all', instrument_source_strs='all', velocityMin=0, velocityMax=127, maxNumberOfFile=500):
    """
    Select entries of a metadata dictionary and return their '.npy' file names.

    An entry is selected when all of these hold:
      - its 'instrument_family_str' is in `instrument_family_strs`
      - its 'pitch' is one of `notes`
      - its 'instrument_source_str' is in `instrument_source_strs`
      - its 'velocity' lies in [velocityMin, velocityMax]
    Passing the string 'all' for a filter disables that filter.

    :param jsonFile: dict mapping a name to a dict with the four keys above
    :param instrument_family_strs: container of accepted families, or 'all'
    :param notes: (possibly nested / ragged) sequence of pitches, e.g. the
                  output of parsenotes, or 'all'
    :param instrument_source_strs: container of accepted sources, or 'all'
    :param velocityMin: lowest accepted velocity (inclusive)
    :param velocityMax: highest accepted velocity (inclusive)
    :param maxNumberOfFile: maximum number of names returned
    :return: list of selected names, each with '.npy' appended, in the
             iteration order of `jsonFile`
    """
    # Flatten in pure Python: np.hstack cannot handle the ragged nesting
    # produced by major keys, and a set gives O(1) membership tests.
    if isinstance(notes, str) and notes == 'all':
        pitches = None
    else:
        pitches = set(_flatten_pitches(notes))

    selectedFile = []
    for key, value in jsonFile.items():
        if len(selectedFile) >= maxNumberOfFile:
            break
        if not _filter_accepts(instrument_family_strs, value['instrument_family_str']):
            continue
        if pitches is not None and value['pitch'] not in pitches:
            continue
        if not _filter_accepts(instrument_source_strs, value['instrument_source_str']):
            continue
        if velocityMin <= value['velocity'] <= velocityMax:
            selectedFile.append(key + '.npy')
    return selectedFile
# def __init__(id):
# global logger
# logger = logging.getLogger(str(id))
# # def crateLogger(id, logToFile):
# # global logger
# # logger = u.MyLogger(id, logToFile)
def load_DATASET(datasetPath, fileslist=None, verbose=False):
    """
    Load the dataset (spectra) into a list of [filename, matrix] elements.

    :param datasetPath: root directory of the dataset
    :param fileslist: file names (relative to `datasetPath`) to load; when
                      None every file found by walking `datasetPath`
                      recursively is loaded
    :param verbose: when True, print the name of each file as it is loaded
    :return: list of [filename, array] pairs, the array being whatever
             ``np.load`` returns for that file
    """
    print("Loading dataset from " + datasetPath)
    dataset = []
    if fileslist is None:
        print('load all file')
        for root, _dirnames, filenames in os.walk(datasetPath):
            for filename in filenames:
                if verbose:
                    print('load ' + filename)
                dataset.append([filename, np.load(os.path.join(root, filename))])
    else:
        print('load file from list')
        for filename in fileslist:
            if verbose:
                print('load ' + filename)
            dataset.append([filename, np.load(os.path.join(datasetPath, filename))])
    # Count from the result itself: a per-directory counter both under-reports
    # nested datasets and is undefined when the walk finds no directory.
    print('{0} file loaded'.format(len(dataset)))
    return dataset
def awgn_padding_set(set_to_pad, loc=0.0, scale=1.0):
    """
    Pad every matrix along its second axis with Gaussian noise so that all
    matrices end up as wide as the widest one.

    :param set_to_pad: sequence of [name, 2-D array] elements
    :param loc: mean of the normal distribution the padding is drawn from
    :param scale: standard deviation of that distribution
    :return: new list of [name, padded array] elements, in input order;
             an empty input yields an empty list
    """
    print("awgn_padding_set")
    if len(set_to_pad) == 0:
        return []
    # Target width = largest number of columns. Read it from .shape rather
    # than from the length of row 2, which fails for matrices with < 3 rows.
    dim_pad = max(entry[1].shape[1] for entry in set_to_pad)
    awgn_padded_set = []
    for entry in set_to_pad:
        matrix = entry[1]
        row, col = matrix.shape
        # row x (missing columns) block of noise samples
        awgn_matrix = np.random.normal(loc, scale, size=(row, dim_pad - col))
        awgn_padded_set.append([entry[0], np.hstack((matrix, awgn_matrix))])
    return awgn_padded_set
def reshape_set(set_to_reshape, net_type, channels=1):
    """
    Reshape the data into the layout expected by the first layer of the model.

    - 'convolutional2d': returns an array of shape
      (nsample, channels, row, col) (theano dim ordering); each sample
      matrix is written into channel 0 and must share the shape of the
      first sample. The label list holds the sample names.
    - 'dense': every sample matrix is transposed and all of them are
      stacked vertically into one 2-D array.
      NOTE(review): the label list is returned empty in this case, as in
      the original implementation — confirm whether labels are expected.

    :param set_to_reshape: sequence of [name, 2-D array] elements
    :param net_type: type of the first layer used in the model,
                     'convolutional2d' or 'dense'
    :param channels: number of channels allocated for 'convolutional2d'
    :return: (shaped_matrix, label)
    :raises ValueError: if `net_type` is not one of the supported values
    """
    print("reshape_set")
    label = []
    # Compare strings by value: identity (`is`) only matches interned literals.
    if net_type == 'convolutional2d':
        n_sample = len(set_to_reshape)
        row, col = set_to_reshape[0][1].shape
        # Only channel 0 is filled below; further channels stay uninitialised.
        shaped_matrix = np.empty((n_sample, channels, row, col))
        for i, sample in enumerate(set_to_reshape):
            label.append(sample[0])
            shaped_matrix[i][0] = sample[1]
    elif net_type == 'dense':
        # One vstack over all transposed samples instead of growing the
        # array inside the loop (quadratic copying).
        shaped_matrix = np.vstack([sample[1].T for sample in set_to_reshape])
    else:
        raise ValueError("unsupported net_type: {0!r}".format(net_type))
    return shaped_matrix, label
def split_A3FALL_simple(data, train_tag=None):
    """
    Split the dataset into train and test sets by file name.

    An element goes to the train set when its name (element[0]) contains at
    least one of the tags; everything else goes to the test set.
    (To be extended so that it also allows a validation split.)

    :param data: sequence of [name, matrix] elements
    :param train_tag: list of substrings identifying train files; defaults
                      to ['classic_', 'rock_', 'ha_']
    :return: (data_train, data_test), both preserving the input order
    """
    print("split_A3FALL_simple")
    if train_tag is None:
        train_tag = ['classic_', 'rock_', 'ha_']
    data_train, data_test = [], []
    # Single pass: testing `d not in data_train` would compare the matrices
    # element-wise (ambiguous truth value) whenever two names coincide.
    for d in data:
        if any(word in d[0] for word in train_tag):
            data_train.append(d)
        else:
            data_test.append(d)
    return data_train, data_test
def split_A3FALL_from_lists(data, listpath, namelist):
    """
    Build several subsets of `data`, one per list file.

    :param data: dataset passed on to select_list
    :param listpath: directory containing the text files that enumerate the
                     signals belonging to each wanted subset
    :param namelist: names of the text files (inside `listpath`) to use
    :return: list with one subset per entry of `namelist`, in the same order
    """
    print("split_A3FALL_from_lists")
    return [select_list(os.path.join(listpath, list_name), data)
            for list_name in namelist]
def select_list(filename, dataset):
    """
    Return the elements of `dataset` that are listed in a text file.

    Each line of the file is stripped and '.wav' is replaced by '.npy';
    an element is selected when its name (element[0]) contains at least one
    of the resulting entries. Blank lines are ignored.

    :param filename: path of the text file, one entry per line
    :param dataset: sequence of [name, matrix] elements
    :return: list of the selected elements, in dataset order
    """
    print("select_list")
    with open(filename) as f:
        wanted = [line.strip().replace('.wav', '.npy') for line in f]
    # Drop blank lines: '' is a substring of every name and would otherwise
    # select the whole dataset.
    wanted = [name for name in wanted if name]
    return [s for s in dataset if any(name in s[0] for name in wanted)]
def normalize_data(data, mean=None, std=None):
    """
    Normalize mean and variance of the given dataset.

    When both `mean` and `std` are None they are computed from `data`
    (over the matrix returned by concatenate_matrix); otherwise the given
    values are used as they are.

    :param data: sequence of [name, matrix] elements
    :param mean: value subtracted from every matrix, or None
    :param std: value every matrix is divided by, or None
    :return: (data_std, mean, std) where data_std is a new list of
             [name, (matrix - mean) / std] elements
    :raises ValueError: if only one of `mean` and `std` is provided
    """
    print("normalize_data")
    # Test against None explicitly: a truthiness test would treat a
    # legitimate mean of 0.0 as "not provided".
    if (mean is None) != (std is None):
        raise ValueError("Error!!! Provide both mean and variance")
    if mean is None:  # compute mean and std of the passed data
        data_conc = concatenate_matrix(data)
        mean = np.mean(data_conc)
        std = np.std(data_conc)
    # normalize the data: subtract mean and divide by std
    data_std = [[d[0], ((d[1] - mean) / std)] for d in data]
    return data_std, mean, std
def concatenate_matrix(data):
    """
    Concatenate the spectra into a single matrix along the second axis.

    :param data: non-empty list of [name, 2-D array] elements whose arrays
                 share the same number of rows
    :return: one array made of all the matrices placed side by side, in
             list order; `data` itself is left unmodified
    :raises ValueError: if `data` is empty or the row counts differ
                        (raised by np.concatenate)
    """
    print("concatenate_matrix")
    # np.append returns a new array and never works in place, so its result
    # must be kept; a single concatenate call does that without a loop.
    return np.concatenate([d[1] for d in data], axis=1)
def labelize_data(y):  # TODO use sklearn.preprocessing.LabelEncoder ?
    """
    Turn file names into numeric labels.

    A name containing 'rndy' (a manikin fall) is labelled 1, any other
    name is labelled 0.

    :param y: iterable of file names
    :return: list of 0/1 labels, one per name, in input order
    """
    print("labelize_data")
    return [1 if 'rndy' in filename else 0 for filename in y]
# Create context: mapping of multiple input frames into a single target frame
def create_context(dataset, look_back=1):
    """
    Build (input window, target frame) pairs from a 2-D array of frames.

    For each start index the input is the `look_back` consecutive rows
    beginning there and the target is the row right after them.

    NOTE(review): the number of windows is len(dataset) - look_back - 1, so
    the window whose target would be the last row is never produced —
    confirm whether that is intended.

    :param dataset: 2-D array indexed as [frame, feature]
    :param look_back: number of rows in every input window
    :return: (inputs, targets) as numpy arrays
    """
    window_count = len(dataset) - look_back - 1
    starts = range(window_count)
    inputs = [dataset[start:start + look_back, :] for start in starts]
    targets = [dataset[start + look_back, :] for start in starts]
    return np.array(inputs), np.array(targets)