This is the main notebook. Note that the /all_samples/ folder contains the mp3 files as downloaded from the website; they have to be converted to wav first. Running Convert mp3 to wav.ipynb takes care of this, and the resulting audio folder should then be /all_samples_wav/.
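For reference, a minimal conversion sketch (an illustration only, assuming pydub with ffmpeg is available; the authoritative version is Convert mp3 to wav.ipynb):

```python
# Illustrative sketch only, assuming pydub (plus ffmpeg) is installed.
# The actual conversion is done by "Convert mp3 to wav.ipynb".
import os
from pydub import AudioSegment

SRC, DST = "all_samples", "all_samples_wav"

for dirname, _, filenames in os.walk(SRC):
    for filename in filenames:
        if not filename.lower().endswith(".mp3"):
            continue
        out_dir = dirname.replace(SRC, DST, 1)
        os.makedirs(out_dir, exist_ok=True)
        sound = AudioSegment.from_mp3(os.path.join(dirname, filename))
        sound.export(os.path.join(out_dir, filename[:-4] + ".wav"), format="wav")
```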
The trained model is saved as cnn_philharmonia.h5 and has additionally been exported to TensorFlow.js format (see the ./jsmodel folder).
NOTE:
The rendered final confusion matrix uses a black font, which does not mix well with GitHub's dark theme. For reference, it is best opened in a new tab.
import os
import numpy as np
import librosa as lr
import pandas as pd
import keras
import pickle
import keras.initializers
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from keras.models import Sequential
from keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
Num GPUs Available: 1
# Set directory for source files
ROOT = "all_samples_wav"
folders = os.listdir(ROOT)  # names of the class subfolders
file_data = []              # filenames in each of the 20 folders

TARGET_SECONDS = 2
sr = 22050  # sample rate used by lr.load
target_num_samples = sr * TARGET_SECONDS
n_fft = 1024
hop_length = 512
n_mfcc = 13

for label in tqdm(range(len(folders))):
    sub_dir = os.listdir(f'{ROOT}/{folders[label]}')
    file_data.append(sub_dir)
100%|█████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 589.82it/s]
amounts = []
for i in range(len(file_data)):
    amounts.append(len(file_data[i]))

col1 = np.array(folders)
col2 = np.array(amounts)
merge = {'folder': col1, 'amount': col2}
df = pd.DataFrame(merge, columns=['folder', 'amount'])
print(f'Total number of samples: {sum(amounts)}')
df
Total number of samples: 13681
| | folder | amount |
|---|---|---|
| 0 | banjo | 74 |
| 1 | bassoon | 720 |
| 2 | bass_clarinet | 944 |
| 3 | cello | 889 |
| 4 | clarinet | 846 |
| 5 | contrabassoon | 710 |
| 6 | cor_anglais | 691 |
| 7 | double_bass | 852 |
| 8 | flute | 878 |
| 9 | french_horn | 652 |
| 10 | guitar | 106 |
| 11 | mandolin | 80 |
| 12 | oboe | 596 |
| 13 | percussion | 148 |
| 14 | saxophone | 732 |
| 15 | trombone | 831 |
| 16 | trumpet | 485 |
| 17 | tuba | 972 |
| 18 | viola | 973 |
| 19 | violin | 1502 |
Load audio data
# audio_data = []
#
# for dirname, _, filenames in tqdm(os.walk(ROOT)):
#     for filename in filenames:
#         src = f'{dirname}/{filename}'
#         audio_data.append(lr.load(src))
Save/load the audio_data array
# Save
# with open('audio_data.pickle', 'wb') as f:
#     pickle.dump(audio_data, f)

# Load
with open('audio_data.pickle', 'rb') as f:
    audio_data = pickle.load(f)
Create dataframe overview
fname = []
classID = []
num_samples = []

df1 = pd.DataFrame(np.array(audio_data, dtype=object), columns=['signal', 'samplerate'])

for i in range(df1.shape[0]):
    num_samples.append(len(df1['signal'].iloc[i]))
num_samples = np.array(num_samples)

for dirname, _, filenames in os.walk(ROOT):
    for filename in filenames:
        fname.append(filename)
        classID.append(dirname[16:])  # strip the 'all_samples_wav/' prefix, keeping only the folder name
fname = np.array(fname)
classID = np.array(classID)

df1['num samples'] = num_samples
df1['seconds'] = df1['num samples'] / df1['samplerate']
df1['fname'] = fname
df1['classID'] = classID

# round seconds to one decimal place
df1['seconds'] = df1['seconds'].apply(pd.to_numeric, errors='coerce').round(1)
df1
| | signal | samplerate | num samples | seconds | fname | classID |
|---|---|---|---|---|---|---|
| 0 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 76608 | 3.5 | banjo_A3_very-long_forte_normal.wav | banjo |
| 1 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 77184 | 3.5 | banjo_A3_very-long_piano_normal.wav | banjo |
| 2 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 84096 | 3.8 | banjo_A4_very-long_forte_normal.wav | banjo |
| 3 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 69696 | 3.2 | banjo_A4_very-long_piano_normal.wav | banjo |
| 4 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 85248 | 3.9 | banjo_A5_very-long_forte_normal.wav | banjo |
| ... | ... | ... | ... | ... | ... | ... |
| 13676 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 48960 | 2.2 | violin_Gs6_1_mezzo-forte_natural-harmonic.wav | violin |
| 13677 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 28800 | 1.3 | violin_Gs6_1_piano_arco-sul-tasto.wav | violin |
| 13678 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 34560 | 1.6 | violin_Gs6_1_piano_natural-harmonic.wav | violin |
| 13679 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 51840 | 2.4 | violin_Gs6_phrase_pianissimo_arco-normal.wav | violin |
| 13680 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | 22050 | 31680 | 1.4 | violin_Gs7_1_mezzo-forte_natural-harmonic.wav | violin |

13681 rows × 6 columns
%matplotlib inline
x = df1['seconds']
plt.hist(x, density=False, bins=150, range=(0, 5))  # density=False plots counts rather than a density
plt.ylabel('Number of files')
plt.xlabel('Seconds');
print(f"Shortest sample length: {np.min(df1['num samples'])} samples (~ {np.min(df1['seconds'])} seconds), longest: {np.max(df1['num samples'])} samples (~ {np.max(df1['seconds'])} seconds)")
print(f"Average length in seconds: {np.round(np.mean(df1['seconds']), decimals=1)}")
Shortest sample length: 1728 samples (~ 0.1 seconds), longest: 1711296 samples (~ 77.6 seconds)
Average length in seconds: 1.9
Bring all sound files to the target length of 2 seconds
processed_audio = []

for i in range(len(audio_data)):
    signal = audio_data[i][0]
    # truncate if too long, right-pad with zeros if too short
    if len(signal) > target_num_samples:
        processed_audio.append(signal[:target_num_samples])
    elif len(signal) < target_num_samples:
        num_missing_samples = target_num_samples - len(signal)
        last_dim_padding = (0, num_missing_samples)
        processed_audio.append(np.pad(signal, last_dim_padding, mode='constant'))
    else:
        processed_audio.append(signal)  # already exactly the target length

processed_audio = np.array(processed_audio, dtype=float)
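A quick sanity check (my addition, not in the original notebook) that every clip now has exactly the target length:

```python
# Every processed clip should be exactly 2 s = 44100 samples long.
assert all(len(s) == target_num_samples for s in processed_audio)
```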
Save/load the processed_audio array
# Save
# with open('processed_audio.pickle', 'wb') as f:
#     pickle.dump(processed_audio, f)

# Load
with open('processed_audio.pickle', 'rb') as f:
    processed_audio = pickle.load(f)
processed_audio.shape
(13681, 44100)
Methods and extraction
def mfcc_scale(mfcc):
    # standardize each time frame across the 13 coefficients
    scaler = StandardScaler()
    mfcc = scaler.fit_transform(np.array(mfcc))
    return mfcc

def calc_mfcc(signal):
    return lr.feature.mfcc(y=signal, n_mfcc=n_mfcc, sr=sr)

mfcc_features = []
for i in tqdm(range(len(processed_audio))):
    mfcc_features.append(mfcc_scale(calc_mfcc(processed_audio[i])))
mfcc_features = np.array(mfcc_features)
100%|███████████████████████████████████████████████████████████████████████████| 13681/13681 [02:00<00:00, 113.28it/s]
Save/load mfcc_features
# Save
with open('mfcc_features.pickle', 'wb') as f:
    pickle.dump(mfcc_features, f)

# Load
# with open('mfcc_features.pickle', 'rb') as f:
#     mfcc_features = pickle.load(f)
print(processed_audio.shape)
print(mfcc_features.shape)
(13681, 44100)
(13681, 13, 87)
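The 87 time frames follow directly from the STFT parameters; a quick check, assuming librosa's default centered framing:

```python
# With librosa's default center=True, n_frames = 1 + floor(num_samples / hop_length).
n_frames = 1 + target_num_samples // hop_length
print(n_frames)  # 87, matching mfcc_features.shape == (13681, 13, 87)
```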
Example: a truncated file
test_nr = 310
plt.figure(figsize=(12,2))
plt.plot(processed_audio[test_nr])
plt.title(classID[test_nr])
plt.show()
plt.figure(figsize=(15, 2))
plt.imshow(mfcc_features[test_nr], vmin=0, vmax=1)
plt.title(classID[test_nr])
plt.show()
print(f'{fname[test_nr]}; original: {len(audio_data[test_nr][0])} samples -> {target_num_samples} samples')
bassoon_C4_long_forte_tremolo.wav; original: 101376 samples -> 44100 samples
Example: a padded file
test_nr = 5001
plt.figure(figsize=(12,2))
plt.plot(processed_audio[test_nr])
plt.title(classID[test_nr])
plt.show()
plt.figure(figsize=(15, 2))
plt.imshow(mfcc_features[test_nr], vmin=0, vmax=1)
plt.title(classID[test_nr])
plt.show()
print(f'{fname[test_nr]}; original: {len(audio_data[test_nr][0])} samples -> {target_num_samples} samples')
double-bass_As3_1_mezzo-piano_arco-normal.wav; original: 28224 samples -> 44100 samples
Encoding the labels
'banjo' -> 0,
...
'violin' -> 19
followed by one-hot encoding.
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(classID)
label_encoded = label_encoded[:, np.newaxis]
one_hot_encoder = OneHotEncoder(sparse=False)  # dense output (the argument is named sparse_output in sklearn >= 1.2)
one_hot_encoded = one_hot_encoder.fit_transform(label_encoded)
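For what it's worth, the same one-hot matrix could be produced with Keras directly (an alternative sketch, not what the notebook uses):

```python
# Alternative: one-hot encode the integer labels with Keras utilities.
from keras.utils import to_categorical
one_hot_alt = to_categorical(label_encoded.ravel(), num_classes=20)
```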
Intuitive naming of the data, min-max normalization of the features, and stratified splitting to account for the class imbalance in the dataset.
X = mfcc_features
y = one_hot_encoded
X = (X - X.min()) / (X.max() - X.min())  # min-max normalization to [0, 1]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=8)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  stratify=y_train,
                                                  test_size=0.25,
                                                  shuffle=True,
                                                  random_state=8)
input_shape = (X_train.shape[1], X_train.shape[2], 1)
input_shape
(13, 87, 1)
# X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
# X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)
print("X_train shape: {}".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_train shape: {}".format(y_train.shape))
print("y_test shape: {}".format(y_test.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))
X_train shape: (8208, 13, 87)
X_test shape: (2737, 13, 87)
y_train shape: (8208, 20)
y_test shape: (2737, 20)
X_val shape: (2736, 13, 87)
y_val shape: (2736, 20)
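A note on the commented-out reshape above: the Conv2D stack is declared with a trailing channel axis of size 1. If your TensorFlow version complains about the 3-D inputs, the same effect can be had in one line per split with np.newaxis (a sketch, equivalent to the reshape, not part of the original run):

```python
# Equivalent to the commented reshape: append a channel axis of size 1.
X_train = X_train[..., np.newaxis]  # (8208, 13, 87) -> (8208, 13, 87, 1)
X_val = X_val[..., np.newaxis]
X_test = X_test[..., np.newaxis]
```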
# Training parameters
num_epochs = 100
# num_steps = 1000
activation = 'relu'
last_act = 'softmax'
kernel_init = 'he_normal'
dense_init = keras.initializers.HeNormal()
regularizer = l2(0.01)
padding = 'same'
loss = 'categorical_crossentropy'
optimizer = 'adam'
metrics = 'acc'
filter_dim = (3, 3)
# Early stopping parameters
callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
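The callback above stops once the training loss stagnates for three epochs. A common variant (an alternative sketch, not the configuration used for the run below) monitors the validation loss instead and restores the best weights:

```python
# Variant: stop on stagnating *validation* loss and roll back to the best epoch.
callback_val = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
                                             restore_best_weights=True)
```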
CNN Architecture
model = Sequential()
model.add(Conv2D(16, filter_dim, activation=activation, strides=(1, 1), padding=padding, input_shape=input_shape, kernel_initializer=kernel_init))
# model.add(MaxPool2D((2, 2), padding=padding))
model.add(Conv2D(32, filter_dim, activation=activation, strides=(1, 1), padding=padding, kernel_initializer=kernel_init))
# model.add(MaxPool2D((2, 2), padding=padding))
model.add(Conv2D(64, filter_dim, activation=activation, strides=(1, 1), padding=padding, kernel_initializer=kernel_init))
model.add(MaxPool2D((2, 2), padding=padding))
model.add(Conv2D(128, filter_dim, activation=activation, strides=(1, 1), padding=padding, kernel_initializer=kernel_init))
model.add(MaxPool2D((2, 2), padding=padding))
model.add(Flatten())
model.add(Dense(512, activation=activation, kernel_initializer=dense_init, kernel_regularizer=regularizer))
model.add(Dropout(0.3))
model.add(Dense(1024, activation=activation, kernel_initializer=dense_init, kernel_regularizer=regularizer))
model.add(Dropout(0.2))
model.add(Dense(512, activation=activation, kernel_initializer=dense_init, kernel_regularizer=regularizer))
model.add(Dense(20, activation=last_act))
model.compile(loss=loss, optimizer=optimizer, metrics=[metrics])
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 13, 87, 16) 160
conv2d_1 (Conv2D) (None, 13, 87, 32) 4640
conv2d_2 (Conv2D) (None, 13, 87, 64) 18496
max_pooling2d (MaxPooling2D) (None, 7, 44, 64) 0
conv2d_3 (Conv2D) (None, 7, 44, 128) 73856
max_pooling2d_1 (MaxPooling2D) (None, 4, 22, 128) 0
flatten (Flatten) (None, 11264) 0
dense (Dense) (None, 512) 5767680
dropout (Dropout) (None, 512) 0
dense_1 (Dense) (None, 1024) 525312
dropout_1 (Dropout) (None, 1024) 0
dense_2 (Dense) (None, 512) 524800
dense_3 (Dense) (None, 20) 10260
=================================================================
Total params: 6,925,204
Trainable params: 6,925,204
Non-trainable params: 0
_________________________________________________________________
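As a sanity check on the summary, the parameter counts can be reproduced by hand (Conv2D: (kernel_h · kernel_w · in_channels + 1) · filters; Dense: (inputs + 1) · units):

```python
# Sanity-check a few rows of model.summary():
# Conv2D params = (kernel_h * kernel_w * in_channels + 1) * filters
print((3 * 3 * 1 + 1) * 16)   # 160     -> conv2d
print((3 * 3 * 16 + 1) * 32)  # 4640    -> conv2d_1
# Dense params = (inputs + 1) * units; Flatten yields 4 * 22 * 128 = 11264 inputs
print((11264 + 1) * 512)      # 5767680 -> dense
```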
Train
history = model.fit(X_train, y_train,
                    # steps_per_epoch=num_steps,
                    initial_epoch=0,
                    epochs=num_epochs,
                    validation_data=(X_val, y_val),
                    shuffle=True,
                    callbacks=[callback])
Epoch 1/100
257/257 [==============================] - 9s 15ms/step - loss: 11.8157 - acc: 0.0994 - val_loss: 4.3723 - val_acc: 0.1096
Epoch 2/100
257/257 [==============================] - 3s 13ms/step - loss: 3.3876 - acc: 0.1423 - val_loss: 2.6532 - val_acc: 0.2135
Epoch 3/100
257/257 [==============================] - 4s 14ms/step - loss: 2.3974 - acc: 0.2753 - val_loss: 2.0553 - val_acc: 0.3790
Epoch 4/100
257/257 [==============================] - 4s 14ms/step - loss: 1.9936 - acc: 0.3922 - val_loss: 1.7637 - val_acc: 0.4613
Epoch 5/100
257/257 [==============================] - 4s 14ms/step - loss: 1.7774 - acc: 0.4821 - val_loss: 1.5653 - val_acc: 0.5738
Epoch 6/100
257/257 [==============================] - 4s 16ms/step - loss: 1.5393 - acc: 0.5869 - val_loss: 1.4171 - val_acc: 0.6451
Epoch 7/100
257/257 [==============================] - 4s 14ms/step - loss: 1.3719 - acc: 0.6692 - val_loss: 1.2762 - val_acc: 0.7135
Epoch 8/100
257/257 [==============================] - 4s 15ms/step - loss: 1.2217 - acc: 0.7308 - val_loss: 1.0089 - val_acc: 0.7946
Epoch 9/100
257/257 [==============================] - 4s 15ms/step - loss: 1.1247 - acc: 0.7621 - val_loss: 0.9527 - val_acc: 0.8187
Epoch 10/100
257/257 [==============================] - 5s 19ms/step - loss: 1.0573 - acc: 0.7829 - val_loss: 0.8747 - val_acc: 0.8538
Epoch 11/100
257/257 [==============================] - 5s 21ms/step - loss: 0.9793 - acc: 0.8121 - val_loss: 0.9028 - val_acc: 0.8344
Epoch 12/100
257/257 [==============================] - 5s 20ms/step - loss: 0.9300 - acc: 0.8226 - val_loss: 0.7652 - val_acc: 0.8885
Epoch 13/100
257/257 [==============================] - 5s 19ms/step - loss: 0.9083 - acc: 0.8337 - val_loss: 0.7629 - val_acc: 0.8808
Epoch 14/100
257/257 [==============================] - 5s 21ms/step - loss: 0.8517 - acc: 0.8477 - val_loss: 0.8344 - val_acc: 0.8626
Epoch 15/100
257/257 [==============================] - 5s 20ms/step - loss: 0.8504 - acc: 0.8551 - val_loss: 0.7596 - val_acc: 0.8863
Epoch 16/100
257/257 [==============================] - 6s 22ms/step - loss: 0.8169 - acc: 0.8560 - val_loss: 0.6970 - val_acc: 0.8955
Epoch 17/100
257/257 [==============================] - 6s 24ms/step - loss: 0.8045 - acc: 0.8651 - val_loss: 0.8369 - val_acc: 0.8509
Epoch 18/100
257/257 [==============================] - 6s 23ms/step - loss: 0.7686 - acc: 0.8738 - val_loss: 0.6849 - val_acc: 0.9046
Epoch 19/100
257/257 [==============================] - 5s 21ms/step - loss: 0.7603 - acc: 0.8767 - val_loss: 0.6521 - val_acc: 0.9126
Epoch 20/100
257/257 [==============================] - 5s 20ms/step - loss: 0.7231 - acc: 0.8858 - val_loss: 0.5968 - val_acc: 0.9265
Epoch 21/100
257/257 [==============================] - 5s 20ms/step - loss: 0.7096 - acc: 0.8871 - val_loss: 0.5953 - val_acc: 0.9276
Epoch 22/100
257/257 [==============================] - 5s 20ms/step - loss: 0.6842 - acc: 0.8955 - val_loss: 0.6714 - val_acc: 0.9079
Epoch 23/100
257/257 [==============================] - 6s 21ms/step - loss: 0.6647 - acc: 0.9014 - val_loss: 0.6314 - val_acc: 0.9115
Epoch 24/100
257/257 [==============================] - 5s 21ms/step - loss: 0.6476 - acc: 0.9012 - val_loss: 0.6140 - val_acc: 0.9145
Epoch 25/100
257/257 [==============================] - 6s 22ms/step - loss: 0.6479 - acc: 0.9035 - val_loss: 0.6001 - val_acc: 0.9243
Epoch 26/100
257/257 [==============================] - 5s 21ms/step - loss: 0.6592 - acc: 0.9016 - val_loss: 0.6729 - val_acc: 0.9075
Epoch 27/100
257/257 [==============================] - 6s 23ms/step - loss: 0.6404 - acc: 0.9107 - val_loss: 0.6027 - val_acc: 0.9254
Epoch 28/100
257/257 [==============================] - 6s 24ms/step - loss: 0.6258 - acc: 0.9072 - val_loss: 0.5969 - val_acc: 0.9200
Epoch 29/100
257/257 [==============================] - 6s 22ms/step - loss: 0.6038 - acc: 0.9131 - val_loss: 0.5776 - val_acc: 0.9327
Epoch 30/100
257/257 [==============================] - 7s 26ms/step - loss: 0.6176 - acc: 0.9066 - val_loss: 0.5390 - val_acc: 0.9338
Epoch 31/100
257/257 [==============================] - 6s 24ms/step - loss: 0.5748 - acc: 0.9200 - val_loss: 0.5827 - val_acc: 0.9232
Epoch 32/100
257/257 [==============================] - 7s 27ms/step - loss: 0.5794 - acc: 0.9191 - val_loss: 0.5942 - val_acc: 0.9200
Epoch 33/100
257/257 [==============================] - 6s 25ms/step - loss: 0.5807 - acc: 0.9206 - val_loss: 0.6176 - val_acc: 0.9200
Epoch 34/100
257/257 [==============================] - 6s 24ms/step - loss: 0.5699 - acc: 0.9228 - val_loss: 0.6367 - val_acc: 0.9057
Epoch 35/100
257/257 [==============================] - 7s 26ms/step - loss: 0.5413 - acc: 0.9297 - val_loss: 0.5140 - val_acc: 0.9375
Epoch 36/100
257/257 [==============================] - 6s 25ms/step - loss: 0.5350 - acc: 0.9292 - val_loss: 0.5321 - val_acc: 0.9390
Epoch 37/100
257/257 [==============================] - 6s 22ms/step - loss: 0.5454 - acc: 0.9243 - val_loss: 0.5655 - val_acc: 0.9287
Epoch 38/100
257/257 [==============================] - 5s 20ms/step - loss: 0.5379 - acc: 0.9308 - val_loss: 0.5317 - val_acc: 0.9291
Epoch 39/100
257/257 [==============================] - 6s 23ms/step - loss: 0.5359 - acc: 0.9291 - val_loss: 0.5547 - val_acc: 0.9327
Save/load the model
model.save("cnn_philharmonia.h5")
# model = keras.models.load_model("cnn_philharmonia.h5")
Export model.json and the *.bin weight files
!tensorflowjs_converter --input_format keras "cnn_philharmonia.h5" ./jsmodel
plt.figure(figsize=(8,8))
plt.title('Loss Value')
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['loss', 'val_loss'])
print('loss:', history.history['loss'][-1])
print('val_loss:', history.history['val_loss'][-1])
plt.show()
plt.figure(figsize=(8,8))
plt.title('Accuracy')
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.legend(['acc', 'val_acc'])
print('acc:', history.history['acc'][-1])
print('val_acc:', history.history['val_acc'][-1])
plt.show()
loss: 0.5358906388282776
val_loss: 0.5546738505363464
acc: 0.9290935397148132
val_acc: 0.932748556137085
predictions = model.predict(X_test)
86/86 [==============================] - 1s 6ms/step
predictions = np.argmax(predictions, axis=1)        # predicted class indices
y_test = one_hot_encoder.inverse_transform(y_test)  # one-hot back to integer labels
cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(8,8))
sns.heatmap(cm, annot=True, xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_, fmt='d', cmap=plt.cm.Blues, cbar=False)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
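Because the class supports differ widely (15 banjo vs. 301 violin samples in the test set), a row-normalized view can be easier to read. A sketch, not part of the original notebook:

```python
# Row-normalize so each true class sums to 1 (per-class recall on the diagonal).
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
plt.figure(figsize=(8, 8))
sns.heatmap(cm_norm, annot=True, fmt='.2f',
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_,
            cmap=plt.cm.Blues, cbar=False)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
```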
# Classification report
# target_names must follow the encoder's sorted class order, which is not
# guaranteed to match the os.listdir() folder order used above
print("\nClassification report:\n", classification_report(y_test, predictions, target_names=label_encoder.classes_))
Classification report:
precision recall f1-score support
banjo 0.83 1.00 0.91 15
bass_clarinet 0.97 0.97 0.97 189
bassoon 0.97 0.98 0.97 144
cello 0.93 0.96 0.94 178
clarinet 0.95 0.95 0.95 169
contrabassoon 0.98 0.98 0.98 142
cor_anglais 0.95 0.96 0.95 138
double_bass 0.91 0.96 0.93 170
flute 0.92 0.99 0.96 176
french_horn 1.00 0.93 0.96 130
guitar 1.00 0.71 0.83 21
mandolin 0.70 0.88 0.78 16
oboe 0.93 0.97 0.95 119
percussion 0.78 0.60 0.68 30
saxophone 0.97 0.90 0.94 146
trombone 0.98 0.98 0.98 166
trumpet 0.96 0.90 0.93 97
tuba 1.00 0.97 0.99 195
viola 0.87 0.93 0.90 195
violin 0.97 0.93 0.95 301
accuracy 0.95 2737
macro avg 0.93 0.92 0.92 2737
weighted avg 0.95 0.95 0.95 2737