FAT2019 - Temporal|Timbral CNN
In [ ]:
import os
import zipfile
import multiprocessing
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
import librosa as lr
import matplotlib.pyplot as plt
from IPython.display import Audio
from librosa.display import specshow
# Kaggle dataset root for the FAT2019 competition files.
root = '../input/freesound-audio-tagging-2019/'
# TF 1.x kernel running with v2 behavior (eager execution, tf.data, etc.).
tf.enable_v2_behavior()
assert tf.test.is_gpu_available()  # fail fast if the kernel has no GPU
tf.__version__
Exploratory data analysis
In [ ]:
# Files known to be corrupted or unusable, collected from the competition forum:
# https://www.kaggle.com/c/freesound-audio-tagging-2019/discussion/93480
# https://www.kaggle.com/c/freesound-audio-tagging-2019/discussion/89108
# Keyed by subset name; dropped from the metadata in load_metadata().
invalid = {
    'train_curated': [
        'f76181c4.wav',
        '77b925c2.wav',
        '6a1f682a.wav',
        'c7db12aa.wav',
        '7752cc8a.wav',
        '1d44b0bd.wav',
    ],
    'train_noisy': [
        'ff97b092.wav',
        'e140e930.wav',
        '02f274b2.wav',
        '08b34136.wav',
        '1af3bd88.wav',
        '1fd4f275.wav',
        '2f503375.wav',
        '3496256e.wav',
        '551a4b3b.wav',
        '5a5761c9.wav',
        '6d062e59.wav',
        '769d131d.wav',
        '8c712129.wav',
        '988cf8f2.wav',
        '9f4fa2df.wav',
        'b1d2590c.wav',
        'be273a3c.wav',
        'd527dcf0.wav',
        'e4faa2e1.wav',
        'fa659a71.wav',
        'fba392d8.wav',
    ]
}
def load_metadata(subset):
    """Load multi-label metadata for `subset` ('train_curated' or 'train_noisy').

    Returns a DataFrame indexed by wav filename with one 0/1 indicator
    column per tag (labels in the CSV are comma-separated). Files listed
    in the module-level `invalid` dict are dropped first.
    """
    csv_path = os.path.join(root, subset + '.csv')
    df = pd.read_csv(csv_path, index_col='fname')
    # errors='ignore' keeps this robust if an invalid fname is absent from
    # the CSV (e.g. after a dataset update); the original raised KeyError.
    df = df.drop(index=invalid.get(subset, []), errors='ignore')
    metadata = df.labels.str.get_dummies(',')
    return metadata
# Global label set: tag names are the dummy columns of the curated subset.
metadata = load_metadata('train_curated')
tags = metadata.columns
In [ ]:
def show_tag_counts(metadata: pd.DataFrame):
    """Bar-plot how many examples carry 1, 2, 3, ... tags (log-scaled y-axis)."""
    # sort_index() so the x-axis runs 1, 2, 3, ... — value_counts() alone
    # returns categories ordered by frequency, which made the chart unsorted.
    counts = metadata.sum(axis=1).value_counts().sort_index()
    ax = counts.plot.bar(logy=True)
    ax.set(
        xlabel='no. tags',
        ylabel='no. examples',
        title='Number of examples with several tags',
    )
    plt.show()
show_tag_counts(load_metadata('train_curated'))
show_tag_counts(load_metadata('train_noisy'))
In [ ]:
def show_total_tags(metadata: pd.DataFrame):
    """Horizontal bar chart of how often each tag occurs across all examples."""
    totals = metadata.sum()
    axes = totals.plot.barh(figsize=(5, 30))
    axes.set(title='Number of tags over all examples')
    plt.show()
show_total_tags(load_metadata('train_curated'))
show_total_tags(load_metadata('train_noisy'))
In [ ]:
def correlations(metadata: pd.DataFrame):
    """Return pairwise label correlations as a Series keyed by label pairs.

    Only the strict upper triangle of the correlation matrix is kept, so
    each unordered pair of labels appears exactly once.
    """
    corr = metadata.corr()
    # The `np.bool` alias was removed in NumPy 1.24; the builtin `bool`
    # dtype is equivalent here.
    mask = np.triu(np.ones(corr.shape, dtype=bool), 1)
    triangle = corr.where(mask)
    pairwise = triangle.unstack().dropna()
    return pairwise
# Display the 10 most strongly correlated tag pairs for each subset
# (labels that tend to co-occur on the same clip).
for subset in ['train_curated', 'train_noisy']:
    metadata = load_metadata(subset)
    df = pd.DataFrame(correlations(metadata).nlargest(10), columns=['Correlation'])
    display(df)
In [ ]:
def show_audio_duration_distribution(wavfiles):
    """Histogram (log-y, 30 bins) of clip durations in seconds for the given wav paths."""
    seconds = pd.Series(lr.get_duration(filename=path) for path in tqdm(wavfiles))
    axes = seconds.plot.hist(bins=30, logy=True)
    axes.set_title('Audio duration')
    plt.show()
# Duration histograms per subset; builds paths like '<root><subset>/<fname>'.
for subset in ['train_curated', 'train_noisy']:
    show_audio_duration_distribution(root + subset + '/' + load_metadata(subset).index)
Preprocessing
In [ ]:
# STFT hop length in samples; also the time step of one spectrogram frame.
hop = 1024
# Target sample rate in Hz (the dataset wavs are 44.1 kHz).
samplerate = 44100
# Number of mel bands per spectrogram.
bands = 256
print(f"Time resolution is {hop/samplerate:0.2f} seconds.")
# Frames per training example; the random-crop length is hop*frames samples.
frames = 256
print(f"Each spectrogram contains {frames*hop/samplerate:0.2f} seconds.")
def load(filename):
    """Read a 44.1 kHz wav file into a (channels, samples) float32 tensor.

    The waveform is peak-normalized, optionally resampled to the global
    `samplerate`, and tiled so it is always at least hop*frames samples long
    (so a later random crop of that length fits).
    """
    blob = tf.io.read_file(filename)
    waveform, sr = tf.audio.decode_wav(blob)
    tf.assert_equal(sr, 44100)
    waveform = tf.transpose(waveform)  # (samples, channels) -> (channels, samples)
    # Peak normalize by the largest *magnitude*. The original divided by
    # tf.reduce_max(waveform), which mis-scales (or hugely amplifies) a clip
    # whose loudest sample is negative.
    # NOTE(review): an all-zero clip would still divide by zero — presumably
    # none exist after the `invalid` filtering; confirm.
    waveform /= tf.reduce_max(tf.abs(waveform))
    # Downsample if needed.
    if samplerate != 44100:
        waveform = tf.py_function(
            # Keyword arguments: librosa >= 0.10 made orig_sr/target_sr keyword-only.
            lambda x: lr.resample(x.numpy(), orig_sr=44100, target_sr=samplerate, res_type='kaiser_fast'),
            [waveform],
            [tf.float32],
        )[0]
    # Loop too-short audio files by tiling until at least `samples` long.
    samples = int(hop * frames)
    n = 1 + samples // tf.shape(waveform)[1]
    waveform = tf.tile(waveform, [1, n])
    return waveform
# Sanity check: load one curated clip and render an audio player.
# The original wrapped this in `for training in [True, False]:` but never
# used the loop variable, so it just displayed the identical clip twice.
waveform = load('../input/freesound-audio-tagging-2019/train_curated/0006ae4e.wav')
display(Audio(waveform, rate=samplerate))
In [ ]:
def sample(row):
    """Randomly crop the row's waveform to exactly hop*frames samples (mutates row)."""
    crop_length = int(hop * frames)
    row['waveform'] = tf.image.random_crop(row['waveform'], [1, crop_length])
    return row
def collate(row):
    """Split a flat row dict into the (features, labels) pair Keras expects."""
    tag_values = [row[tag] for tag in tags]
    labels = {'tags': tf.cast(tf.stack(tag_values), tf.float32)}
    features = {'waveform': row['waveform']}
    return features, labels
def mix(features, labels):
    # Mixup-style augmentation over a windowed dataset: `features`/`labels`
    # arrive as dicts of *datasets* (produced upstream by Dataset.window(2)).
    # Average each pair of waveforms into one clip...
    features['waveform'] = features['waveform'].batch(2).map(lambda x: tf.reduce_mean(x, axis=0))
    # ...and take the union of their tag sets (a tag present in either clip stays on).
    labels['tags'] = labels['tags'].batch(2).map(lambda x: tf.cast(tf.reduce_any(x > 0, axis=0), tf.float32))
    # Zip the per-key datasets back into a single (features, labels) dataset.
    return tf.data.Dataset.zip((features, labels))
def apply_by_key(row, f, in_key, out_key):
    """Store f(row[in_key]) under row[out_key]; returns the mutated row."""
    row[out_key] = f(row[in_key])
    return row
def create_dataset(metadata: pd.DataFrame, batch_size=32, training=True) -> tf.data.Dataset:
    """Build a tf.data pipeline of (features, labels) minibatches.

    `metadata.index` holds wav paths relative to `root` (callers prefix the
    subset directory); remaining columns are the 0/1 tag indicators. When
    `training`, examples are shuffled, repeated forever, and interleaved
    with pair-mixed (mixup-style) examples.
    """
    metadata = metadata.copy()
    metadata['filename'] = root + '/' + metadata.index
    d = tf.data.Dataset.from_tensor_slices(metadata.to_dict('list'))
    # Decode/normalize/tile each wav, then crop and repackage for Keras.
    d = d.map(lambda x: apply_by_key(x, load, 'filename', 'waveform'), tf.data.experimental.AUTOTUNE)
    d = d.map(sample, tf.data.experimental.AUTOTUNE)
    d = d.map(collate, tf.data.experimental.AUTOTUNE)
    if training:
        d = d.shuffle(512)
        d = d.repeat()
        # Interleave plain examples with pair-mixed ones at a 2:1 ratio
        # (sample_from_datasets normalizes the weights).
        d = tf.data.experimental.sample_from_datasets(
            (d, d.window(2).flat_map(mix)),
            weights=[1.0, 0.5],
        )
    # drop_remainder so every batch has a static batch dimension.
    d = d.batch(batch_size, drop_remainder=True)
    d = d.prefetch(tf.data.experimental.AUTOTUNE)
    return d
# Smoke-test the input pipeline: iterate a few minibatches, show the decoded
# tag names of the first example in each, and render its audio.
metadata = load_metadata('train_curated')
metadata.index = 'train_curated/' + metadata.index
dataset = create_dataset(metadata)
for i, minibatch in enumerate(tqdm(dataset)):
    features, labels = minibatch
    # Average channels to mono for playback.
    mono = features['waveform'].numpy()[0].mean(axis=0)
    # Indices of active tags -> human-readable names.
    title = [tags[x] for x in np.where(labels['tags'][0].numpy())[0]]
    display(title)
    display(Audio(mono, rate=44100))
    if i > 10:
        break
features['waveform'].shape
Model
In [ ]:
from tensorflow.keras.layers import *
from l_lrap_metric_for_tf_keras import LWLRAP
class SpectralTransform(Layer):
    """Keras layer: raw waveform -> multi-resolution log-mel spectrograms.

    Computes STFTs at three window sizes (all with the shared global `hop`,
    so the frame counts match), projects each onto `bands` mel bins, and
    stacks the resolutions along the channel axis.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Analysis windows of 2/4/8 hops.
        self.windows = [hop*2, hop*4, hop*8]
        # One mel filterbank per window size (the FFT bin count depends on it).
        self.filterbank = {
            window: tf.signal.linear_to_mel_weight_matrix(
                num_mel_bins=bands,
                num_spectrogram_bins=window//2 + 1,
                sample_rate=samplerate,
                lower_edge_hertz=0.0,
                upper_edge_hertz=samplerate/2,
            ) for window in self.windows
        }
    def call(self, x):
        # x is (batch, channels, samples) per create_model's Input; stft runs
        # over the last axis, yielding (batch, channels, frames, fft_bins).
        waveform = x
        mels = []
        for window in self.windows:
            z = tf.signal.stft(waveform, window, hop, pad_end=True)
            spectrogram = tf.abs(z)  # magnitude, not power
            filterbank = self.filterbank[window]
            # Project fft_bins (axis 3) onto the mel bands.
            mel = tf.tensordot(spectrogram, filterbank, [3, 0])
            mels.append(mel)
        # Stack the three resolutions along the channel axis (axis 1).
        mel = tf.concat(mels, axis=1)
        # log1p of magnitude ("logpower" is a slight misnomer), rescaled to ~[-1, 1].
        # NOTE(review): the max is taken over the whole batch, so one loud clip
        # changes the scaling of its batchmates — confirm this is intended.
        logpower = tf.math.log1p(mel)
        logpower /= tf.reduce_max(logpower)
        logpower = logpower * 2 - 1
        # -> (batch, mel_bands, frames, 3*channels) for the Conv2D stack downstream.
        logpower = tf.transpose(logpower, (0, 3, 2, 1))
        return logpower
def create_model(name: str) -> tf.keras.Model:
    """Build and compile the temporal|timbral CNN.

    Input: 'waveform' of shape (channels=1, hop*frames samples).
    Output: 'tags' — one sigmoid activation per tag (multi-label).
    Compiled with binary cross-entropy and the LWLRAP metric.
    """
    samples = int(hop*frames)
    channels = 1
    x = tf.keras.Input(shape=(channels, samples), name='waveform')
    inputs = [x]
    x = SpectralTransform()(x)
    x = BatchNormalization(axis=1)(x)
    # Parallel temporal (1x9) and timbral (9x1) strided convolutions, concatenated.
    h = Conv2D(32, [1, 9], 2, use_bias=False, padding='same')(x)
    p = Conv2D(32, [9, 1], 2, use_bias=False, padding='same')(x)
    x = Concatenate()([h, p])
    x = BatchNormalization()(x)
    x = Activation('selu')(x)
    w = x.shape[-1]
    downsamples = 5
    # Typo fix: "Halfing" -> "Halving".
    print(f"Halving input resolution {downsamples} times.")
    # Residual blocks: 1x1 projection shortcut + 3x3 conv, then 2x2 average pool.
    for i in range(downsamples):
        f = w * 2**i  # double the filter count at each scale
        s = Conv2D(f, 1)(x)
        x = Conv2D(f, 3, use_bias=False, padding='same')(x)
        x = BatchNormalization()(x)
        x = Add()([x, s])
        x = Activation('selu')(x)
        x = AvgPool2D()(x)
    x = BatchNormalization()(x)
    x = GlobalAveragePooling2D()(x)
    x = Dense(len(tags), activation='sigmoid', name='tags')(x)
    outputs = [x]
    model = tf.keras.Model(inputs, outputs, name=name)
    # Gradient clipping by global norm to stabilize early training.
    optimizer = tf.keras.optimizers.Adam(clipnorm=100.0)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=optimizer,
        metrics=[LWLRAP(len(tags))]
    )
    return model
# Sanity check: a fresh model can be built, saved, and reloaded.
# SpectralTransform must be registered as a custom object for deserialization.
tf.keras.backend.clear_session()
model = create_model('test')
model.save(model.name + '.h5')
model = tf.keras.models.load_model(model.name + '.h5', compile=False, custom_objects={'SpectralTransform': SpectralTransform})
model.summary()
Train model(s)
In [ ]:
from IPython.display import clear_output
from sklearn.model_selection import KFold
def train(name: str, training: pd.DataFrame, validation: pd.DataFrame) -> float:
    """Train one model with patience-based learning-rate decay.

    Checkpoints the best weights (by validation loss) to '<name>.h5' and
    returns the best validation loss once the learning rate has been
    halved below 1e-5.
    """
    tf.keras.backend.clear_session()
    batch_size = 32
    datasets = {
        'training': create_dataset(training, batch_size=batch_size, training=True),
        'validation': create_dataset(validation, batch_size=batch_size*2, training=False),
    }
    # Infinite iterators; epoch boundaries are imposed via `steps` below.
    iterators = {k: iter(v.repeat()) for k, v in datasets.items()}
    steps = {k: tf.data.experimental.cardinality(v).numpy() for k, v in datasets.items()}
    # The training dataset already repeats forever (infinite cardinality),
    # so derive its steps-per-epoch from the metadata size instead.
    steps['training'] = len(training)//batch_size
    model = create_model(name)
    patience = 1 # TODO 10
    cost = {'training': [], 'validation': []}
    while True:
        # One pass over each subset per outer iteration ("epoch").
        for subset in datasets:
            f = model.train_on_batch if subset == 'training' else model.test_on_batch
            losses = []
            for i in tqdm(range(steps[subset]), desc=subset.capitalize()):
                minibatch = next(iterators[subset])
                # f(features, labels) returns (loss, lwlrap) per compile().
                loss, lwlrap = f(*minibatch)
                losses.append(loss)
            cost[subset].append(np.mean(losses))
        # Live-plot the loss curves; 0.0243 is a fixed reference line
        # (presumably a target/baseline loss — TODO confirm its origin).
        clear_output()
        for subset in datasets:
            plt.plot(cost[subset], label=subset)
        plt.hlines(0.0243, 0, len(cost[subset]))
        plt.legend()
        plt.show()
        learning_rate = tf.keras.backend.get_value(model.optimizer.lr)
        # No improvement this epoch: burn patience; when exhausted, halve the LR
        # and reset patience to 3.
        if cost['validation'][-1] > min(cost['validation']):
            patience -= 1
            if not patience:
                patience = 3
                learning_rate *= 0.5
                display(f"Halfing learning rate to {learning_rate}.")
                tf.keras.backend.set_value(model.optimizer.lr, learning_rate)
        # New best validation loss: checkpoint the weights.
        if cost['validation'][-1] == min(cost['validation']):
            display(f"Model improved to {cost['validation'][-1]}")
            model.save_weights(model.name + '.h5')
        if learning_rate < 1e-5:
            return min(cost['validation'])
In [ ]:
# Cross-validated training: curated data split into folds, each fold's
# training set augmented with single-label noisy examples (50 per tag).
metadata = load_metadata('train_curated')
metadata.index = 'train_curated/' + metadata.index
df = load_metadata('train_noisy')
df.index = 'train_noisy/' + df.index
# Noisy clips carrying exactly one tag — the least ambiguous ones.
one_hots = df[df.sum(axis=1) == 1]
losses = []
# TODO 5-fold
for i, (t, v) in enumerate(KFold(2, shuffle=True).split(metadata)):
    training = metadata.iloc[t]
    noisy = pd.concat(one_hots[one_hots[tag] == 1].sample(50, replace=True) for tag in tags)
    # pd.concat instead of DataFrame.append (deprecated in pandas 1.4,
    # removed in 2.0); sample(frac=1.0) shuffles the combined frame.
    training = pd.concat([training, noisy]).sample(frac=1.0)
    validation = metadata.iloc[v]
    name = f'model-{i}'
    loss = train(name, training, validation)
    losses.append(loss)
# header=False so the later pd.read_csv(..., header=None) parses cleanly
# across pandas versions (newer pandas writes a header row by default).
pd.Series(losses).to_csv('losses.csv', header=False)
Make leaderboard submission
In [ ]:
# Rebuild the architecture once, then load each fold's best checkpoint.
base = create_model('base')
models = []
for path in tf.io.gfile.glob('*-*.h5'):  # matches the 'model-<i>.h5' weight files
    # clone_model gives a fresh copy of the architecture; weights come from disk.
    model = tf.keras.models.clone_model(base)
    model.load_weights(path)
    models.append(model)
In [ ]:
# Ensemble inference: per test clip, take the max activation over several
# random crops for each model, then a loss-weighted mean over the models.
df = pd.read_csv(root + 'sample_submission.csv', index_col='fname')
losses = pd.read_csv('losses.csv', header=None, index_col=0).values
# Weight each fold inversely to its validation loss (best fold gets weight 1).
weights = 1/(losses / losses.min())
crops = 32
for x in tqdm(df.index):
    # NOTE(review): this resolves to '<root>/<fname>', but the FAT2019 test
    # wavs normally live under '<root>/test/' — confirm the path is correct.
    full_waveform = load(root + x)
    activations = []
    for model in models:
        # A batch of `crops` random windows from the (tiled) waveform.
        xs = tf.stack([tf.image.random_crop(full_waveform, [1, int(hop * frames)]) for _ in range(crops)])
        ys = model.predict_on_batch(xs)
        # Max-pool over crops: a tag counts if any window activates it.
        y = tf.reduce_max(ys, axis=0)
        activations.append(y)
    df.loc[x] = tf.reduce_mean(weights * activations, axis=0).numpy()
df.to_csv('submission.csv')
Comments
Comments powered by Disqus