kernelaf9db71ede
In [1]:
import pandas as pd
import librosa as lr
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm
tf.enable_v2_behavior()
In [2]:
from tensorflow.contrib.framework.python.ops.audio_ops import decode_wav
hop = 1024
samplerate = 44100
bands = 256
print(f"Time resolution is {hop/samplerate:0.2f} seconds.")
frames = 256
print(f"Each spectrogram contains {frames*hop/samplerate:0.2f} seconds.")
def load(filename, training):
    blob = tf.io.read_file(filename)
    waveform, sr = decode_wav(blob)
    tf.assert_equal(sr, 44100)
    # decode_wav yields (samples, channels); transpose to (channels, samples).
    waveform = tf.transpose(waveform)
    # Peak normalize by the absolute peak.
    waveform /= tf.reduce_max(tf.abs(waveform))
    # Trim leading/trailing silence; librosa returns (audio, interval), keep the audio.
    if training:
        waveform = tf.py_function(
            lambda x: lr.effects.trim(x.numpy())[0],
            [waveform],
            [tf.float32],
        )[0]
    # Downsample if needed.
    if samplerate != 44100:
        waveform = tf.py_function(
            lambda x: lr.resample(x.numpy(), 44100, samplerate, res_type='kaiser_fast'),
            [waveform],
            [tf.float32],
        )[0]
    # Loop clips that are too short to fill one spectrogram.
    samples = int(hop * frames)
    n = 1 + samples // tf.shape(waveform)[1]
    waveform = tf.tile(waveform, [1, n])
    return waveform
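A quick sanity check of the looping step (a sketch added here; the 440 Hz sine is a synthetic stand-in for a short clip, not competition data):
short = tf.sin(tf.linspace(0.0, 2 * 3.141592653589793 * 440, samplerate))[tf.newaxis, :]  # 1 second, mono
n = 1 + (hop * frames) // tf.shape(short)[1]
looped = tf.tile(short, [1, n])
print(looped.shape)  # (1, 264600), i.e. at least hop * frames = 262144 samples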
In [3]:
from tensorflow.keras.layers import *

class SpectralTransform(Layer):
    """Multi-resolution log-mel spectrograms, stacked as image channels."""
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.windows = [hop*2, hop*4, hop*8]
        self.filterbank = {
            window: tf.signal.linear_to_mel_weight_matrix(
                num_mel_bins=bands,
                num_spectrogram_bins=window//2 + 1,
                sample_rate=samplerate,
                lower_edge_hertz=0.0,
                upper_edge_hertz=samplerate/2,
            ) for window in self.windows
        }

    def call(self, x):
        waveform = x
        mels = []
        for window in self.windows:
            z = tf.signal.stft(waveform, window, hop, pad_end=True)
            spectrogram = tf.abs(z)
            filterbank = self.filterbank[window]
            mel = tf.tensordot(spectrogram, filterbank, [3, 0])
            mels.append(mel)
        # One channel per window size.
        mel = tf.concat(mels, axis=1)
        logpower = tf.math.log1p(mel)
        # Scale to [-1, 1].
        logpower /= tf.reduce_max(logpower)
        logpower = logpower * 2 - 1
        # (batch, windows, frames, bands) -> (batch, bands, frames, windows)
        logpower = tf.transpose(logpower, (0, 3, 2, 1))
        return logpower

class Pad2D(Layer):
    """Symmetric 2-D padding with a configurable mode (default: reflection)."""
    def __init__(self, padding=(1, 1), mode='REFLECT', **kwargs):
        super().__init__(**kwargs)
        self.padding = padding
        self.mode = mode

    def call(self, x):
        h, w = self.padding
        paddings = [[0, 0], [h, h], [w, w], [0, 0]]
        return tf.pad(x, paddings, self.mode)
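As a quick shape check (an added sketch, not part of the original notebook), random noise through the transform should come out as a bands x frames image with one channel per window size:
dummy = tf.random.normal([2, 1, hop * frames])  # (batch, channels, samples)
print(SpectralTransform()(dummy).shape)  # expected (2, 256, 256, 3): (batch, bands, frames, windows)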
In [4]:
models = []
for path in tf.io.gfile.glob('../input/freesound-audio-tagging-2019-model/*.h5'):
    model = tf.keras.models.load_model(
        path,
        compile=False,
        custom_objects={
            'SpectralTransform': SpectralTransform,
            'Pad2D': Pad2D,
        },
    )
    models.append(model)
In [ ]:
df = pd.read_csv('../input/freesound-audio-tagging-2019/sample_submission.csv', index_col='fname')
losses = pd.read_csv('../input/freesound-audio-tagging-2019-model/losses.csv', header=None, index_col=0).values
# Weight each model by best-loss / own-loss; cast so it multiplies cleanly with float32 activations.
weights = (1 / (losses / losses.min())).astype('float32')
for x in tqdm(df.index):
    full_waveform = load('../input/freesound-audio-tagging-2019/test/' + x, training=False)
    activations = []
    for model in models:
        # Test-time augmentation: 32 random crops, max-pooled per class.
        xs = tf.stack([tf.image.random_crop(full_waveform, [1, int(hop * frames)]) for _ in range(32)])
        ys = model.predict_on_batch(xs)
        y = tf.reduce_max(ys, axis=0)
        activations.append(y)
    df.loc[x] = tf.reduce_mean(weights * tf.stack(activations), axis=0).numpy()
df.to_csv('submission.csv')
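For intuition on the loss-based weighting (illustrative numbers only, not from the competition): each model's weight is the best validation loss divided by its own, so the strongest model counts fully and weaker ones proportionally less.
import numpy as np
example_losses = np.array([0.50, 0.60, 0.75])  # hypothetical validation losses
print(1 / (example_losses / example_losses.min()))  # approximately [1.0, 0.833, 0.667]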