In [1]:
import pandas as pd
import librosa as lr
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm

tf.enable_v2_behavior()
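A quick check (a sketch, not part of the original kernel): tf.enable_v2_behavior() turns on eager execution, which the eager-style calls below (e.g. .numpy() on tensors) rely on.

print(tf.__version__)
print(tf.executing_eagerly())  # expected to be True after tf.enable_v2_behavior()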
In [2]:
from tensorflow.contrib.framework.python.ops.audio_ops import decode_wav

hop = 1024
samplerate = 44100
bands = 256
print(f"Time resolution is {hop/samplerate:0.2f} seconds.")

frames = 256
print(f"Each spectrogram contains {frames*hop/samplerate:0.2f} seconds.")


def load(filename, training):
    blob = tf.io.read_file(filename)
    waveform, sr = decode_wav(blob)
    tf.assert_equal(sr, 44100)
    waveform = tf.transpose(waveform)
    
    # Peak-normalize to unit amplitude.
    waveform /= tf.reduce_max(tf.abs(waveform))

    # Trim leading/trailing silence; librosa returns (trimmed_audio, interval),
    # so keep only the audio.
    if training:
        waveform = tf.py_function(
            lambda x: lr.effects.trim(x.numpy())[0],
            [waveform],
            [tf.float32],
        )[0]

    # Downsample if needed (a no-op here, since samplerate == 44100).
    if samplerate != 44100:
        waveform = tf.py_function(
            lambda x: lr.resample(x.numpy(), 44100, samplerate, res_type='kaiser_fast'),
            [waveform],
            [tf.float32],
        )[0]

    # Loop (tile) the waveform so it is at least hop * frames samples long.
    samples = int(hop * frames)
    n = 1 + samples // tf.shape(waveform)[1]
    waveform = tf.tile(waveform, [1, n])
    
    return waveform
Time resolution is 0.02 seconds.
Each spectrogram contains 5.94 seconds.
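A minimal usage sketch (not from the original kernel; 'some_clip.wav' is a hypothetical placeholder for any 44.1 kHz WAV file): load() returns a channels-first waveform tensor that is always long enough to crop hop * frames samples from.

waveform = load('some_clip.wav', training=False)
print(waveform.shape)  # (channels, N) with N >= hop * frames = 262144 samples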
In [3]:
from tensorflow.keras.layers import Layer


class SpectralTransform(Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Three STFT window sizes give a multi-resolution spectrogram.
        self.windows = [hop*2, hop*4, hop*8]

        # One mel filterbank per window size.
        self.filterbank = {
            window: tf.signal.linear_to_mel_weight_matrix(
                num_mel_bins=bands,
                num_spectrogram_bins=window//2 + 1,
                sample_rate=samplerate,
                lower_edge_hertz=0.0,
                upper_edge_hertz=samplerate/2,
            ) for window in self.windows
        }

    def call(self, x):
        waveform = x
        mels = []
        # Mel spectrogram at each STFT window size, all with the same hop.
        for window in self.windows:
            z = tf.signal.stft(waveform, window, hop, pad_end=True)
            spectrogram = tf.abs(z)
            filterbank = self.filterbank[window]
            mel = tf.tensordot(spectrogram, filterbank, [3, 0])
            mels.append(mel)
        # Stack the three resolutions along the channel axis.
        mel = tf.concat(mels, axis=1)

        # Log-compress and rescale to [-1, 1].
        logpower = tf.math.log1p(mel)
        logpower /= tf.reduce_max(logpower)
        logpower = logpower * 2 - 1
        # Rearrange to channels-last: (batch, bands, frames, resolutions).
        logpower = tf.transpose(logpower, (0, 3, 2, 1))
        return logpower


class Pad2D(Layer):
    """Pads height and width with tf.pad (reflection padding by default)."""

    def __init__(self, padding=(1, 1), mode='REFLECT', **kwargs):
        super().__init__(**kwargs)
        self.padding = padding
        self.mode = mode

    def call(self, x):
        h, w = self.padding
        paddings = [[0, 0], [h, h], [w, w], [0, 0]]
        return tf.pad(x, paddings, self.mode)
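A quick shape check (a sketch under the constants above, not part of the original kernel): SpectralTransform turns a batch of channels-first waveform crops into an image-like tensor of shape (batch, bands, frames, 3), with one channel per STFT window size.

x = tf.random.uniform([2, 1, hop * frames], -1.0, 1.0)
print(SpectralTransform()(x).shape)  # expected: (2, 256, 256, 3)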
In [4]:
# Load every saved model of the ensemble, registering the custom layers.
models = []
for path in tf.io.gfile.glob('../input/freesound-audio-tagging-2019-model/*.h5'):

    model = tf.keras.models.load_model(
        path,
        compile=False, 
        custom_objects={
            'SpectralTransform': SpectralTransform, 
            'Pad2D': Pad2D
        }
    )

    models.append(model)
WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/resource_variable_ops.py:642: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
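A small sanity check on the loaded ensemble (a sketch; the exact shapes depend on how the models were trained and saved):

for m in models:
    print(m.name, m.input_shape, m.output_shape)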
In [ ]:
df = pd.read_csv('../input/freesound-audio-tagging-2019/sample_submission.csv', index_col='fname')

# Weight each model by its inverse relative validation loss, so the
# best (lowest-loss) model gets weight 1.
losses = pd.read_csv('../input/freesound-audio-tagging-2019-model/losses.csv', header=None, index_col=0).values
weights = 1/(losses / losses.min())

for x in tqdm(df.index):
    full_waveform = load('../input/freesound-audio-tagging-2019/test/' + x, training=False)

    activations = []
    for model in models:
        # Test-time augmentation: score 32 random crops and keep the per-class maximum.
        xs = tf.stack([tf.image.random_crop(full_waveform, [1, int(hop * frames)]) for _ in range(32)])
        ys = model.predict_on_batch(xs)
        y = tf.reduce_max(ys, axis=0)
        activations.append(y)

    # Loss-weighted average of the ensemble predictions.
    df.loc[x] = tf.reduce_mean(weights * tf.stack(activations), axis=0).numpy()

df.to_csv('submission.csv')
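To make the loss-based weighting concrete, a small sketch with hypothetical validation losses (the real values come from losses.csv above):

import numpy as np

example_losses = np.array([[0.65], [0.70], [0.80]])   # hypothetical per-model losses
example_weights = 1 / (example_losses / example_losses.min())
print(example_weights.ravel())  # [1.0, ~0.93, ~0.81]: lower loss -> larger weight, best model pinned at 1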
