import matplotlib.pyplot as plt
import torch
from IPython.display import Audio
def plot_spectrogram(
    waveform: torch.Tensor,
    sample_rate: int,
    title: str = "Spectrogram",
    xlim: tuple = None,
):
    """Plot a per-channel spectrogram of ``waveform``.

    Adapted from:
    https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html

    Args:
        waveform: 2-D tensor shaped ``(num_channels, num_samples)``.
        sample_rate: Sampling rate in Hz, used to scale the frequency axis.
        title: Figure title.
        xlim: Optional ``(left, right)`` limits forwarded to ``Axes.set_xlim``
            (the original ``int`` annotation was wrong — ``set_xlim`` takes a pair).
    """
    # detach()/cpu() make this safe for grad-tracking and CUDA tensors,
    # for which a plain .numpy() raises.
    waveform = waveform.detach().cpu().numpy()
    num_channels, _ = waveform.shape
    fig, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        # plt.subplots returns a bare Axes (not a list) for a single subplot.
        axes = [axes]
    for c in range(num_channels):
        axes[c].specgram(waveform[c], Fs=sample_rate, scale="dB")
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
        if xlim:
            axes[c].set_xlim(xlim)
    fig.suptitle(title)
    plt.show(block=False)
Audio Data Augmentations¶
In this chapter, we will discuss common transformations that we can apply to audio signals in the time domain. We will refer to these as “audio data augmentations”.
Data augmentations are a set of methods that add modified copies to a dataset, from the existing data. This process creates many variations of natural data, and can act as a regulariser to reduce the problem of overfitting. It can also help deep neural networks become robust to complex variations of natural data, which improves their generalisation performance.
In the field of computer vision, the transformations that we apply to images are often very self-explanatory. Take this image, for example. It becomes clear that we are zooming in and removing the color of the image:
import numpy as np
from PIL import Image

# Load the example photo as a NumPy array (height, width, channels).
with Image.open("../../book/images/janne/freddie_mercury.jpg") as img:
    pixels = np.array(img)

fig, ax = plt.subplots(1, 2)

# Left panel: the untouched image.
ax[0].imshow(pixels)
ax[0].axis("off")

# Right panel, step 1 — "zoom": a 500x400 window starting at the image centre.
height, width, _ = pixels.shape
top, left = int(height / 2), int(width / 2)
zoomed = pixels[top : top + 500, left : left + 400, :]

# Right panel, step 2 — "remove colour": keep a single channel, shown as grayscale.
grayscale = zoomed[..., 0]
ax[1].imshow(grayscale, cmap="gray")
ax[1].axis("off")

plt.show()
Naturally, we cannot translate transformations from the vision domain directly to the audio domain. Before we explore a battery of audio data augmentations, we now list the currently available code libraries:
Code Libraries¶
Name |
Author |
Framework |
Language |
License |
Link |
|---|---|---|---|---|---|
Muda |
B. McFee et al. (2015) |
General Purpose |
Python |
ISC License |
|
Audio Degradation Toolbox |
M. Mauch et al. (2013) |
General Purpose |
MATLAB |
GNU General Public License 2.0 |
|
rubberband |
- |
General Purpose |
C++ |
GNU General Public License (non-commercial) |
|
audiomentations |
I. Jordal (2021) |
General Purpose |
Python |
MIT License |
|
tensorflow-io |
tensorflow.org |
TensorFlow |
Python |
Apache 2.0 License |
|
torchaudio |
pytorch.org |
PyTorch |
Python |
BSD 2-Clause “Simplified” License |
|
torch-audiomentations |
Asteroid (2021) |
PyTorch |
Python |
MIT License |
|
torchaudio-augmentations |
J. Spijkervet (2021) |
PyTorch |
Python |
MIT License |
Listening¶
One of the most essential, and yet overlooked, parts of music research is exploring and observing the data. This also applies to data augmentation research: one has to develop a general understanding of the effect of transformations that can be applied to audio. Even more so, when transformations are applied sequentially.
For instance, we will understand why a reverb applied before a frequency filter will sound different than when the reverb is applied after the frequency filter. Before we develop this intuition, let’s listen to a series of audio data augmentations.
import os
import random
import numpy as np
import soundfile as sf
import torch
from torch.utils import data
from torchaudio_augmentations import (
Compose,
Delay,
Gain,
HighLowPass,
Noise,
PitchShift,
PolarityInversion,
RandomApply,
RandomResizedCrop,
Reverb,
)
# The ten genre labels of the GTZAN dataset; list position doubles as the
# integer class index used by GTZANDataset.
GTZAN_GENRES = [
    "blues", "classical", "country", "disco", "hiphop",
    "jazz", "metal", "pop", "reggae", "rock",
]
class GTZANDataset(data.Dataset):
    """GTZAN genre-classification dataset.

    Each item is a ``(waveform, sample_rate, genre_index)`` triple. For the
    ``"train"`` split a single random crop of ``num_samples`` samples is
    returned; for other splits the track is cut into ``num_chunks`` evenly
    spaced chunks so that evaluation covers the whole song.

    Args:
        data_path: Root directory containing the split lists and a
            ``genres`` folder with the audio files.
        split: Dataset split name; ``"<split>_filtered.txt"`` must exist
            under ``data_path``.
        num_samples: Length (in samples) of each returned crop/chunk.
        num_chunks: Number of evaluation chunks per track (non-train splits).
        is_augmentation: Whether to apply the audio augmentation chain to
            training examples.
    """

    def __init__(self, data_path, split, num_samples, num_chunks, is_augmentation):
        self.data_path = data_path if data_path else ""
        self.sr = 22050  # GTZAN sample rate; files are assumed to match.
        self.split = split
        self.num_samples = num_samples
        self.num_chunks = num_chunks
        self.is_augmentation = is_augmentation
        self.genres = GTZAN_GENRES
        self._get_song_list()
        if is_augmentation:
            self._get_augmentations()

    def _get_song_list(self):
        """Read the split's track list ('genre/filename' per line)."""
        list_filename = os.path.join(self.data_path, "%s_filtered.txt" % self.split)
        with open(list_filename) as f:
            lines = f.readlines()
        self.song_list = [line.strip() for line in lines]

    def _get_augmentations(self):
        """Build the stochastic augmentation chain applied to training audio."""
        transforms = [
            RandomResizedCrop(n_samples=self.num_samples),
            RandomApply([PolarityInversion()], p=0.8),
            RandomApply([Noise(min_snr=0.3, max_snr=0.5)], p=0.3),
            RandomApply([Gain()], p=0.2),
            RandomApply([HighLowPass(sample_rate=22050)], p=0.8),
            RandomApply([Delay(sample_rate=22050)], p=0.5),
            RandomApply(
                [PitchShift(n_samples=self.num_samples, sample_rate=22050)], p=0.4
            ),
            RandomApply([Reverb(sample_rate=22050)], p=0.3),
        ]
        self.augmentation = Compose(transforms=transforms)

    def _adjust_audio_length(self, wav):
        """Crop (train) or chunk (eval) ``wav`` to the configured length."""
        if self.split == "train":
            # Valid crop starts are 0..len(wav)-num_samples inclusive. The
            # original upper bound of len(wav)-num_samples-1 crashed with
            # ValueError when the waveform length equalled num_samples.
            max_start = max(len(wav) - self.num_samples, 0)
            start = random.randint(0, max_start)
            wav = wav[start : start + self.num_samples]
        else:
            # Evenly spaced chunks covering the track for evaluation.
            hop = (len(wav) - self.num_samples) // self.num_chunks
            wav = np.array(
                [
                    wav[i * hop : i * hop + self.num_samples]
                    for i in range(self.num_chunks)
                ]
            )
        return wav

    def __getitem__(self, index):
        line = self.song_list[index]
        # Genre is encoded in the path: "<genre>/<filename>".
        genre_name = line.split("/")[0]
        genre_index = self.genres.index(genre_name)
        # Load audio; the file's own sample rate is ignored — assumed 22050 Hz.
        audio_filename = os.path.join(self.data_path, "genres", line)
        wav, _ = sf.read(audio_filename)
        # Crop/chunk to the configured length.
        wav = self._adjust_audio_length(wav).astype("float32")
        # Augmentations operate on a (channel, time) tensor.
        if self.is_augmentation:
            wav = (
                self.augmentation(torch.from_numpy(wav).unsqueeze(0)).squeeze(0).numpy()
            )
        wav = torch.from_numpy(wav.reshape(1, -1))
        return wav, self.sr, genre_index

    def __len__(self):
        return len(self.song_list)
def get_dataloader(
    data_path=None,
    split="train",
    num_samples=22050 * 29,
    num_chunks=1,
    batch_size=16,
    num_workers=0,
    is_augmentation=False,
):
    """Build a ``DataLoader`` over the GTZAN dataset.

    Args:
        data_path: Root directory passed to ``GTZANDataset``.
        split: Split name; only ``"train"`` is shuffled/augmentable.
        num_samples: Crop/chunk length in samples.
        num_chunks: Evaluation chunks per track (non-train splits).
        batch_size: Training batch size; for evaluation it is divided by
            ``num_chunks`` so a batch carries a comparable sample count.
        num_workers: DataLoader worker processes.
        is_augmentation: Forwarded to ``GTZANDataset``.

    Returns:
        A configured ``torch.utils.data.DataLoader``.
    """
    is_shuffle = split == "train"
    if split == "train":
        effective_batch_size = batch_size
    else:
        # Each eval item carries num_chunks crops; shrink the batch
        # accordingly, but never below 1 (DataLoader rejects batch_size=0,
        # which the original integer division could produce).
        effective_batch_size = max(batch_size // num_chunks, 1)
    data_loader = data.DataLoader(
        dataset=GTZANDataset(
            data_path, split, num_samples, num_chunks, is_augmentation
        ),
        batch_size=effective_batch_size,
        shuffle=is_shuffle,
        drop_last=False,
        num_workers=num_workers,
    )
    return data_loader
from torchaudio.datasets import GTZAN

# Build a non-augmented training loader and inspect one example.
train_loader = get_dataloader(
    data_path="../../codes/split", split="train", is_augmentation=False
)
dataset = train_loader.dataset
idx = 5
# Fixed: the original string read "f{len(dataset)}", so a stray literal
# "f" was printed before the count (e.g. "f442").
print(f"Number of datapoints in the GTZAN dataset: {len(dataset)}\n")
print(f"Selected track no.: {idx}")
audio, sr, genre = dataset[idx]
print(
    f"Genre: {genre}\nSample rate: {sr}\nChannels: {audio.shape[0]}\nSamples: {audio.shape[1]}"
)
display(Audio(audio, rate=sr))
Number of datapoints in the GTZAN dataset: 442
Selected track no.: 5
Genre: 0
Sample rate: 22050
Channels: 1
Samples: 639450
Random Crop¶
Similar to how we can crop an image, so that only a subset of the image is represented, we can ‘crop’ a piece of audio by selecting a fragment between two time points $t_0$ and $t_1$.
Various terms for this exist, e.g., slicing or trimming.
Frequency Filter¶
Note
In these examples and the accompanying code, we assume the audio array is ordered as follows: (channel, time)
A frequency filter is applied to the signal. We can process the signal with either the LowPass or HighPass algorithm [47]. In a stochastic setting, we can determine which one to apply by, for example, a coin flip. Another filter parameter we can control stochastically is the cutoff frequency: the frequency at which the filter will be applied. All frequencies above the cut-off frequency are filtered from the signal for a low-pass filter (i.e., we let the low frequencies pass). Similarly for the high-pass filter, all frequencies below the cut-off frequency are filtered from the signal (i.e., we let the high frequencies pass).
from torch_audiomentations import LowPassFilter

# Low-pass the example track at (effectively) a fixed 3 kHz cutoff: the
# cutoff range is one Hz wide and p=1.0 makes application deterministic.
# LowPassFilter expects a batch dimension, hence unsqueeze/squeeze.
low_passed = LowPassFilter(
    sample_rate=sr,
    p=1.0,
    min_cutoff_freq=3000,
    max_cutoff_freq=3001,
)(audio.unsqueeze(0)).squeeze(0)

# Listen to and visualise the signal before and after filtering.
print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr)

print("LowPassFilter")
display(Audio(low_passed, rate=sr))
plot_spectrogram(low_passed, sr)
Original
LowPassFilter
Delay¶
The signal is delayed by a value that can be chosen arbitrarily. The delayed signal is added to the original signal with a volume factor, e.g., we can multiply the signal’s amplitude by 0.5.
from torchaudio_augmentations import Delay

# Add a single echo roughly 200 ms after the original signal (the
# min/max range is one ms wide, so the delay is effectively fixed).
delayed = Delay(sample_rate=sr, min_delay=200, max_delay=201)(audio)

print("Original")
display(Audio(audio, rate=sr))
# plot_spectrogram(audio, sr)

print(f"Delay of {200}ms")
display(Audio(delayed, rate=sr))
# plot_spectrogram(delayed, sr)
Original