#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2023 Apple Inc. All Rights Reserved.
#
import math
import torch
from utils import logger


def get_mfccs(
data: torch.Tensor,
sampling_rate: float,
num_mfccs: int,
window_length: float = 0.023,
) -> torch.Tensor:
"""Get Mel Frequency Cepstral Coefficients from an audio signal.
Explanation of Mel-Frequency Cepstral Coefficients (MFCCs):
> https://librosa.org/doc/main/generated/librosa.stft.html#librosa.stft
Args:
data: one channel of the audio signal, as a 1-D tensor.
sampling_rate: the sampling rate of the audio.
num_mfccs: the number of cepstral coefficients to use.
window_length: the window length used for computing the spectrogram. By
default, we choose 23ms, which is a good value for human speech.
"""
    try:
        # Importing torchaudio takes ~0.6 s and is often not needed, which is
        # why it is imported lazily inside this function.
        from torchaudio.transforms import MFCC
    except ImportError:
        logger.error("Torchaudio is not installed. Please install it.")
        raise
n_fft = sampling_rate * window_length
# Round to the nearest power of 2.
n_fft = 2 ** round(math.log2(n_fft))
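    # For example (assumed values): with sampling_rate=16000 and
    # window_length=0.023, n_fft = 16000 * 0.023 = 368, which is rounded to the
    # nearest power of two, 2 ** 9 = 512.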
return MFCC(
sample_rate=sampling_rate,
n_mfcc=num_mfccs,
melkwargs={
"n_fft": n_fft,
# librosa's default value: https://github.com/librosa/librosa/blob/71077174b9e73ae81d268f81551bb9667bf3693b/librosa/filters.py#L132
"n_mels": 128,
# librosa's default value: https://github.com/librosa/librosa/blob/71077174b9e73ae81d268f81551bb9667bf3693b/librosa/feature/spectral.py#L2027
"hop_length": 512,
"mel_scale": "slaney",
# librosa's default value: https://github.com/librosa/librosa/blob/71077174b9e73ae81d268f81551bb9667bf3693b/librosa/filters.py#L136
"norm": "slaney",
},
)(data.float())
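

# A minimal usage sketch (illustrative only; assumes torchaudio is installed and
# uses a made-up one-second mono signal at 16 kHz):
#
#   waveform = torch.randn(16000)
#   mfccs = get_mfccs(waveform, sampling_rate=16000, num_mfccs=20)
#   # mfccs has shape [num_mfccs, T]; with a hop length of 512 samples, T is on
#   # the order of 16000 / 512 ~= 31 frames (the exact value depends on
#   # torchaudio's framing and centering defaults).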


def calculate_mfccs(
audio: torch.Tensor,
sampling_rate: float,
num_mfccs: int,
window_length: float = 0.023,
) -> torch.Tensor:
"""Calculate MFCCs on a batch of data.
Args:
audio: the audio signal, in [batch_size, num_channels, temporal_size]
order.
sampling_rate: the sampling rate of the audio signal.
num_mfccs: the number of coefficients to use.
window_length: the window length used for computing the spectrogram. By
default, we choose 23ms, which is a good value for human speech.
"""
if audio.dim() != 3:
raise ValueError(f"Expected 3 dimensions, got {audio.dim()}")
spectrogram_length = get_mfccs(
audio[0][0],
sampling_rate,
num_mfccs,
window_length=window_length,
).shape[1]
result = torch.empty(
[audio.shape[0], audio.shape[1], num_mfccs, spectrogram_length],
device=audio.device,
dtype=audio.dtype,
)
for i, element in enumerate(audio):
for j, channel in enumerate(element):
mfccs = get_mfccs(
channel, sampling_rate, num_mfccs, window_length=window_length
)
result[i, j] = mfccs
return result
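

# Illustrative sketch (assumed shapes): a batch of 4 one-second stereo clips
# sampled at 16 kHz is passed as a [4, 2, 16000] tensor:
#
#   batch = torch.randn(4, 2, 16000)
#   out = calculate_mfccs(batch, sampling_rate=16000, num_mfccs=20)
#   # out has shape [4, 2, 20, T], with T determined by the hop length (512 samples).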


def get_mfcc_features(
audio: torch.Tensor,
sampling_rate: float,
num_mfccs: int,
num_frames: int,
window_length: float = 0.023,
) -> torch.Tensor:
"""Get MFCC features for a batch of audio data.
Args:
audio: the audio signal, in [batch_size, temporal_size, num_channels]
order.
sampling_rate: the sampling rate of the audio signal.
num_mfccs: the number of coefficients to use.
window_length: the window length used for computing the spectrogram. By
default, we choose 23ms, which is a good value for human speech.
num_frames: each MFCC spectrogram gets dividied into @num_frames frames
(sub-time-slice temporal components) of length ceil(spectrogram_length/num_frames).
Returns:
MFCCs in [N, C, num_mfccs, num_frames, ceil(spectrogram_length/num_frames)] order.
"""
    if audio.dim() != 3:
        raise ValueError(f"Expected 3 dimensions, got {audio.dim()}")
    if audio.shape[2] != 2:
        raise ValueError(f"Expected 2 audio channels, got {audio.shape[2]}")
audio = audio.permute([0, 2, 1])
features = calculate_mfccs(
audio,
sampling_rate,
num_mfccs,
window_length=window_length,
) # Size: [N, C, num_mfccs, T].
return get_padded_features(
features=features,
num_frames=num_frames,
)
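

# Illustrative sketch (assumed values): get_mfcc_features expects channels-last
# input, [batch_size, temporal_size, num_channels], with exactly two channels:
#
#   stereo = torch.randn(4, 16000, 2)
#   feats = get_mfcc_features(stereo, sampling_rate=16000, num_mfccs=20, num_frames=8)
#   # feats has shape [4, 2, 20, 8, ceil(T / 8)].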


def get_padded_features(
features: torch.Tensor,
num_frames: int,
) -> torch.Tensor:
"""
Splits the temporal dimension (of length T) of MFCC features into
@num_frames sub-vectors (of length ``ceil(T/num_frames)``).
As T may not be divisible by @num_frames, pads the temporal dimension if required.
Args:
features: Tensor[batchsize x C(num_audio_channels) x num_mfccs x T]
num_frames: number of padded sub-vectors
Returns:
padded_features: Tensor (batchsize x C x num_mfccs x num_frames x ceil(T/num_frames))
"""
N, C, num_mfccs, T = features.shape
frame_length = math.ceil(T / num_frames)
if T % num_frames != 0:
padded_features = torch.zeros(
[
N,
C,
num_mfccs,
frame_length * num_frames,
],
dtype=features.dtype,
device=features.device,
)
padded_features[:, :, :, :T] = features
else:
padded_features = features
padded_features = padded_features.reshape(N, C, num_mfccs, num_frames, frame_length)
return padded_features
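

if __name__ == "__main__":
    # Minimal end-to-end sketch, not part of the library API. All shapes and
    # parameter values below are illustrative assumptions; the second half
    # requires torchaudio to be installed.

    # Pad-and-split a fake MFCC tensor whose temporal length (63) is not
    # divisible by num_frames (8): the last frame gets zero-padded.
    fake_mfccs = torch.randn(1, 2, 20, 63)
    print(get_padded_features(fake_mfccs, num_frames=8).shape)
    # -> torch.Size([1, 2, 20, 8, 8])

    # Full pipeline on random stereo audio in [batch_size, temporal_size, num_channels] order.
    audio = torch.randn(2, 16000, 2)
    features = get_mfcc_features(audio, sampling_rate=16000, num_mfccs=20, num_frames=8)
    print(features.shape)  # [2, 2, 20, 8, ceil(T / 8)]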