#!/usr/bin/env python # coding: utf-8 # This software component is licensed by ST under BSD 3-Clause license, # the "License"; You may not use this file except in compliance with the # License. You may obtain a copy of the License at: # https://opensource.org/licenses/BSD-3-Clause """ASC Feature Extraction example.""" import matplotlib.pyplot as plt import numpy as np import sys import librosa import librosa.display import scipy.fftpack as fft SR = 16000 N_FFT = 1024 N_MELS = 30 def create_col(y): assert y.shape == (1024,) # Create time-series window fft_window = librosa.filters.get_window('hann', N_FFT, fftbins=True) assert fft_window.shape == (1024,), fft_window.shape # Hann window y_windowed = fft_window * y assert y_windowed.shape == (1024,), y_windowed.shape # FFT fft_out = fft.fft(y_windowed, axis=0)[:513] assert fft_out.shape == (513,), fft_out.shape # Power spectrum S_pwr = np.abs(fft_out)**2 assert S_pwr.shape == (513,) # Generation of Mel Filter Banks mel_basis = librosa.filters.mel(SR, n_fft=N_FFT, n_mels=N_MELS, htk=False) assert mel_basis.shape == (30, 513) # Apply Mel Filter Banks S_mel = np.dot(mel_basis, S_pwr) S_mel.astype(np.float32) assert S_mel.shape == (30,) return S_mel def feature_extraction(y): assert y.shape == (1024, 32) S_mel = np.empty((30, 32), dtype=np.float32, order='C') for col_index in range(0, 32): S_mel[:, col_index] = create_col(y[:, col_index]) # Scale according to reference power S_mel = S_mel / S_mel.max() # Convert to dB S_log_mel = librosa.power_to_db(S_mel, top_db=80.0) assert S_log_mel.shape == (30, 32) return S_log_mel if __name__ == '__main__': wav_filename = sys.argv[1] if len(sys.argv) < 1: print('Filename not specified') exit(1) # Load audio file as a floating point time series y, _ = librosa.load(wav_filename, sr=SR, mono=True, dtype=np.float32) # Slice into overlapping frames frames = librosa.util.frame(y, frame_length=N_FFT, hop_length=512) # Extract features from overlapping frames (drop end-of-file samples) # S_log_Mel[0] = first feature (30x32) matrix # S_log_Mel[1] = second feature (30x32) matrix # ... nb_features = int(frames.shape[1] / 32) S_log_mel = np.empty((nb_features, 30, 32), dtype=np.float32, order='C') for i in range(0, nb_features): frame = frames[:, 0 + i:32 + i] S_log_mel[i] = feature_extraction(frame) # Plot first feature/spectrogram plt.figure(figsize=(10, 4)) librosa.display.specshow(S_log_mel[0], sr=SR, y_axis='mel', fmax=8000, x_axis='time', cmap='viridis') # plt.pcolormesh(S_log_Mel[0]) plt.colorbar(format='%+2.0f dB') plt.title('Mel spectrogram') plt.tight_layout() plt.show()