In [None]:
#!pip install ffmpeg-python
#!conda update ffmpeg
!conda install -c conda-forge ffmpeg-python -y #Success
!conda update ffmpeg -y # Needed for libopenh264.so lib missing issue

In [None]:
!pip install audio2numpy # Success

In [None]:
import os
import re
import subprocess

import audio2numpy as a2n
import ffmpeg
import numpy as np
from IPython.display import Audio,Image,display
from scipy.fft import fft, fftfreq

In [None]:
# Extract one stream of 'test_video.ts' into an MP3 without resampling.
# stream['2'] selects stream specifier '2' of the input (presumably an audio
# track of the transport stream -- TODO confirm against the file layout).
try:
    stream = ffmpeg.input('test_video.ts')
    out = ffmpeg.output(stream['2'], 'output_no16k.mp3', format='mp3')
    # capture_stderr: without it ffmpeg.Error.stderr is None and the handler
    # below would only print "None".
    # overwrite_output: lets the cell be re-run when the MP3 already exists.
    out.run(capture_stderr=True, overwrite_output=True)
except ffmpeg.Error as exc:
    # Best-effort cell: report ffmpeg's own diagnostics but do not re-raise.
    print(exc.stderr.decode('utf-8', errors='replace'))

In [None]:
# Transcode the clip to 'tmp.mp3' with the audio sample rate set to 16000.
# NOTE(review): the input is a hard-coded absolute path on one machine;
# consider moving it to a configurable location.
try:
    out, err = (
        ffmpeg
        .input('/home/sagemaker-user/test/1.mp4')
        .output('tmp.mp3', format='mp3', ar='16000')
        .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)
    )
except ffmpeg.Error as exc:
    # Named `exc` so it does not shadow the `err` returned by run().
    # stderr is captured as bytes; decode it so the ffmpeg log prints as
    # readable text instead of a bytes repr.
    print(exc.stderr.decode('utf-8', errors='replace'))
    raise

In [None]:
def plot_freq(x1, sr, ylim=(0, 0.0012)):
    """Plot the single-sided amplitude spectrum of a signal and return its FFT.

    Args:
        x1: 1-D sequence of samples.
        sr: Sample rate (samples per unit time); its reciprocal is used as the
            sample spacing when building the frequency axis.
        ylim: ``(low, high)`` limits for the y axis. The default reproduces the
            range this function always used; pass ``None`` to let matplotlib
            autoscale.

    Returns:
        The full (two-sided) complex FFT of ``x1`` as returned by
        ``scipy.fft.fft``.
    """
    # Imported locally because the notebook's import cell does not provide it.
    import matplotlib.pyplot as plt

    n_samples = len(x1)
    spacing = 1.0 / sr
    half = n_samples // 2

    spectrum = fft(x1)
    # Keep only the non-negative frequencies for display.
    freqs = fftfreq(n_samples, spacing)[:half]

    # 2/N scaling turns FFT magnitudes into single-sided amplitudes.
    plt.plot(freqs, 2.0 / n_samples * np.abs(spectrum[:half]))
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Frequency')
    plt.ylabel('Amplitude')
    plt.grid()
    plt.show()
    return spectrum

In [None]:
class ffmpegProcessor:
    """Decode one stream of a media file into a peak-normalized 1-D signal."""

    def extract_audio(self, filename, track=1):
        """Transcode one stream to a temporary MP3 and load it as an array.

        The stream is written to ``<basename-without-extension>_tmp.mp3`` in
        the current directory (overwritten if present), with the audio sample
        rate option set to 16000, then read back with ``audio2numpy``.

        Args:
            filename: Path of the input media file.
            track: Stream specifier passed to ffmpeg (converted to ``str``).

        Returns:
            ``(signal, sr)``: the first channel as a NumPy array scaled so its
            largest absolute sample is 1 (left unscaled if it is all zeros),
            and the sample rate reported by ``audio2numpy``.

        Raises:
            ffmpeg.Error: if ffmpeg exits with an error; its stderr is printed
                first.
        """
        # Use the whole base name: slicing pieces of split('.') broke on names
        # without an extension and made different inputs share one temp file.
        stem = os.path.splitext(os.path.basename(filename))[0]
        tmp_file = stem + '_tmp.mp3'
        try:
            stream = ffmpeg.input(filename)
            (
                ffmpeg
                .output(stream[str(track)], tmp_file, format='mp3', ar='16000')
                .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)
            )
        except ffmpeg.Error as exc:
            # stderr is captured as bytes; decode so the log is readable.
            print(exc.stderr.decode('utf-8', errors='replace'))
            raise

        samples, sr = a2n.audio_from_file(tmp_file)
        samples = np.asarray(samples)
        # Multi-channel data is indexed as (n, channels); a 1-D array is
        # treated as already mono (presumably what audio2numpy returns for
        # mono files -- TODO confirm).
        channel = samples[:, 0] if samples.ndim > 1 else samples

        # Normalize by the absolute peak; the signed maximum would mis-scale
        # signals whose largest excursion is negative, and silence would
        # divide by zero.
        peak = np.max(np.abs(channel)) if channel.size else 0
        if peak > 0:
            channel = channel / peak
        return np.array(channel), sr
 
# Build a processor and decode stream 0 of 'crowdnoise.mp3'; the resulting
# signal and sample rate are kept as notebook-level variables.
ap = ffmpegProcessor()
x_ffmpeg, sr = ap.extract_audio('crowdnoise.mp3',0)

In [None]:
from scipy.fft import fft, fftfreq
import numpy as np

def extract_audio(filename, track=1):
    """Transcode one stream of a media file to MP3 and load it as an array.

    The stream is written to ``<basename-without-extension>_track<track>.mp3``
    in the current directory (overwritten if present), with the audio sample
    rate option set to 16000, then read back with ``audio2numpy``.

    Args:
        filename: Path of the input media file.
        track: Stream specifier passed to ffmpeg (converted to ``str``).

    Returns:
        ``(signal, sr)``: the first channel as a NumPy array scaled so its
        largest absolute sample is 1 (left unscaled if it is all zeros), and
        the sample rate reported by ``audio2numpy``.

    Raises:
        ffmpeg.Error: if ffmpeg exits with an error; its stderr is printed
            first.
    """
    # splitext/basename keep dotted names intact ('a.b.mp4' -> 'a.b') and
    # accept names without an extension, which split('.')[-2] did not.
    stem = os.path.splitext(os.path.basename(filename))[0]
    tmp_file = f'{stem}_track{track}.mp3'
    print('Create tmp file', tmp_file)
    try:
        stream = ffmpeg.input(filename)
        (
            ffmpeg
            .output(stream[str(track)], tmp_file, format='mp3', ar='16000')
            .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)
        )
    except ffmpeg.Error as exc:
        print(exc.stderr.decode('utf-8', errors='replace'))
        raise

    samples, sr = a2n.audio_from_file(tmp_file)
    samples = np.asarray(samples)
    # Multi-channel data is indexed as (n, channels); a 1-D array is treated
    # as already mono (presumably what audio2numpy returns for mono files --
    # TODO confirm).
    channel = samples[:, 0] if samples.ndim > 1 else samples

    # Normalize the audio by its absolute peak. The signed maximum would
    # mis-scale signals whose largest excursion is negative, and an all-zero
    # signal would divide by zero.
    peak = np.max(np.abs(channel)) if channel.size else 0
    if peak > 0:
        channel = channel / peak
    return np.array(channel), sr
 
def fft_power_output(audio_t, sr, beg, end, low_cut,high_cut,plot_f=False):
    """Sum FFT magnitudes in the lowest and highest bands of one window.

    The window ``audio_t[beg:end]`` is transformed with ``scipy.fft.fft`` and
    only the first ``N // 2`` bins (non-negative frequencies) are kept.

    Args:
        audio_t: 1-D signal supporting slicing.
        sr: Sample rate; only used for the frequency axis of the plot.
        beg: Start index of the window (inclusive).
        end: End index of the window (exclusive).
        low_cut: Fraction of the kept bins, counted from the lowest
            frequency, that forms the low band.
        high_cut: Fraction of the kept bins, counted from the highest
            frequency, that forms the high band.
        plot_f: When true, plot the window and its amplitude spectrum.

    Returns:
        ``(low, high)``: the sums of the FFT magnitudes in the low and high
        bands. A fraction of 0 yields a sum of 0.
    """
    window = audio_t[beg:end]
    n_samples = len(window)
    half = n_samples // 2

    if plot_f:
        # Imported here because the notebook's import cell does not provide
        # `plt`; without this the plotting branch raised NameError.
        import matplotlib.pyplot as plt
        plt.figure()
        plt.title('Audio Signal in Time Domain')
        plt.plot(window)

    magnitudes = np.abs(fft(window)[:half])

    if plot_f:
        freqs = fftfreq(n_samples, 1.0 / sr)[:half]
        plt.figure()
        plt.title('Audio Signal in Freq Domain')
        plt.plot(freqs, 2.0 / n_samples * magnitudes)
        plt.grid()

    def _band_size(fraction):
        # Same floor rule for both bands, clamped to the available bins.
        return min(max(int(fraction * n_samples // 2), 0), half)

    n_low = _band_size(low_cut)
    n_high = _band_size(high_cut)
    # Slice the high band from an explicit start index: a negative-index slice
    # turns into "everything" when the band size is 0.
    low_sum = np.sum(magnitudes[:n_low])
    high_sum = np.sum(magnitudes[half - n_high:])
    return low_sum, high_sum


def feature_extraction(media_path, track=1, wsize=5, low_cut=0.1,high_cut=0.1,plot_f=False):
    """Compute low/high band FFT sums for consecutive windows of an audio track.

    The track is decoded with ``extract_audio`` and cut into windows of
    ``wsize`` seconds. Each full window yields one ``[low, high]`` pair from
    ``fft_power_output``. A trailing partial window is included only if it is
    longer than 30% of a full window.

    Args:
        media_path: Path of the media file to analyse.
        track: Stream specifier forwarded to ``extract_audio``.
        wsize: Window length in seconds; fractional values are accepted.
        low_cut: Low-band fraction forwarded to ``fft_power_output``.
        high_cut: High-band fraction forwarded to ``fft_power_output``.
        plot_f: When true, plot the whole signal and every window.

    Returns:
        A list of ``[low, high]`` pairs, one per analysed window, in order.

    Raises:
        ValueError: if ``sr * wsize`` is not at least one sample.
    """
    # Decode the requested track (extract_audio resamples via ffmpeg).
    x_ffmpeg, sr = extract_audio(media_path, track)
    if plot_f:
        # Imported here because the notebook's import cell does not provide
        # `plt`; without this the plotting branch raised NameError.
        import matplotlib.pyplot as plt
        plt.title('Over all Audio Signal in Time Domain')
        plt.plot(x_ffmpeg)

    nsamples = len(x_ffmpeg)
    print(f'Sample rate of the radio is {sr}, total samples {nsamples}')

    # Integer window length so slice bounds stay ints for fractional wsize.
    win = int(sr * wsize)
    if win <= 0:
        raise ValueError(f'Window of {wsize}s at sample rate {sr} contains no samples')

    nw = nsamples // win
    tail_beg = nw * win
    tail_len = nsamples - tail_beg
    keep_tail = tail_len / win > 0.3
    print(f'Total length is {nsamples/sr}s with window size {wsize}s. Num of windows is {nw + int(keep_tail)}')

    features = []
    for i in range(nw):
        beg = i * win
        end = beg + win
        print(f'Get FFT features from sample {beg} to {end}')
        low, high = fft_power_output(x_ffmpeg, sr, beg, end, low_cut, high_cut, plot_f)
        features.append([low, high])

    if keep_tail:
        print(f'Get FFT features from sample {tail_beg} to {nsamples}')
        low, high = fft_power_output(x_ffmpeg, sr, tail_beg, nsamples, low_cut, high_cut, plot_f)
        features.append([low, high])
    else:
        print(f'Skip last {tail_len} samples, {tail_len/sr} sec, from {tail_beg} to {nsamples}')

    return features

In [None]:
def split_audio(filename, track=1, len=5):
    """Cut one stream of a media file into fixed-length MP3 segments and play them.

    Segment ``i`` starts at ``i * len`` seconds, lasts ``len`` seconds, is
    written to ``tmp_<i>.mp3`` in the current directory (overwritten if
    present) with the audio sample rate option set to 16000, and is then
    shown with an IPython ``Audio`` widget. A trailing remainder is given its
    own segment only if it is longer than 30% of ``len``.

    Args:
        filename: Path of the input media file.
        track: Stream specifier passed to ffmpeg (converted to ``str``).
        len: Segment length in seconds. The name shadows the builtin inside
            this function; it is kept because callers may pass it by keyword.

    Raises:
        ffmpeg.Error: if ffmpeg fails on a segment; its stderr is printed
            first.
    """
    # NOTE(review): getLength is not defined in the visible notebook cells;
    # presumably it returns the media duration in seconds -- confirm.
    totlen = getLength(filename)

    # int(): a float duration makes `//` return a float, which range() rejects.
    nwin = int(totlen // len)
    remainder = totlen % len
    if remainder / len > 0.3:
        nwin += 1
    else:
        print(f'Skip last {remainder} seconds')
    print(f'Total have {nwin} windows')

    for i in range(nwin):
        tmp_file = f'tmp_{i}.mp3'
        try:
            stream = ffmpeg.input(filename, ss=i * len)
            (
                ffmpeg
                .output(stream[str(track)], tmp_file, t=len, format='mp3', ar='16000')
                .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)
            )
        except ffmpeg.Error as exc:
            # stderr is captured as bytes; decode so the log is readable.
            print(exc.stderr.decode('utf-8', errors='replace'))
            raise
        display(Audio(tmp_file))
#split_audio('crowdnoise.mp3',0,5)