{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Audio track extraction and FFT band features\n",
    "\n",
    "Pulls audio tracks out of media files with `ffmpeg-python`, decodes them with `audio2numpy`, and summarizes their frequency content with an FFT."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# ffmpeg-python comes from conda-forge; %conda installs into the running kernel's environment.\n",
    "%conda install -c conda-forge ffmpeg-python -y\n",
    "# Updating ffmpeg is needed to fix the missing libopenh264.so library issue.\n",
    "%conda update ffmpeg -y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%pip install audio2numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import subprocess\n",
    "\n",
    "import audio2numpy as a2n\n",
    "import ffmpeg\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from IPython.display import Audio, Image, display\n",
    "from scipy.fft import fft, fftfreq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample inputs used by the demo cells below.\n",
    "TS_VIDEO = 'test_video.ts'\n",
    "# NOTE(review): absolute SageMaker path; point this at a file that exists locally.\n",
    "MP4_VIDEO = '/home/sagemaker-user/test/1.mp4'\n",
    "CROWD_NOISE = 'crowdnoise.mp3'\n",
    "\n",
    "# Value of ffmpeg's 'ar' (audio sample rate) option used for every resampled output.\n",
    "TARGET_SAMPLE_RATE = '16000'\n",
    "# A trailing partial window is kept only if it is longer than this fraction of a full window.\n",
    "MIN_TAIL_FRACTION = 0.3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Shared helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _stderr_text(exc):\n",
    "    \"\"\"Return the stderr captured on an ffmpeg.Error as text ('' if none was captured).\"\"\"\n",
    "    return (exc.stderr or b'').decode('utf-8', errors='replace')\n",
    "\n",
    "\n",
    "def _media_stem(filename):\n",
    "    \"\"\"Return the file name without its directory and final extension.\"\"\"\n",
    "    return filename.split('/')[-1].rsplit('.', 1)[0]\n",
    "\n",
    "\n",
    "def _track_to_normalized_array(filename, track, tmp_file):\n",
    "    \"\"\"Transcode one stream of a media file to MP3 and load it as a normalized array.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the input media file.\n",
    "        track: Stream specifier handed to ffmpeg (converted with str()).\n",
    "        tmp_file: Path of the intermediate MP3; overwritten if it exists.\n",
    "\n",
    "    Returns:\n",
    "        (samples, sr): the first channel scaled by its peak absolute value, and\n",
    "        the sample rate reported by audio2numpy.\n",
    "\n",
    "    Raises:\n",
    "        ffmpeg.Error: If ffmpeg fails; its stderr is printed first.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        stream = ffmpeg.input(filename)\n",
    "        (\n",
    "            ffmpeg.output(stream[str(track)], tmp_file, format='mp3', ar=TARGET_SAMPLE_RATE)\n",
    "            .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)\n",
    "        )\n",
    "    except ffmpeg.Error as exc:\n",
    "        print(_stderr_text(exc))\n",
    "        raise\n",
    "    samples, sr = a2n.audio_from_file(tmp_file)\n",
    "    samples = np.asarray(samples)\n",
    "    # Multi-channel audio is assumed to arrive as (n_samples, n_channels); keep\n",
    "    # channel 0. Mono audio is already 1-D, where per-sample indexing would fail.\n",
    "    if samples.ndim > 1:\n",
    "        samples = samples[:, 0]\n",
    "    # Scale by the largest magnitude so the result lies in [-1, 1]; a silent or\n",
    "    # empty track is returned unscaled instead of being divided by zero.\n",
    "    peak = np.max(np.abs(samples)) if samples.size else 0.0\n",
    "    if peak > 0:\n",
    "        samples = samples / peak\n",
    "    return samples, sr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quick ffmpeg checks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write stream '2' of the transport stream to MP3 without an 'ar' resampling option.\n",
    "try:\n",
    "    stream = ffmpeg.input(TS_VIDEO)\n",
    "    out = ffmpeg.output(stream['2'], 'output_no16k.mp3', format='mp3')\n",
    "    # err.stderr is only filled in when stderr is captured, and overwrite_output\n",
    "    # lets the cell be re-run without ffmpeg stopping at its overwrite prompt.\n",
    "    out.run(capture_stdout=True, capture_stderr=True, overwrite_output=True)\n",
    "except ffmpeg.Error as err:\n",
    "    # Deliberately not re-raised: later cells do not use this output file.\n",
    "    print(_stderr_text(err))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the MP4 to MP3 with the 'ar' option set to TARGET_SAMPLE_RATE.\n",
    "try:\n",
    "    out, err = (\n",
    "        ffmpeg\n",
    "        .input(MP4_VIDEO)\n",
    "        .output('tmp.mp3', format='mp3', ar=TARGET_SAMPLE_RATE)\n",
    "        .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)\n",
    "    )\n",
    "except ffmpeg.Error as exc:\n",
    "    print(_stderr_text(exc))\n",
    "    raise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Spectrum plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_freq(x1, sr, ylim=(0, 0.0012)):\n",
    "    \"\"\"Plot the one-sided FFT amplitude spectrum of a signal and return its FFT.\n",
    "\n",
    "    Args:\n",
    "        x1: Sequence of samples (assumed 1-D).\n",
    "        sr: Sample rate; 1/sr is used as the sample spacing of the frequency axis.\n",
    "        ylim: (bottom, top) limits of the amplitude axis, or None to autoscale.\n",
    "            The default is the range that used to be hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        The full FFT of x1 as returned by scipy.fft.fft.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If x1 is empty.\n",
    "    \"\"\"\n",
    "    n_samples = len(x1)\n",
    "    if n_samples == 0:\n",
    "        raise ValueError('plot_freq needs at least one sample')\n",
    "    half = n_samples // 2\n",
    "    yf = fft(x1)\n",
    "    xf = fftfreq(n_samples, 1.0 / sr)[:half]\n",
    "\n",
    "    fig, ax = plt.subplots()\n",
    "    # 2/N scales the one-sided magnitudes to amplitudes.\n",
    "    ax.plot(xf, 2.0 / n_samples * np.abs(yf[:half]))\n",
    "    if ylim is not None:\n",
    "        ax.set_ylim(*ylim)\n",
    "    ax.set(title='Amplitude spectrum', xlabel='Frequency', ylabel='Amplitude')\n",
    "    ax.grid()\n",
    "    plt.show()\n",
    "    return yf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extracting a track as a normalized array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ffmpegProcessor:\n",
    "    \"\"\"Exposes track extraction as a method.\"\"\"\n",
    "\n",
    "    def extract_audio(self, filename, track=1):\n",
    "        \"\"\"Extract one track of a media file as a peak-normalized array.\n",
    "\n",
    "        The intermediate MP3 is named after the last three characters of the\n",
    "        input's stem followed by '_tmp.mp3'.\n",
    "\n",
    "        Args:\n",
    "            filename: Path of the input media file.\n",
    "            track: Stream specifier of the track to extract.\n",
    "\n",
    "        Returns:\n",
    "            (samples, sr): normalized first-channel samples and the sample rate.\n",
    "\n",
    "        Raises:\n",
    "            ffmpeg.Error: If ffmpeg fails.\n",
    "        \"\"\"\n",
    "        tmp_file = _media_stem(filename)[-3:] + '_tmp.mp3'\n",
    "        return _track_to_normalized_array(filename, track, tmp_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ap = ffmpegProcessor()\n",
    "x_ffmpeg, sr = ap.extract_audio(CROWD_NOISE, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Windowed FFT band features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def extract_audio(filename, track=1):\n",
    "    \"\"\"Extract one track of a media file as a peak-normalized array.\n",
    "\n",
    "    The intermediate MP3 is written to '<stem>_track<track>.mp3' in the\n",
    "    working directory.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the input media file.\n",
    "        track: Stream specifier of the track to extract.\n",
    "\n",
    "    Returns:\n",
    "        (samples, sr): normalized first-channel samples and the sample rate.\n",
    "\n",
    "    Raises:\n",
    "        ffmpeg.Error: If ffmpeg fails.\n",
    "    \"\"\"\n",
    "    tmp_file = f'{_media_stem(filename)}_track{track}.mp3'\n",
    "    print('Create tmp file', tmp_file)\n",
    "    return _track_to_normalized_array(filename, track, tmp_file)\n",
    "\n",
    "\n",
    "def fft_power_output(audio_t, sr, beg, end, low_cut, high_cut, plot_f=False):\n",
    "    \"\"\"Sum the FFT magnitudes in the lowest and highest bands of one window.\n",
    "\n",
    "    Args:\n",
    "        audio_t: Sample sequence; the window is audio_t[beg:end].\n",
    "        sr: Sample rate, used only for the frequency axis of the plot.\n",
    "        beg: Index of the first sample of the window.\n",
    "        end: Index one past the last sample of the window.\n",
    "        low_cut: Fraction of the one-sided spectrum, counted from the lowest\n",
    "            bin, that is summed into the low band.\n",
    "        high_cut: Fraction of the one-sided spectrum, counted from the highest\n",
    "            bin, that is summed into the high band.\n",
    "        plot_f: If True, also plot the window and its spectrum.\n",
    "\n",
    "    Returns:\n",
    "        (low, high): sums of the unscaled FFT magnitudes in each band. Both are\n",
    "        0.0 for an empty window, and a cut of 0 yields an empty band.\n",
    "    \"\"\"\n",
    "    window = audio_t[beg:end]\n",
    "    n = len(window)\n",
    "    if n == 0:\n",
    "        # scipy's fft rejects zero-length input.\n",
    "        return 0.0, 0.0\n",
    "    n_bins = n // 2\n",
    "    if plot_f:\n",
    "        plt.figure()\n",
    "        plt.title('Audio Signal in Time Domain')\n",
    "        plt.plot(window)\n",
    "\n",
    "    magnitudes = np.abs(fft(window)[:n_bins])\n",
    "    if plot_f:\n",
    "        freqs = fftfreq(n, 1.0 / sr)[:n_bins]\n",
    "        plt.figure()\n",
    "        plt.title('Audio Signal in Freq Domain')\n",
    "        plt.plot(freqs, 2.0 / n * magnitudes)\n",
    "        plt.grid()\n",
    "\n",
    "    # Both bands use the same bin-count rule, clamped to the spectrum length.\n",
    "    n_low = min(max(int(low_cut * n // 2), 0), n_bins)\n",
    "    n_high = min(max(int(high_cut * n // 2), 0), n_bins)\n",
    "    # The high band is sliced from an explicit start index: a negative offset\n",
    "    # of zero would select the whole spectrum when high_cut is 0.\n",
    "    return np.sum(magnitudes[:n_low]), np.sum(magnitudes[n_bins - n_high:])\n",
    "\n",
    "\n",
    "def feature_extraction(media_path, track=1, wsize=5, low_cut=0.1, high_cut=0.1, plot_f=False):\n",
    "    \"\"\"Compute [low, high] FFT band sums for consecutive windows of an audio track.\n",
    "\n",
    "    Args:\n",
    "        media_path: Path of the input media file.\n",
    "        track: Stream specifier of the track to analyze.\n",
    "        wsize: Window length in seconds; may be fractional.\n",
    "        low_cut: Fraction of the spectrum summed into the low band.\n",
    "        high_cut: Fraction of the spectrum summed into the high band.\n",
    "        plot_f: If True, plot the whole signal and every window.\n",
    "\n",
    "    Returns:\n",
    "        A list with one [low, high] pair per window. A trailing partial window\n",
    "        is included only when it is longer than MIN_TAIL_FRACTION of a window.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the window would contain no samples.\n",
    "        ffmpeg.Error: If ffmpeg fails while extracting the track.\n",
    "    \"\"\"\n",
    "    # Decode the track through ffmpeg, resampled to TARGET_SAMPLE_RATE.\n",
    "    x_ffmpeg, sr = extract_audio(media_path, track)\n",
    "    if plot_f:\n",
    "        plt.figure()\n",
    "        plt.title('Over all Audio Signal in Time Domain')\n",
    "        plt.plot(x_ffmpeg)\n",
    "\n",
    "    nsamples = len(x_ffmpeg)\n",
    "    print(f'Sample rate of the audio is {sr}, total samples {nsamples}')\n",
    "    # Whole number of samples per window, so slice bounds stay integers even\n",
    "    # for a fractional wsize.\n",
    "    win = int(sr * wsize)\n",
    "    if win <= 0:\n",
    "        raise ValueError(f'wsize={wsize} gives an empty window at sample rate {sr}')\n",
    "    nw = nsamples // win\n",
    "    tail_beg = nw * win\n",
    "    tail_len = nsamples - tail_beg\n",
    "    keep_tail = tail_len / win > MIN_TAIL_FRACTION\n",
    "    print(f'Total length is {nsamples/sr}s with window size {wsize}s. Num of windows is {nw + int(keep_tail)}')\n",
    "\n",
    "    features = []\n",
    "    for i in range(nw):\n",
    "        beg = i * win\n",
    "        end = beg + win\n",
    "        print(f'Get FFT features from sample {beg} to {end}')\n",
    "        low, high = fft_power_output(x_ffmpeg, sr, beg, end, low_cut, high_cut, plot_f)\n",
    "        features.append([low, high])\n",
    "\n",
    "    if keep_tail:\n",
    "        print(f'Get FFT features from sample {tail_beg} to {nsamples}')\n",
    "        low, high = fft_power_output(x_ffmpeg, sr, tail_beg, nsamples, low_cut, high_cut, plot_f)\n",
    "        features.append([low, high])\n",
    "    else:\n",
    "        print(f'Skip last {tail_len} samples, {tail_len/sr} sec, from {tail_beg} to {nsamples}')\n",
    "\n",
    "    return features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Splitting a track into fixed-length clips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def getLength(filename):\n",
    "    \"\"\"Return the duration of a media file in seconds, read from ffprobe's format info.\"\"\"\n",
    "    return float(ffmpeg.probe(filename)['format']['duration'])\n",
    "\n",
    "\n",
    "def split_audio(filename, track=1, len=5):\n",
    "    \"\"\"Cut one track into consecutive MP3 clips and play each one inline.\n",
    "\n",
    "    Clips are written to tmp_<i>.mp3 in the working directory. A trailing\n",
    "    partial clip is produced only when it is longer than MIN_TAIL_FRACTION of\n",
    "    a full clip.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the input media file.\n",
    "        track: Stream specifier of the track to split.\n",
    "        len: Clip length in seconds. The name shadows the builtin and is kept\n",
    "            only so existing keyword callers continue to work.\n",
    "\n",
    "    Raises:\n",
    "        ffmpeg.Error: If probing or transcoding fails.\n",
    "    \"\"\"\n",
    "    clip_len = len\n",
    "    totlen = getLength(filename)\n",
    "    # The duration is a float, so the floor division must be cast for range().\n",
    "    nwin = int(totlen // clip_len)\n",
    "    tail = totlen % clip_len\n",
    "    if tail / clip_len > MIN_TAIL_FRACTION:\n",
    "        print(tail / clip_len)\n",
    "        nwin += 1\n",
    "    else:\n",
    "        print(f'Skip last {tail} seconds')\n",
    "    print(f'Total have {nwin} windows')\n",
    "    for i in range(nwin):\n",
    "        tmp_file = f'tmp_{i}.mp3'\n",
    "        try:\n",
    "            # ss seeks the input to the clip start; t limits the output duration.\n",
    "            stream = ffmpeg.input(filename, ss=i * clip_len)\n",
    "            (\n",
    "                ffmpeg.output(stream[str(track)], tmp_file, t=clip_len, format='mp3', ar=TARGET_SAMPLE_RATE)\n",
    "                .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)\n",
    "            )\n",
    "        except ffmpeg.Error as exc:\n",
    "            print(_stderr_text(exc))\n",
    "            raise\n",
    "        # Show a player for each clip as soon as it has been written.\n",
    "        display(Audio(tmp_file))\n",
    "\n",
    "# Example: split_audio('crowdnoise.mp3', 0, 5)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}