{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Evaluate the Spoken Language Classifier\n", "In this notebook, you will:\n", "1. Deploy your trained model to a sagemaker endpoint\n", "2. Run inference on the test dataset\n", "3. Calculate evaluation metrics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import libraries and load AWS credentials" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -U sagemaker tdqm" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sagemaker\n", "from sagemaker.estimator import Estimator\n", "from sagemaker.serializers import JSONSerializer\n", "from sagemaker.predictor import Predictor\n", "import boto3\n", "import json\n", "import tarfile\n", "import os\n", "import pandas as pd\n", "import sklearn\n", "from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score\n", "import matplotlib.pyplot as plt\n", "import seaborn as sn\n", "from scipy import stats\n", "import numpy as np\n", "\n", "from src.utils import classwise_f1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "role = sagemaker.get_execution_role()\n", "sess = sagemaker.session.Session()\n", "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", "region = boto3.session.Session().region_name\n", "\n", "bucket = sess.default_bucket()\n", "s3_voxforge_prefix = os.path.join('s3://' + bucket, 'voxforge')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Insert your training job ID here" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "training_job_id = ''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Deploy model endpoint" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "image_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/spoken-language-detection'\n", "model_path = f's3://{bucket}/models/{training_job_id}/output/model.tar.gz'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = sagemaker.Model(\n", " image_uri=image_uri,\n", " model_data=model_path,\n", " role=role\n", ")\n", "\n", "model.deploy(1, 'ml.m4.xlarge')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictor = Predictor(model.endpoint_name, serializer=JSONSerializer())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run inference on test set" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load the test dataset metadata and collect the filenames (audio files are NOT downloaded)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_manifest_path = os.path.join(s3_voxforge_prefix, 'test_manifest.csv')\n", "\n", "test_df = pd.read_csv(test_manifest_path)\n", "test_df['fname'] = test_df['fname'].apply(lambda x : os.path.join(s3_voxforge_prefix, x))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make predictions on the test dataset in batches" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "files = test_df['fname'].tolist()\n", "\n", "n = 100\n", "predictions = []\n", "for batch in [files[i:i + n] for i in range(0, len(files), n)]:\n", " output = predictor.predict(batch)\n", " output = json.loads(output)\n", " \n", " predictions 
+= output\n", " print(f'Files processed : {len(predictions)}', flush=True, end='\\r')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Collect the predictions into a dataframe along with their original file metadata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#classes = list(test_df['class'].unique())\n", "classes = list(set(predictions)) # some models may not predict all languages found in test dataset\n", "\n", "results_df = test_df[['fname', 'class', 'source']]\n", "results_df['preds'] = predictions\n", "\n", "results_df = results_df[results_df['class'].isin(classes)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Save results to csv and upload to s3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "out_fname = 'test_results_{}.csv'.format(training_job_id)\n", "results_df.to_csv(out_fname, index=False)\n", "sess.upload_data(out_fname, bucket=bucket, key_prefix='results')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Micro Metrics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Calculate micro metrics (metrics on a per-class basis). This will calculate recall, precision, and F1 for each class" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_true = np.array(results_df['class'])\n", "y_pred = np.array(results_df['preds'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "micro_metrics = []\n", "for c in classes:\n", " one_v_all_true = (y_true == c).astype(np.int)\n", " one_v_all_pred = (y_pred == c).astype(np.int)\n", "\n", " micro_metrics.append({\n", " 'class' : c,\n", " 'recall' : recall_score(one_v_all_true, one_v_all_pred),\n", " 'precision' : precision_score(one_v_all_true, one_v_all_pred),\n", " 'f1' : f1_score(one_v_all_true, one_v_all_pred)\n", " })" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "micro_df = pd.DataFrame(micro_metrics, columns=['class', 'recall', 'precision', 'f1'])\n", "micro_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generate a confusion matrix to see how each language is misclassified" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cm = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=classes, normalize='true')\n", "\n", "sn.heatmap(cm, yticklabels=classes, xticklabels=classes, cmap=\"YlGnBu\", annot=True, fmt='.2f')\n", "plt.xlabel('Predicted label')\n", "plt.ylabel('True label')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Macro Metrics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Calculate macro metrics such as per-source accuracy and average accuracy accross all records" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "source_acc = results_df.groupby('source').apply(\n", " lambda x : (x['preds'] == x['class']).astype(float).mean()).mean()\n", "\n", "acc = np.mean((y_pred == y_true).tolist())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print('Source Normalized Accuracy : {}'.format(source_acc))\n", "print('Accuracy : {}'.format(acc))\n", "print('Recall : {}'.format(micro_df['recall'].mean()))\n", "print('Precision : {}'.format(micro_df['precision'].mean()))\n", "print('F1 : {}'.format(micro_df['f1'].mean()))" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "Be sure to delete the endpoint after evaluation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictor.delete_endpoint()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }