# Evaluate the Spoken Language Classifier
In this notebook, you will:
1. Deploy your trained model to a SageMaker endpoint
2. Run inference on the test dataset
3. Calculate evaluation metrics

### Import libraries and load AWS credentials

In [None]:
!pip install -U sagemaker tdqm

In [None]:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.serializers import JSONSerializer
from sagemaker.predictor import Predictor
import boto3
import json
import tarfile
import os
import pandas as pd
import sklearn
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sn
from scipy import stats
import numpy as np

from src.utils import classwise_f1

In [None]:
# SageMaker session and the IAM role used to create the model.
sess = sagemaker.session.Session()
role = sagemaker.get_execution_role()

# Account and region are needed later to build the ECR image URI.
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity().get('Account')
region = boto3.session.Session().region_name

# All dataset artifacts live under the session's default bucket.
# NOTE: os.path.join is used here to build an S3 URI, so the result depends
# on the OS path separator being '/'.
bucket = sess.default_bucket()
s3_voxforge_prefix = os.path.join('s3://' + bucket, 'voxforge')

Insert your training job ID here

In [None]:
# ID of the training job to evaluate. It is used below to locate the model
# artifact in S3 and to name the results file. Fill this in before running
# the rest of the notebook.
training_job_id = ''

### Deploy model endpoint

In [None]:
# S3 location of the model artifact for the selected training job.
model_path = f's3://{bucket}/models/{training_job_id}/output/model.tar.gz'

# Container image in this account's ECR registry.
image_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/spoken-language-detection'

In [None]:
# Wrap the container image and model artifact in a SageMaker Model ...
model = sagemaker.Model(image_uri=image_uri, model_data=model_path, role=role)

# ... and host it on a single ml.m4.xlarge instance.
model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
# Client for the deployed endpoint; request payloads are serialized as JSON.
predictor = Predictor(
    endpoint_name=model.endpoint_name,
    serializer=JSONSerializer(),
)

### Run inference on test set

Load the test dataset metadata and collect the filenames (audio files are NOT downloaded)

In [None]:
def _with_voxforge_prefix(relative_name):
    """Join a manifest-relative file name onto the voxforge S3 prefix."""
    return os.path.join(s3_voxforge_prefix, relative_name)


test_manifest_path = _with_voxforge_prefix('test_manifest.csv')

# Read the manifest, then rewrite each file name with the S3 prefix prepended.
test_df = pd.read_csv(test_manifest_path)
test_df['fname'] = test_df['fname'].map(_with_voxforge_prefix)

Make predictions on the test dataset in batches

In [None]:
files = test_df['fname'].tolist()

# Send the file list to the endpoint n entries at a time and accumulate the
# decoded JSON responses in order.
n = 100
predictions = []
for start in range(0, len(files), n):
    batch = files[start:start + n]
    output = json.loads(predictor.predict(batch))

    predictions.extend(output)
    # '\r' keeps the running count on a single, continuously updated line.
    print(f'Files processed : {len(predictions)}', flush=True, end='\r')

Collect the predictions into a dataframe along with their original file metadata

In [None]:
# Evaluate only the languages the model actually predicted: some models may
# not predict every language found in the test dataset. Sorting gives a stable
# label order across kernel restarts (set iteration order for strings is not
# stable). Assumes the predictions are mutually comparable labels such as strings.
classes = sorted(set(predictions))

# Build a new frame with .assign() instead of writing a column into a slice of
# test_df, which raises pandas' SettingWithCopyWarning. `predictions` must have
# one entry per row of test_df, in the same order.
results_df = test_df[['fname', 'class', 'source']].assign(preds=predictions)

# Drop rows whose true language the model never predicted.
results_df = results_df[results_df['class'].isin(classes)]

Save results to CSV and upload to S3

In [None]:
# Write the results next to the notebook, named after the training job ...
out_fname = 'test_results_{}.csv'.format(training_job_id)
results_df.to_csv(path_or_buf=out_fname, index=False)

# ... then upload that file under the 'results' prefix of the bucket.
sess.upload_data(path=out_fname, bucket=bucket, key_prefix='results')

### Micro Metrics

Calculate micro metrics (metrics on a per-class basis). This will calculate recall, precision, and F1 for each class, treating that class as the positive label and all other classes as negative (one-vs-all).

In [None]:
# True and predicted labels as NumPy arrays, aligned row by row.
y_true = results_df['class'].to_numpy()
y_pred = results_df['preds'].to_numpy()

In [None]:
# One-vs-all metrics: for each class, binarize the labels (1 = this class,
# 0 = any other class) and score recall, precision and F1 on the binary task.
micro_metrics = []
for c in classes:
    # Use the builtin `int`: the `np.int` alias was removed in NumPy 1.24 and
    # now raises AttributeError.
    one_v_all_true = (y_true == c).astype(int)
    one_v_all_pred = (y_pred == c).astype(int)

    micro_metrics.append({
        'class': c,
        'recall': recall_score(one_v_all_true, one_v_all_pred),
        'precision': precision_score(one_v_all_true, one_v_all_pred),
        'f1': f1_score(one_v_all_true, one_v_all_pred),
    })

In [None]:
# Tabulate the per-class records with a fixed column order and display them.
metric_columns = ['class', 'recall', 'precision', 'f1']
micro_df = pd.DataFrame(data=micro_metrics, columns=metric_columns)
micro_df

Generate a confusion matrix to see how each language is misclassified

In [None]:
# Confusion matrix over the evaluated classes, normalized over the true labels.
cm = sklearn.metrics.confusion_matrix(
    y_true,
    y_pred,
    labels=classes,
    normalize='true',
)

# Annotated heatmap: true labels on the y-axis, predicted labels on the x-axis.
heatmap_ax = sn.heatmap(
    cm,
    xticklabels=classes,
    yticklabels=classes,
    cmap="YlGnBu",
    annot=True,
    fmt='.2f',
)
heatmap_ax.set_xlabel('Predicted label')
heatmap_ax.set_ylabel('True label')
plt.show()

### Macro Metrics

Calculate macro metrics such as per-source accuracy and average accuracy across all records

In [None]:
# Source-normalized accuracy: compute the accuracy within each source, then
# take the unweighted mean over sources so every source counts equally.
# The correctness flags are grouped directly rather than calling
# groupby(...).apply on the whole frame, which is deprecated in pandas 2.2
# when the applied function also receives the grouping column.
is_correct = (results_df['preds'] == results_df['class']).astype(float)
source_acc = is_correct.groupby(results_df['source']).mean().mean()

# Plain accuracy over all evaluated records.
acc = np.mean(y_pred == y_true)

In [None]:
# Summary report; recall, precision and F1 are unweighted means of the
# per-class values in micro_df.
summary_lines = [
    'Source Normalized Accuracy : {}'.format(source_acc),
    'Accuracy : {}'.format(acc),
    'Recall : {}'.format(micro_df['recall'].mean()),
    'Precision : {}'.format(micro_df['precision'].mean()),
    'F1 : {}'.format(micro_df['f1'].mean()),
]
for line in summary_lines:
    print(line)

Be sure to delete the endpoint after evaluation

In [None]:
# Remove the endpoint now that evaluation is finished. The endpoint
# configuration is removed with it (the SDK default, made explicit here).
predictor.delete_endpoint(delete_endpoint_config=True)