# Custom classifier.

This notebook covers how to prepare a training dataset for a custom classifier in Amazon Comprehend leveraging the custom keywords that were generated from our word2vec model in the second notebook. 

We will build a custom sentiment classifier based on keywords semantically similar to the word "frustrated" and "awesome".



In [2]:
# library imports
import re
import numpy as np
import pandas as pd
import matplotlib
import csv
import boto3
import json
import time
import os
import datetime 

from sagemaker import get_execution_role
from sagemaker.session import Session

comprehend = boto3.client('comprehend')

# Specify S3 bucket and prefix that you want to use for model data
# Feel free to specify a different bucket here if you wish.
bucket = '<your-bucket>'
prefix = 'comprehend-custom-entity'

role = get_execution_role()
print(role)

arn:aws:iam::202860692096:role/service-role/AmazonSageMaker-ExecutionRole-20180529T141286
arn:aws:iam::202860692096:role/service-role/AmazonSageMaker-ExecutionRole-20180529T141286


In this example we will re-use the dataset that we wrangled and filtered for the telco domain. 

In [2]:
colnames=['text'] 
tweets = pd.read_csv('./data/tweet_telco.csv',encoding='utf-8',names=colnames, header=None)
print(tweets.shape)
tweets.head()

(32716, 1)


Unnamed: 0,text
0,@sprintcare is the worst customer service | @1...
1,@sprintcare is the worst customer service | @1...
2,@sprintcare is the worst customer service | @1...
3,@115714 y’all lie about your “great” connectio...
4,"@115714 whenever I contact customer support, t..."


<a id='data-wrangling'></a>

In order to create our dataset we need to label the dataset.

In order to find relevant records, we will be using our custom word2vec model to find semantically similar words to "frustrated" for negative tweets and "Awesome" for positive tweets. See the blazingtext_word2vec_telco_tweets.ipynb notebook for generating keywords.

In [4]:
tweets['match_negative']=tweets['text'].str.contains(r'(Really|cheated|annoyed|unhelpful|frustrated|upset|unhappy|angry|badly|bad|dissatisfied|disappointed|disgusted)', regex=True)


  if __name__ == '__main__':


In [5]:
tweets['match_positive']=tweets['text'].str.contains(r'(Awesome|AWESOME|Awesome!|Yay!|Hero|Whoop|#YouRock!|Super|Awww!)', regex=True)

  if __name__ == '__main__':



Let's add another column with our class label. This is required part of the Amazon Comprehend training dataset.

More information can be found here.

https://docs.aws.amazon.com/comprehend/latest/dg/cer-entity-list.html


In [6]:
tweets.loc[tweets['match_negative'] == True, 'label'] = 'NEGATIVE'
tweets.loc[tweets['match_positive'] == True, 'label'] = 'POSITIVE'

In [7]:
tweets.groupby('label').count()

Unnamed: 0_level_0,text,match_negative,match_positive
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NEGATIVE,1445,1445,1445
POSITIVE,254,254,254


Let's create our training and test file.

In [8]:
training_file = './data/negative_classifier_train.csv'
tweets.loc[tweets['label'].notnull(), ['label', 'text']].to_csv(training_file, encoding='utf-8', index=False)


In [9]:
def upload_to_s3(s3path, file):
    s3 = boto3.resource('s3')
    data = open(file, "rb")
    key = s3path
    s3.Bucket(bucket).put_object(Key=key, Body=data)

s3_train_key = prefix + "/train/negative_classifier_train.csv"

upload_to_s3(s3_train_key, training_file)

In [10]:
s3_train_data = 's3://{}/{}'.format(bucket, s3_train_key)
s3_output_job = 's3://{}/{}/{}'.format(bucket, prefix, 'output/train_job')
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://data-phi/comprehend-custom-entity/train/negative_classifier_train.csv


## Training our model

In [11]:

id = str(datetime.datetime.now().strftime("%s"))
training_job = comprehend.create_document_classifier(
    DocumentClassifierName='Custom-Negative-Classifier-'+ id,
    DataAccessRoleArn=role,
    InputDataConfig={
        'S3Uri': s3_train_data
    },
    OutputDataConfig={
        'S3Uri': s3_output_job
    },
    LanguageCode='en'
)

In [12]:
jobArn = training_job['DocumentClassifierArn']

max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_custom_classifier = comprehend.describe_document_classifier(
        DocumentClassifierArn = jobArn
    )
    status = describe_custom_classifier["DocumentClassifierProperties"]["Status"]
    print("Custom classifier: {}".format(status))
    
    if status == "TRAINED" or status == "IN_ERROR":
        break
        
    time.sleep(60)

Custom classifier: SUBMITTED
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINED


## Confusion matrix

In [40]:
#Retrieve the S3URI from the model output and create jobkey variable.
job_output = describe_custom_classifier["DocumentClassifierProperties"]["OutputDataConfig"]["S3Uri"]
path_prefix = 's3://{}/'.format(bucket)
job_key = os.path.relpath(job_output, path_prefix)

In [41]:
#Download the model metrics
s3 = boto3.resource('s3')
s3.Bucket(bucket).download_file(job_key, './output.tar.gz')

In [42]:
#Unpack the gzip file
!tar xvzf ./output.tar.gz

output/
output/confusion_matrix.json


In [43]:
import json

with open('./output/confusion_matrix.json') as json_file:
    data = json.load(json_file)
print(json.dumps(data, indent=2, default=str))

{
  "confusion_matrix": [
    [
      142,
      2
    ],
    [
      5,
      20
    ]
  ],
  "labels": [
    "NEGATIVE",
    "POSITIVE"
  ],
  "type": "multi_class",
  "all_labels": [
    "NEGATIVE",
    "POSITIVE",
    "label"
  ]
}


In [None]:
!pip install tabulate

In [51]:
from IPython.display import HTML, display
import tabulate
table = [["","NEGATIVE","POSITIVE","(Predicted)"],
         ["NEGATIVE",data['confusion_matrix'][0][0], data['confusion_matrix'][0][1]],
         ["POSTIVE",data['confusion_matrix'][1][0], data['confusion_matrix'][1][1]],
         ["(Actual)"]]
display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1,2,3
,NEGATIVE,POSITIVE,(Predicted)
NEGATIVE,142,2,
POSTIVE,5,20,
(Actual),,,


In [14]:
model_arn = describe_custom_classifier["DocumentClassifierProperties"]["DocumentClassifierArn"]
print(model_arn)

arn:aws:comprehend:us-east-1:202860692096:document-classifier/Custom-Negative-Classifier-1575245563


In [5]:
inference_endpoint_response = comprehend.create_endpoint(
    EndpointName='inference-endpoint',
    ModelArn = model_arn,
    DesiredInferenceUnits = 2
)


In [None]:
endpoint_arn = inference_endpoint_response["EndpointArn"]

In [10]:
describe_response = comprehend.describe_endpoint(
    EndpointArn = endpoint_arn
)

In [None]:
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_response = comprehend.describe_endpoint(
        EndpointArn = endpoint_arn
    )
    status = describe_response["EndpointProperties"]["Status"]
    print("Endpoint: {}".format(status))
    
    if status == "IN_SERVICE" or status == "IN_ERROR":
        break
        
    time.sleep(60)

In [17]:

txt = "Great arena terrible wireless coverage here in vegas. @TMobile why is coverage so bad? @TMobileArena real bad at circus circus. MGM just ok."

response = comprehend.classify_document(
    Text= txt,
    EndpointArn = endpoint_arn
)

In [None]:
print(json.dumps(response, indent=2, default=str))