# Custom Entity Recognizer

### 1. Import libraries necessary for the notebook

In [None]:
from datetime import datetime
import boto3
import json

### 2. Identify your account number

In [None]:
# Look up the AWS account that owns the credentials this notebook runs with.
# The account id is reused below to build a bucket name unique to this account.
sts_client = boto3.client("sts")
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity["Account"]
print("Your account id is {}".format(account_id))

### 3. Create the bucket for the lab (should already exist from Lab1)

In [None]:
# Name of the lab bucket; the account id keeps it unique to this account.
bucket_name = "comprehend-labs" + account_id + "-2"
print("Bucket name used is " + bucket_name)
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

# creation_date is None when the bucket is not among the buckets owned by
# this account, i.e. it still has to be created.
if s3.Bucket(bucket_name).creation_date is None:
    # Outside us-east-1, S3 rejects create_bucket unless the target region is
    # passed as a LocationConstraint; us-east-1 must NOT be passed as one.
    bucket_region = s3_client.meta.region_name
    if bucket_region in (None, 'us-east-1', 'aws-global'):
        s3_client.create_bucket(Bucket=bucket_name)
    else:
        s3_client.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': bucket_region},
        )
    print("Created bucket " + bucket_name)
else:
    print("Bucket Exists")

### 4. Download the training data [entity list, docs], and the test data, then upload to the s3 bucket.

In [None]:
s3_client = boto3.client('s3')
s3_entity_prefix = 'entity-training'
host_name = 'http://d1fjxffqn7wkdo.cloudfront.net'
!wget {host_name}/aws-offerings.csv
response = s3_client.upload_file('./aws-offerings.csv', bucket_name, "{}/aws-offerings.csv".format(s3_entity_prefix))

!wget {host_name}/aws-offerings-docs.txt
response = s3_client.upload_file('./aws-offerings-docs.txt', bucket_name, "{}/aws-offerings-docs.txt".format(s3_entity_prefix))
 
!wget {host_name}/aws-offerings-test.txt
response = s3_client.upload_file('./aws-offerings-test.txt', bucket_name, "{}/aws-offerings-test.txt".format(s3_entity_prefix))

### 5. Let's take a look at the data

In [None]:
# Show the first 20 lines of the downloaded entity list.
!head -20 aws-offerings.csv

In [None]:
# Show the first 20 lines of the downloaded training documents.
!head -20 aws-offerings-docs.txt

### 6. Keep these outputs for the manual steps you're about to do. You can copy the outputs to a text doc locally (e.g., your laptop)

In [None]:
# Print the S3 locations of the uploaded files so they can be pasted into
# the console during the manual steps.
location_args = (bucket_name, s3_entity_prefix)
print("Entity List Location: s3://{}/{}/aws-offerings.csv".format(*location_args))
print("Training Documents Location: s3://{}/{}/aws-offerings-docs.txt".format(*location_args))
print("Test Documents Location: s3://{}/{}/aws-offerings-test.txt".format(*location_args))
print("Bucket Path: s3://{}".format(bucket_name))

# Now let's go back to the console and kick off the jobs manually

---

## For extra credit, here are the steps to continue doing this in code

### 1. Add IAM permissions to SageMaker
For SageMaker to kick off training jobs, it needs the ability to pass a role to the Comprehend service.
In the IAM console, add the following policy to the role that the SageMaker notebook created.

### 2. Get the ARN for the role we created in the first Lab

In [None]:
# This name should match the name of the role that was created in the first lab.
role_name_base = 'AmazonComprehendServiceRoleS3FullAccess-ComprehendLabs'
# If you added random numbers to the end of the 'ComprehendLabs' prefix, put them here.
prefix_random_numbers = ''

# Always append the suffix: an empty suffix leaves the base name unchanged,
# and a non-empty one must be part of the role name that is looked up.
role_name = "{}{}".format(role_name_base, prefix_random_numbers)

# Resolve the role name to its ARN; the ARN is what gets passed to Comprehend
# as DataAccessRoleArn in the following steps.
iam_client = boto3.client("iam")
response = iam_client.get_role(
    RoleName=role_name
)
comprehend_arn = response['Role']['Arn']
print("The ARN for the role is {}".format(comprehend_arn))

### 3. Start training job

In [None]:
comprehend_client = boto3.client("comprehend")

# Build a recognizer name from the current timestamp, then swap the spaces,
# colons and dots of the timestamp for dashes.
recognizer_name = "Recognizer-Name-Goes-Here-{}".format(datetime.now())
for unwanted_char in (' ', ':', '.'):
    recognizer_name = recognizer_name.replace(unwanted_char, '-')

# Training input: one custom entity type, the entity list and the training
# documents uploaded to the lab bucket earlier in the notebook.
entity_list_uri = "s3://{}/{}/aws-offerings.csv".format(bucket_name,s3_entity_prefix)
training_docs_uri = "s3://{}/{}/aws-offerings-docs.txt".format(bucket_name,s3_entity_prefix)
training_input_config = {
    "EntityTypes": [{'Type': "AWS_OFFERING"}],
    'EntityList': {'S3Uri': entity_list_uri},
    'Documents': {'S3Uri': training_docs_uri},
}

response = comprehend_client.create_entity_recognizer(
    RecognizerName=recognizer_name,
    LanguageCode="en",
    DataAccessRoleArn=comprehend_arn,
    InputDataConfig=training_input_config,
)

# Keep the ARN; the later cells use it to poll the training status.
recognizer_arn = response["EntityRecognizerArn"]
print("The ARN for the entity recognizer is {}".format(recognizer_arn))

### 4. Check the status of the training job

In [None]:
# Fetch the current description of the recognizer created above.
response = comprehend_client.describe_entity_recognizer(EntityRecognizerArn=recognizer_arn)
# Possible statuses for a custom entity recognizer:
# 'SUBMITTED'|'TRAINING'|'DELETING'|'STOP_REQUESTED'|'STOPPED'|'IN_ERROR'|'TRAINED'
recognizer_status = response['EntityRecognizerProperties']['Status']
print("The status of the custom entity recognizer is {}".format(recognizer_status))

### 5. Let's look at how the training did

In [None]:
# Re-read the recognizer description so the status check is up to date.
response = comprehend_client.describe_entity_recognizer(EntityRecognizerArn=recognizer_arn)
recognizer_properties = response['EntityRecognizerProperties']

if recognizer_properties['Status'] != 'TRAINED':
    print ("Training job has not completed yet. Please wait to check training performance until it has.")
else:
    # Pretty-print the metadata the service reports for the trained recognizer.
    print(json.dumps(recognizer_properties['RecognizerMetadata'], indent=2))

### 6. Start a batch entity recognition job

In [None]:
# Only a recognizer whose status is TRAINED is used to start the job, so
# re-check the status right before starting it.
response = comprehend_client.describe_entity_recognizer(
    EntityRecognizerArn=recognizer_arn
)
if response['EntityRecognizerProperties']['Status'] == 'TRAINED':
    # Run the custom recognizer over the test documents (one document per
    # line) and write the results under the "results/" prefix of the bucket.
    response = comprehend_client.start_entities_detection_job(
        JobName='AWS_OFFERING-001',
        EntityRecognizerArn=recognizer_arn,
        LanguageCode="en",
        DataAccessRoleArn=comprehend_arn,
        InputDataConfig={
            'S3Uri': "s3://{}/{}/aws-offerings-test.txt".format(bucket_name,s3_entity_prefix),
            'InputFormat': 'ONE_DOC_PER_LINE'
        },
        OutputDataConfig={
            'S3Uri': "s3://{}/{}/results/".format(bucket_name,s3_entity_prefix)
        }
    )
    # job_id is only assigned on this branch; the following cells need it,
    # so re-run this cell once training has finished.
    job_id = response['JobId']
else:
    print("Training job has not completed yet. Please wait to start batch entity recognition job until it has.")

### 7. Check the status of the batch entity recognition job

In [None]:
# Look up the batch job started in the previous step and report its status.
response = comprehend_client.describe_entities_detection_job(JobId=job_id)
job_status = response['EntitiesDetectionJobProperties']['JobStatus']
print("The status of the batch entity detection job is {}".format(job_status))

### 8. Download the output of the batch job

In [None]:
response = comprehend_client.describe_entities_detection_job(
 JobId=job_id
)
if response['EntitiesDetectionJobProperties']['JobStatus'] == "COMPLETED":
 output_s3_uri = response['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri']
 s3_key = output_s3_uri.replace("s3://{}/".format(bucket_name),'')
 s3.meta.client.download_file(bucket_name, s3_key, 'output.tar.gz')
 !tar zxvf output.tar.gz
else:
 print("Batch transformation job not complete. Please wait until this job is completed before attempting to view output.")

### 9. Let's review the test data and the output

In [None]:
response = comprehend_client.describe_entities_detection_job(
 JobId=job_id
)
if response['EntitiesDetectionJobProperties']['JobStatus'] == "COMPLETED":
 !head -20 aws-offerings-test.txt
else:
 print("Batch transformation job not complete. Please wait until this job is completed before attempting to view output.")

In [None]:
response = comprehend_client.describe_entities_detection_job(
 JobId=job_id
)
if response['EntitiesDetectionJobProperties']['JobStatus'] == "COMPLETED":
 !cat output
else:
 print("Batch transformation job not complete. Please wait until this job is completed before attempting to view output.")