In [None]:
import boto3
import pandas as pd
import numpy as np
import os
import time
import json
from sklearn import metrics
from datetime import datetime
from pytz import timezone

# AWS service clients shared by the cells below.
s3 = boto3.client("s3")
rek = boto3.client("rekognition")

In [None]:
# Name of the S3 bucket used for the training and test datasets.
data_bucket = "PROVIDE_BUCKET_NAME" #provide a bucket name to upload the training and test datasets
# Region configured for the default boto3 session.
# NOTE(review): region_name can be None when no default region is configured;
# the os.environ assignment below would then fail -- confirm a region is set.
region = boto3.session.Session().region_name

# Export both values so the `!aws` shell commands can read them as $BUCKET / $REGION.
os.environ["BUCKET"] = data_bucket
os.environ["REGION"] = region

#create s3 bucket
# us-east-1 is created without a LocationConstraint; every other region passes $REGION as the constraint.
if region=='us-east-1':
 !aws s3api create-bucket --bucket $BUCKET
else:
 !aws s3api create-bucket --bucket $BUCKET --create-bucket-configuration LocationConstraint=$REGION

In [None]:
# Upload training and test images to S3 bucket:
# Each command recursively copies a local folder under documents/ to the
# matching prefix (train or test) of the data bucket.
!aws s3 cp documents/train s3://{data_bucket}/train --recursive
!aws s3 cp documents/test s3://{data_bucket}/test --recursive

This function lists all the items in the S3 bucket under a given prefix. It returns every object key, the class names (retrieved from the folder part of each image key), and the image keys.

In [None]:
def get_s3_bucket_items(bucket, prefix, start_after):
    """List the objects under an S3 prefix and derive class names from their folders.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket to list.
    prefix : str
        Key prefix to list, for example ``'train'``.
    start_after : str
        Key after which the listing starts (passed to S3 as ``StartAfter``).

    Returns
    -------
    tuple of (list, list, list)
        ``(list_items, names, images)``: every key returned by S3, the sorted
        class names (folder path relative to ``prefix``), and the keys that are
        neither folder entries nor inside ``.ipynb_checkpoints``.
    """
    list_items = []

    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix, StartAfter=start_after)
    for page in page_iterator:
        # A page has no 'Contents' entry when nothing matches the prefix.
        for item in page.get('Contents', []):
            list_items.append(item['Key'])

    # Parent "folder" of every key, with a trailing slash. Sorted so the class
    # order (and the label ids derived from it) is the same on every run.
    folders = sorted({os.path.dirname(key) + '/' for key in list_items})
    folder_set = set(folders)

    images = [key for key in list_items
              if key not in folder_set and '.ipynb_checkpoints' not in key]

    names = []
    for folder in folders:
        if '.ipynb_checkpoints' in folder:
            continue
        # Remove only the leading prefix: str.replace would also cut the prefix
        # text out of a class name (e.g. 'train' inside 'training_docs').
        relative = folder[len(prefix):] if folder.startswith(prefix) else folder
        names.append(relative.strip('/'))
    return list_items, names, images

Specify your data bucket. This code block lists all the image objects in your S3 bucket, extracts the class from each image key, and stores the image keys and classes in list variables.

In [None]:
# List both dataset splits. The class names of each split are kept separately
# so the second call does not silently replace the result of the first.
train_objects, train_names, train_images = get_s3_bucket_items(data_bucket, 'train', 'train/')
test_objects, test_names, test_images = get_s3_bucket_items(data_bucket, 'test', 'test/')

# Label ids are built from `names`, so it must cover the classes of BOTH splits
# (a class present in only one split would otherwise have no id). Sorting keeps
# the ids stable between runs.
names = sorted(set(train_names) | set(test_names))

# Flat list of every image key: training images first, then test images.
images = train_images + test_images

#listing image classes and first ten image keys
names, images[:10]

Code below creates a label key for the documents. Label key is derived from the S3 path name (i.e. folder in which the images are stored)

In [None]:
def label_ids(data_name):
    """Build the lookup tables between class labels and integer ids.

    Parameters
    ----------
    data_name : iterable
        Class labels; the position of a label becomes its id.

    Returns
    -------
    tuple of (dict, dict)
        ``(idx_to_lab, lab_to_idx)``: id -> label and label -> id. When a
        label occurs more than once, ``lab_to_idx`` keeps its last position.
    """
    labels = list(data_name)
    idx_to_lab = dict(enumerate(labels))
    lab_to_idx = {}
    for position, label in enumerate(labels):
        lab_to_idx[label] = position
    return idx_to_lab, lab_to_idx

In [None]:
# Build the id -> label and label -> id lookup tables from the class names found in S3.
idx2label,label2idx=label_ids(names)
 
# Display the id -> label mapping.
idx2label

The code block below creates a function that builds the manifest entries for the images to pass to Amazon Rekognition Custom Labels. Change the timezone as appropriate.

In [None]:
# Timezone used for the manifest "creation-date" stamps and for the elapsed
# training time; change it to your own zone if needed.
eastern = timezone('US/Eastern')

def labelling(bucket, key, name, label2idx):
    """Build one image-classification manifest entry for an image stored in S3.

    Parameters
    ----------
    bucket : str
        Bucket that holds the image.
    key : str
        Object key of the image.
    name : str
        Class name of the image.
    label2idx : dict
        Mapping from class name to integer class id.

    Returns
    -------
    dict
        Manifest entry with the ``source-ref`` URI, the label attribute
        ``testdataset-classification_<name>`` and its ``-metadata`` block.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``label2idx``.
    """
    attribute = "testdataset-classification_" + name
    class_id = label2idx[name]
    # Read the clock once: formatting two separate now() calls could combine
    # the date of one instant with the time of another (e.g. across midnight).
    created = datetime.now(eastern).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]
    annotation = {
        "source-ref": "s3://" + bucket + '/' + key,
        attribute: class_id,
        attribute + "-metadata": {
            "confidence": 1,
            "class-name": name,
            "human-annotated": "yes",
            "creation-date": created,  # millisecond precision
            "type": "groundtruth/image-classification"
        }
    }
    return annotation

In [None]:
def _write_and_upload_manifest(image_keys, file_name):
    """Build, save and upload the manifest of one dataset split.

    Parameters
    ----------
    image_keys : list of str
        S3 keys of the images in the split.
    file_name : str
        Name of the manifest file; used both for the local file in the
        current working directory and for the S3 object key.

    Returns
    -------
    list of dict
        The manifest entries, one per image.
    """
    # The class of an image is the folder that directly contains it.
    entries = [
        labelling(data_bucket, image, image.split('/')[-2], label2idx)
        for image in image_keys
    ]
    local_path = os.path.join(os.getcwd(), file_name)
    with open(local_path, 'w', encoding="utf-8") as f:
        for entry in entries:
            # One JSON object per line.
            json.dump(entry, f)
            f.write('\n')
    s3.upload_file(local_path, data_bucket, file_name)
    return entries


manifest_train = _write_and_upload_manifest(train_images, 'manifest_train.txt')
manifest_test = _write_and_upload_manifest(test_images, 'manifest_test.txt')

#listing first five manifest entries
manifest_train[:5]

The code block below creates a function that attaches a bucket policy to the S3 bucket so that Rekognition can access the images.

In [None]:
def attach_bucket_policy(bucket):
    """Attach the bucket policy that lets Amazon Rekognition use ``bucket``.

    The policy allows the ``rekognition.amazonaws.com`` service principal to
    read the bucket ACL and location, to read objects, and to write objects
    with the ``bucket-owner-full-control`` ACL.

    Parameters
    ----------
    bucket : str
        Name of the bucket the policy is set on.

    Returns
    -------
    None
        A confirmation message is printed once the policy has been set.
    """
    bucket_arn = "arn:aws:s3:::" + bucket
    objects_arn = bucket_arn + "/*"
    rekognition = {"Service": "rekognition.amazonaws.com"}

    bucket_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "AWSRekognitionS3AclBucketRead20191011",
                "Effect": "Allow",
                "Principal": rekognition,
                "Action": [
                    "s3:GetBucketAcl",
                    "s3:GetBucketLocation"
                ],
                "Resource": bucket_arn
            },
            {
                "Sid": "AWSRekognitionS3GetBucket20191011",
                "Effect": "Allow",
                "Principal": rekognition,
                "Action": [
                    "s3:GetObject",
                    "s3:GetObjectAcl",
                    "s3:GetObjectVersion",
                    "s3:GetObjectTagging"
                ],
                "Resource": objects_arn
            },
            {
                "Sid": "AWSRekognitionS3ACLBucketWrite20191011",
                "Effect": "Allow",
                "Principal": rekognition,
                "Action": "s3:GetBucketAcl",
                "Resource": bucket_arn
            },
            {
                "Sid": "AWSRekognitionS3PutObject20191011",
                "Effect": "Allow",
                "Principal": rekognition,
                "Action": "s3:PutObject",
                "Resource": objects_arn,
                # Uploads are only accepted with the bucket-owner-full-control ACL.
                "Condition": {
                    "StringEquals": {
                        "s3:x-amz-acl": "bucket-owner-full-control"
                    }
                }
            }
        ]
    }

    # The S3 API expects the policy as a JSON string, not a dict.
    s3.put_bucket_policy(Bucket=bucket, Policy=json.dumps(bucket_policy))
    print("Bucket policy added to {}".format(bucket))

In [None]:
# Attach the Rekognition access policy to the data bucket.
attach_bucket_policy(data_bucket)

In [None]:
# Create the Rekognition Custom Labels project; the ProjectArn in the response
# is used by the training and status cells below.
response_rekog = rek.create_project(
 ##Provide a rekognition custom labels project name
 # The suffix is the current seconds and milliseconds ("SS.mmm") -- presumably
 # to keep the name distinct between runs.
 ProjectName='Document-Classification-'+datetime.now().strftime("%S.%f")[:-3]
)

In [None]:
def train_rekognition_with_test_data(arn,version_name,bucket,prefix,train_manifest,test_manifest):
    """Start training a project version from a training and a test manifest.

    Parameters
    ----------
    arn : str
        ARN of the Rekognition project.
    version_name : str
        Name given to the new project version.
    bucket : str
        Bucket that holds both manifests and receives the training output.
    prefix : str
        S3 key prefix for the training output.
    train_manifest : str
        Object key of the training manifest in ``bucket``.
    test_manifest : str
        Object key of the test manifest in ``bucket``.

    Returns
    -------
    dict
        The response of ``create_project_version``.
    """
    def manifest_assets(manifest_key):
        # Dataset description pointing at a single manifest file in `bucket`.
        s3_object = {'Bucket': bucket, 'Name': manifest_key}
        return {'Assets': [{'GroundTruthManifest': {'S3Object': s3_object}}]}

    output_config = {'S3Bucket': bucket, 'S3KeyPrefix': prefix}
    return rek.create_project_version(
        ProjectArn=arn,
        VersionName=version_name,
        OutputConfig=output_config,
        TrainingData=manifest_assets(train_manifest),
        TestingData=manifest_assets(test_manifest),
    )
 

The code below trains a Rekognition model.
1. If your dataset has a dedicated test set in the S3 bucket, the function "train_rekognition_with_test_data" is called, which takes a parameter for the test-data manifest.
2. If your dataset does not have a dedicated test set, a second function is called, which tells Rekognition to create a test set automatically.

In [None]:
# Model version; the same value is used as the S3 prefix under which all
# model artifacts will be stored.
version_name = 'v1'
prefix = version_name

response_rekog_model = train_rekognition_with_test_data(
    arn=response_rekog['ProjectArn'],
    version_name=version_name,
    bucket=data_bucket,
    prefix=prefix,
    train_manifest='manifest_train.txt',
    test_manifest='manifest_test.txt',
)

### Evaluating Model Performance 

Rekognition results are processed and presented in this section.
The code block below calls the "describe_project_versions" function to get the status of the training job and keeps waiting until it has completed. The training may take about 40-50 minutes.


In [None]:
# Seconds to wait between two status checks.
POLL_INTERVAL_SECONDS = 20

status = ''
submit_datetime = None
end_datetime = None

while True:
    res = rek.describe_project_versions(
        ProjectArn=response_rekog['ProjectArn'],
        VersionNames=[version_name],
    )
    version_description = res['ProjectVersionDescriptions'][0]
    status = version_description['Status']
    submit_datetime = version_description['CreationTimestamp']

    # Stop on ANY status other than "in progress": waiting only for
    # TRAINING_COMPLETED would poll forever after a failed training.
    if status != 'TRAINING_IN_PROGRESS':
        break
    print('.', end='')
    time.sleep(POLL_INTERVAL_SECONDS)

# Fall back to the current time when the service reports no training end time.
end_datetime = version_description.get('TrainingEndTimestamp', datetime.now(eastern))

print('Job status: ' + status)
print('Elapsed time: {}'.format(end_datetime - submit_datetime))

# Location of the evaluation summary that the next cell downloads.
evaluation_summary = (
    version_description.get('EvaluationResult', {}).get('Summary', {}).get('S3Object')
)
if evaluation_summary is None:
    # Fail here with the service's own message instead of a NameError on
    # `buckets` / `keys` in the next cell.
    raise RuntimeError(
        'No evaluation summary available (status: {}). {}'.format(
            status, version_description.get('StatusMessage', '')
        )
    )

buckets = evaluation_summary['Bucket']
keys = evaluation_summary['Name']

print('bucket:{}, key:{}'.format(buckets, keys))

In [None]:
#### Download prediction summary output from rekognition

# Save the evaluation summary into the current working directory ...
s3.download_file(buckets, keys, os.getcwd()+'/rekog_output.json')

# ... and parse it from there (the relative name resolves to the same file).
confusion_matrix_file = 'rekog_output.json'
data = None
with open(confusion_matrix_file) as f:
    data = json.load(f)

In [None]:
# Per-label evaluation results from the summary file.
label_results = data['LabelEvaluationResults']

label = [entry['Label'] for entry in label_results]
num_images = [entry['NumberOfTestingImages'] for entry in label_results]
recall = [entry['Metrics']['Recall'] for entry in label_results]
f1 = [entry['Metrics']['F1Score'] for entry in label_results]
precision = [entry['Metrics']['Precision'] for entry in label_results]

# Summary row: the metrics are averaged over the labels, while the image
# count is the total number of test images.
label.append("MEAN")
num_images.append(sum(num_images))
f1.append(np.mean(f1))
recall.append(np.mean(recall))
precision.append(np.mean(precision))

In [None]:
# Metrics table: one row per entry of the lists built in the previous cell.
result_rekog = pd.DataFrame(
    {
        'label': label,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'number of images': num_images,
    }
)
result_rekog

In [None]:
#Start the model
# NOTE: despite its name, project_arn holds the project *version* ARN returned by the training call.
project_arn=response_rekog_model['ProjectVersionArn']
# Kept as a string because it is exported as an environment variable below.
min_inference_units="1"


# Export the values so the shell command can read them as $PROJECT_ARN / $MIN_INFERENCE_UNITS.
os.environ["PROJECT_ARN"] = project_arn
os.environ["MIN_INFERENCE_UNITS"] = min_inference_units

# $REGION is read from the environment too; it is set via os.environ["REGION"] in the bucket setup cell.
!aws rekognition start-project-version \
 --project-version-arn $PROJECT_ARN \
 --min-inference-units $MIN_INFERENCE_UNITS \
 --region $REGION

In [None]:
#Copy the project version ARN to provide with CloudFormation Template
# print() is used so the ARN is shown without quotes and can be copied as-is.
print(response_rekog_model['ProjectVersionArn'])