{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import boto3\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "import time\n", "import json\n", "from sklearn import metrics\n", "from datetime import datetime\n", "from pytz import timezone\n", "\n", "s3=boto3.client('s3')\n", "rek=boto3.client('rekognition')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_bucket = \"PROVIDE_BUCKET_NAME\" #provide a bucket name to upload the training and test datasets\n", "region = boto3.session.Session().region_name\n", "\n", "os.environ[\"BUCKET\"] = data_bucket\n", "os.environ[\"REGION\"] = region\n", "\n", "#create s3 bucket\n", "if region=='us-east-1':\n", " !aws s3api create-bucket --bucket $BUCKET\n", "else:\n", " !aws s3api create-bucket --bucket $BUCKET --create-bucket-configuration LocationConstraint=$REGION" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Upload training and test images to S3 bucket:\n", "!aws s3 cp documents/train s3://{data_bucket}/train --recursive\n", "!aws s3 cp documents/test s3://{data_bucket}/test --recursive" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This function list all the items in the s3 bucket. The classes of the image data are retrived from the image key and the image." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_s3_bucket_items(bucket, prefix, start_after):\n", " list_items=[]\n", "\n", " paginator = s3.get_paginator('list_objects_v2')\n", " operation_parameters = {'Bucket': bucket,\n", " 'Prefix': prefix,\n", " 'StartAfter':start_after}\n", " page_iterator = paginator.paginate(**operation_parameters)\n", " for page in page_iterator:\n", " for item in page['Contents']:\n", " list_items.append(item['Key'])\n", " names=list(set([os.path.dirname(x)+'/' for x in list_items]))\n", " images=[x for x in list_items if x not in names and '.ipynb_checkpoints' not in x ]\n", " names=[x.replace(prefix,'').strip('/') for x in names if '.ipynb_checkpoints' not in x]\n", " return list_items, names, images" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Specify your data bucket. This block code list all your image metadata in your s3 bucket and extracts the class from the image key and stores the image key and class in a list variable" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "images=[]\n", "\n", "train_objects, names, train_images=get_s3_bucket_items(data_bucket, 'train', 'train/') \n", "images.append(train_images)\n", "\n", "test_objects, names, test_images=get_s3_bucket_items(data_bucket, 'test', 'test/') \n", "images.append(test_images)\n", "\n", "if type(images[0]) is list:\n", " images=[item for sublist in images for item in sublist]\n", "\n", "#listing image classes and last ten image keys\n", "names, images[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Code below creates a label key for the documents. Label key is derived from the S3 path name (i.e. folder in which the images are stored)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def label_ids(data_name):\n", " labels=[label for label in data_name]\n", " idx_to_lab={v: k for v, k in enumerate(labels)}\n", " lab_to_idx={k: v for v, k in enumerate(labels)}\n", " return idx_to_lab, lab_to_idx" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "idx2label,label2idx=label_ids(names)\n", " \n", "idx2label" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Code block below creates a function to write manifest files for images to pass to Amazon Rekognition Custom Labels. Change timezone as appropiate." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "eastern = timezone('US/Eastern')\n", "\n", "def labelling(bucket, key, name ,label2idx ):\n", " annotation={\n", " \"source-ref\": \"s3://\"+bucket+'/'+key,\n", " \"testdataset-classification_\"+name: label2idx[name],\n", " \"testdataset-classification_\"+name+\"-metadata\": {\n", " \"confidence\": 1,\n", "\n", " \"class-name\": name,\n", " \"human-annotated\": \"yes\",\n", " \"creation-date\": datetime.now(eastern).strftime(\"%Y-%m-%d\")+'T'+datetime.now(eastern).strftime(\"%H:%M:%S.%f\")[:-3],\n", " \"type\": \"groundtruth/image-classification\"\n", " }\n", " }\n", " return annotation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "manifest_train=[]\n", "manifest_test=[]\n", " \n", "for image in train_images: \n", " manifest_train.append(labelling(data_bucket, image, image.split('/')[-2], label2idx ))\n", "with open(os.getcwd()+'/manifest_train.txt', 'w', encoding=\"utf-8\") as f:\n", " for item in manifest_train:\n", " json.dump(item, f)\n", " f.write('\\n')\n", "s3.upload_file(os.getcwd()+'/manifest_train.txt',data_bucket, 'manifest_train.txt') \n", "\n", "for image in test_images: \n", " manifest_test.append(labelling(data_bucket, image, image.split('/')[-2], label2idx ))\n", "with open(os.getcwd()+'/manifest_test.txt', 'w', encoding=\"utf-8\") as f:\n", " for item in manifest_test:\n", " json.dump(item, f)\n", " f.write('\\n')\n", "s3.upload_file(os.getcwd()+'/manifest_test.txt',data_bucket, 'manifest_test.txt')\n", "\n", "#listing last five manifest entries\n", "manifest_train[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Code block below creates a function to apply appropriate permission to S3 bucket policy so Rekognition can access the images." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def attach_bucket_policy(bucket):\n", " bucket_policy = {\n", " \"Version\": \"2012-10-17\",\n", " \"Statement\": [\n", " {\n", " \"Sid\": \"AWSRekognitionS3AclBucketRead20191011\",\n", " \"Effect\": \"Allow\",\n", " \"Principal\": {\n", " \"Service\": \"rekognition.amazonaws.com\"\n", " },\n", " \"Action\": [\n", " \"s3:GetBucketAcl\",\n", " \"s3:GetBucketLocation\"\n", " ],\n", " \"Resource\": \"arn:aws:s3:::\"+bucket\n", " },\n", " {\n", " \"Sid\": \"AWSRekognitionS3GetBucket20191011\",\n", " \"Effect\": \"Allow\",\n", " \"Principal\": {\n", " \"Service\": \"rekognition.amazonaws.com\"\n", " },\n", " \"Action\": [\n", " \"s3:GetObject\",\n", " \"s3:GetObjectAcl\",\n", " \"s3:GetObjectVersion\",\n", " \"s3:GetObjectTagging\"\n", " ],\n", " \"Resource\": \"arn:aws:s3:::\"+bucket+\"/*\"\n", " },\n", " {\n", " \"Sid\": \"AWSRekognitionS3ACLBucketWrite20191011\",\n", " \"Effect\": \"Allow\",\n", " \"Principal\": {\n", " \"Service\": \"rekognition.amazonaws.com\"\n", " },\n", " \"Action\": \"s3:GetBucketAcl\",\n", " \"Resource\": \"arn:aws:s3:::\"+bucket\n", " },\n", " {\n", " \"Sid\": \"AWSRekognitionS3PutObject20191011\",\n", " \"Effect\": \"Allow\",\n", " \"Principal\": {\n", " \"Service\": \"rekognition.amazonaws.com\"\n", " },\n", " \"Action\": \"s3:PutObject\",\n", " \"Resource\": \"arn:aws:s3:::\"+bucket+\"/*\",\n", " \"Condition\": {\n", " \"StringEquals\": {\n", " \"s3:x-amz-acl\": \"bucket-owner-full-control\"\n", " }\n", " }\n", " }\n", " ]\n", " }\n", " \n", " # Convert the policy from JSON dict to string\n", " bucket_policy = json.dumps(bucket_policy)\n", "\n", " # Set the new policy\n", " s3.put_bucket_policy(Bucket=bucket, Policy=bucket_policy)\n", " return print(\"Bucket bolicy added to {}\".format(bucket))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "attach_bucket_policy(data_bucket)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "response_rekog = rek.create_project(\n", " ##Provide a rekognition custom labels project name\n", " ProjectName='Document-Classification-'+datetime.now().strftime(\"%S.%f\")[:-3]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def train_rekognition_with_test_data(arn,version_name,bucket,prefix,train_manifest,test_manifest):\n", " response = rek.create_project_version(\n", " ProjectArn=arn,\n", " VersionName=version_name,\n", " OutputConfig={\n", " 'S3Bucket': bucket,\n", " 'S3KeyPrefix': prefix\n", " },\n", " TrainingData={\n", " 'Assets': [\n", " {\n", " 'GroundTruthManifest': {\n", " 'S3Object': {\n", " 'Bucket': bucket,\n", " 'Name': train_manifest,\n", " \n", " }\n", " }\n", " },\n", " ]\n", " },\n", " TestingData={\n", " 'Assets': [\n", " {\n", " 'GroundTruthManifest': {\n", " 'S3Object': {\n", " 'Bucket': bucket,\n", " 'Name': test_manifest,\n", " \n", " }\n", " }\n", " },\n", " ],\n", " },\n", " \n", " )\n", " return response\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This code below trains a rekognition model. \n", "1. If your your dataset has a specified testset in the s3 bucket, the function \"train_rekognition_with_test_data\" is called which has a parameter for testdata manifest.\n", "2. If your dataset does not have a specified testset, the second function is called which tells rekognition to autocreate a test set automatically" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "version_name='v1' #model version\n", "prefix=version_name #S3 prefix where all model artifacts will be stored\n", "response_rekog_model=train_rekognition_with_test_data(\n", " response_rekog['ProjectArn'],version_name,data_bucket,prefix,'manifest_train.txt', 'manifest_test.txt'\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Evaluating Model Performance \n", "\n", "Rekognition results are processed and presented in this section.\n", "Code block below calls the \"describe_project_versions\" function to get the status of the training job and continues to wait till its completed. The training may take about 40-50 mins.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "response = None\n", "repeat = True\n", "status = ''\n", "submit_datetime = None\n", "end_datetime = None\n", "\n", "while True:\n", " res=rek.describe_project_versions(\n", " ProjectArn=response_rekog['ProjectArn'],\n", " VersionNames=[\n", " version_name,\n", " ]\n", " )\n", " \n", " status = res['ProjectVersionDescriptions'][0]['Status']\n", " submit_datetime = res['ProjectVersionDescriptions'][0]['CreationTimestamp']\n", " \n", " if status not in ['TRAINING_COMPLETED']:\n", " end_datetime = datetime.now(eastern)\n", " if repeat:\n", " print('.', end = '')\n", " time.sleep(20)\n", " else:\n", " end_datetime = res['ProjectVersionDescriptions'][0]['TrainingEndTimestamp']\n", " break\n", " \n", "print('Job status: ' + status)\n", "print('Elasped time: {}'.format(end_datetime - submit_datetime))\n", "\n", "if status == 'TRAINING_COMPLETED':\n", " buckets = res['ProjectVersionDescriptions'][0]['EvaluationResult']['Summary']['S3Object'][\"Bucket\"]\n", " keys = res['ProjectVersionDescriptions'][0]['EvaluationResult']['Summary']['S3Object'][\"Name\"]\n", " \n", " print('bucket:{}, key:{}'.format(buckets,keys))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#### Download prediction summary output from rekognition\n", "\n", "s3.download_file(buckets,keys , os.getcwd()+'/rekog_output.json')\n", "confusion_matrix_file = 'rekog_output.json'\n", "data = None\n", "with open(confusion_matrix_file) as f:\n", " data = json.load(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "label=[]\n", "f1,precision,recall,num_images=[],[],[],[]\n", "for item in data['LabelEvaluationResults']:\n", " label.append(item['Label'])\n", " num_images.append(item['NumberOfTestingImages'])\n", " recall.append(item['Metrics']['Recall'])\n", " f1.append(item['Metrics']['F1Score'])\n", " precision.append(item['Metrics']['Precision'])\n", "label.append(\"MEAN\")\n", "num_images.append(sum(num_images))\n", "for e in [f1,recall,precision]:\n", " e.append(np.mean(e))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "result_rekog = pd.DataFrame({'label':label, 'f1':f1,'precision':precision,'recall':recall, 'number of images':num_images})\n", "result_rekog" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Start the model\n", "project_arn=response_rekog_model['ProjectVersionArn']\n", "min_inference_units=\"1\"\n", "\n", "\n", "os.environ[\"PROJECT_ARN\"] = project_arn\n", "os.environ[\"MIN_INFERENCE_UNITS\"] = min_inference_units\n", "\n", "!aws rekognition start-project-version \\\n", " --project-version-arn $PROJECT_ARN \\\n", " --min-inference-units $MIN_INFERENCE_UNITS \\\n", " --region $REGION" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Copy the project version ARN to provide with CloudFormation Template\n", "\n", "print (response_rekog_model['ProjectVersionArn'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }