{ "cells": [ { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import os\n", "from collections import namedtuple\n", "from collections import defaultdict\n", "from collections import Counter\n", "from datetime import datetime\n", "import itertools\n", "import base64\n", "import glob\n", "import json\n", "import random\n", "import time\n", "import imageio\n", "import numpy as np\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import shutil\n", "from matplotlib.backends.backend_pdf import PdfPages\n", "from sklearn.metrics import confusion_matrix\n", "import boto3\n", "import botocore\n", "import sagemaker\n", "from urllib.parse import urlparse" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "role = sagemaker.get_execution_role()\n", "region = boto3.session.Session().region_name\n", "s3 = boto3.client('s3')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Job parameters" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "BUCKET=\"tanmcrae-greengrass-blog\"" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "bucket_region = s3.head_bucket(Bucket=BUCKET)['ResponseMetadata']['HTTPHeaders']['x-amz-bucket-region']\n", "assert bucket_region == region, \"Your S3 bucket {} and this notebook need to be in the same region.\".format(BUCKET)" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "blue-box-large-job-public\n" ] } ], "source": [ "MANIFEST = \"blue_box_large_job.json\"\n", "JOB_NAME = \"blue-box-large-job-public\"\n", "EXP_NAME = 'blue-box'\n", "print(JOB_NAME)\n" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [], "source": [ "USE_AUTO_LABELING = False\n", "RUN_FULL_AL_DEMO = False \n", "USE_PRIVATE_WORKFORCE = False" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "## specifying categories" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [], "source": [ "CLASS_NAME = \"storage box\"" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Label space is ['storage box']\n", "uploaded s3://tanmcrae-greengrass-blog/ground-truth/blue-box/class_labels.json\n" ] } ], "source": [ "CLASS_LIST = [CLASS_NAME]\n", "print(\"Label space is {}\".format(CLASS_LIST))\n", "\n", "json_body = {\n", " 'labels': [{'label': label} for label in CLASS_LIST]\n", "}\n", "with open('class_labels.json', 'w') as f:\n", " json.dump(json_body, f)\n", "\n", "LABEL_KEY = \"ground-truth/{}/class_labels.json\".format(EXP_NAME)\n", "s3.upload_file('class_labels.json', BUCKET, LABEL_KEY)\n", "print (\"uploaded s3://{}/{}\".format(BUCKET, LABEL_KEY))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create the instruction template\n" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "uploaded template to s3://tanmcrae-greengrass-blog/ground-truth/blue-box/instructions.template\n" ] } ], "source": [ "def make_template(test_template=False, save_fname='instructions.template'):\n", " template = r\"\"\"\n", "\n", "\n", " \n", " \n", "
    \n", "
  1. Inspect the image
  2. \n", "
  3. Determine whether any blue storage boxes are visible in the picture.
  4. \n", "
  5. Outline the storage box in the image using the provided “Box” tool.
  6. \n", "
\n", "\n", "

Good Example

\n", "

\n", "

\n", "

\n", "

\n", "

Bad Example

\n", "\n", "

The bounding boxes below are bad because they don't cover the entire box.

\n", "

\n", "

\n", "

The bounding boxes below are bad because they are not tight around the storage box.

\n", "

\n", "

\n", "

The labeling below is bad because it doesn't cover the full storage box.

\n", "

\n", "

\n", "\n", "
\n", "\n", " \n", "

Label every blue storage box in the picture. Boxes should fit tightly around the object. If a storage box extends past the edge of the image, label it up to the edge of the image. Do not label a storage box that cannot be seen at all.

\n", "

\n", "

\n", "

\n", "

\n", "


\n", "

Bad examples

\n", "

The bounding boxes below are bad because they don't cover the entire box.

\n", "

\n", "

\n", "

The bounding boxes below are bad because they are not tight around the storage box.

\n", "

\n", "

\n", "

The labeling below is bad because it only covers part of the storage box.

\n", "

\n", "

\n", "\n", "
\n", " \n", "
\n", " \"\"\"\n", " with open(save_fname, 'w') as f:\n", "     f.write(template)\n", " \n", "template_name = 'instructions.template'\n", "# make_template(test_template=True, save_fname='instructions.html')\n", "make_template(test_template=False, save_fname=template_name)\n", "# Key the template under EXP_NAME/: this is the location the labeling job's UiTemplateS3Uri points at.\n", "TEMPLATE_KEY = EXP_NAME + '/' + template_name\n", "s3.upload_file(template_name, BUCKET, TEMPLATE_KEY)\n", "\n", "# Report the key that was actually uploaded; the message must not add a prefix that is not part of the key.\n", "print(\"uploaded template to s3://{}/{}\".format(BUCKET, TEMPLATE_KEY))" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [], "source": [ "private_workteam_arn = \"arn:aws:sagemaker:us-east-1:854681337758:workteam/private-crowd/greengrass-blog\"\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create job" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "task_title: Draw a box around storage box in the picture\n", "JOB_NAME: blue-box-large-job-public\n" ] }, { "data": { "text/plain": [ "['image', 'object', 'detection', 'storage box']" ] }, "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "task_description = 'Dear Annotator, please draw a box around the yellow or blue storage box in the picture. 
Thank you!'\n", "task_keywords = ['image', 'object', 'detection', CLASS_NAME]\n", "task_title = 'Draw a box around storage box in the picture'\n", "\n", "print(\"task_title: {}\".format(task_title))\n", "print(\"JOB_NAME: {}\".format(JOB_NAME))\n", "task_keywords" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [], "source": [ "# Specify ARNs for resources needed to run an object detection job.\n", "# ac_arn_map supplies, per region, the account id used in the PRE-/ACS-BoundingBox Lambda ARNs below.\n", "ac_arn_map = {'us-west-2': '081040173940',\n", " 'us-east-1': '432418664414',\n", " 'us-east-2': '266458841044',\n", " 'eu-west-1': '568282634449',\n", " 'ap-northeast-1': '477331159723'}\n", "\n", "# Fail with an explanation instead of a bare KeyError when the notebook runs in a region that is not listed.\n", "assert region in ac_arn_map, \"Region {} has no entry in ac_arn_map; add the account id for the PRE-/ACS-BoundingBox Lambdas in that region.\".format(region)\n", "\n", "prehuman_arn = 'arn:aws:lambda:{}:{}:function:PRE-BoundingBox'.format(region, ac_arn_map[region])\n", "acs_arn = 'arn:aws:lambda:{}:{}:function:ACS-BoundingBox'.format(region, ac_arn_map[region])\n", "labeling_algorithm_specification_arn = 'arn:aws:sagemaker:{}:027400017018:labeling-job-algorithm-specification/object-detection'.format(region)\n", "public_workteam_arn = 'arn:aws:sagemaker:{}:394669845002:workteam/public-crowd/default'.format(region)" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "human_task_config = {\n", " \"AnnotationConsolidationConfig\": {\n", " \"AnnotationConsolidationLambdaArn\": acs_arn,\n", " },\n", " \"PreHumanTaskLambdaArn\": prehuman_arn,\n", " \"MaxConcurrentTaskCount\": 300, # Up to 300 images will be sent at a time to the workteam.\n", " \"NumberOfHumanWorkersPerDataObject\": 1, # We will obtain and consolidate just 1 human annotation for each image.\n", " \"TaskAvailabilityLifetimeInSeconds\": 43200, # 43200 s = 12 hours to complete all pending tasks (864000 s would give 10 days).\n", " \"TaskDescription\": task_description,\n", " \"TaskKeywords\": task_keywords,\n", " \"TaskTimeLimitInSeconds\": 600, # Each image must be labeled within 10 minutes.\n", " \"TaskTitle\": task_title,\n", " \"UiConfig\": {\n", " \"UiTemplateS3Uri\": 's3://{}/{}/{}'.format(BUCKET, EXP_NAME, 
template_name),\n", " }\n", " }" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [ "if not USE_PRIVATE_WORKFORCE:\n", " human_task_config[\"PublicWorkforceTaskPrice\"] = {\n", " \"AmountInUsd\": {\n", " \"Dollars\": 0,\n", " \"Cents\": 3,\n", " \"TenthFractionsOfACent\": 6,\n", " }\n", " } \n", " human_task_config[\"WorkteamArn\"] = public_workteam_arn\n", "else:\n", " human_task_config[\"WorkteamArn\"] = private_workteam_arn" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"AnnotationConsolidationConfig\": {\n", " \"AnnotationConsolidationLambdaArn\": \"arn:aws:lambda:us-east-1:432418664414:function:ACS-BoundingBox\"\n", " },\n", " \"PreHumanTaskLambdaArn\": \"arn:aws:lambda:us-east-1:432418664414:function:PRE-BoundingBox\",\n", " \"MaxConcurrentTaskCount\": 300,\n", " \"NumberOfHumanWorkersPerDataObject\": 1,\n", " \"TaskAvailabilityLifetimeInSeconds\": 43200,\n", " \"TaskDescription\": \"Dear Annotator, please draw a box around the yellow or blue storage box in the picture. 
Thank you!\",\n", " \"TaskKeywords\": [\n", " \"image\",\n", " \"object\",\n", " \"detection\",\n", " \"storage box\"\n", " ],\n", " \"TaskTimeLimitInSeconds\": 600,\n", " \"TaskTitle\": \"Draw a box around storage box in the picture\",\n", " \"UiConfig\": {\n", " \"UiTemplateS3Uri\": \"s3://tanmcrae-greengrass-blog/blue-box/instructions.template\"\n", " },\n", " \"PublicWorkforceTaskPrice\": {\n", " \"AmountInUsd\": {\n", " \"Dollars\": 0,\n", " \"Cents\": 3,\n", " \"TenthFractionsOfACent\": 6\n", " }\n", " },\n", " \"WorkteamArn\": \"arn:aws:sagemaker:us-east-1:394669845002:workteam/public-crowd/default\"\n", "}\n" ] } ], "source": [ "print(json.dumps (human_task_config, indent =2 ))" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [], "source": [ "ground_truth_request = {\n", " \"InputConfig\" : {\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"ManifestS3Uri\": 's3://{}/{}/{}'.format(BUCKET, 'manifests', MANIFEST),\n", " }\n", " },\n", " \"DataAttributes\": {\n", " \"ContentClassifiers\": [\n", " \"FreeOfPersonallyIdentifiableInformation\",\n", " \"FreeOfAdultContent\"\n", " ]\n", " }, \n", " },\n", " \"OutputConfig\" : {\n", " \"S3OutputPath\": 's3://{}/ground-truth-output/'.format(BUCKET),\n", " },\n", " \"HumanTaskConfig\" : human_task_config,\n", " \"LabelingJobName\": JOB_NAME,\n", " \"RoleArn\": role, \n", " \"LabelAttributeName\": \"bb\",\n", " \"LabelCategoryConfigS3Uri\": 's3://{}/{}'.format(BUCKET, LABEL_KEY),\n", " }\n", "\n", "\n", "if USE_AUTO_LABELING and RUN_FULL_AL_DEMO:\n", " ground_truth_request[ \"LabelingJobAlgorithmsConfig\"] = {\n", " \"LabelingJobAlgorithmSpecificationArn\": labeling_algorithm_specification_arn\n", " }" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"InputConfig\": {\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"ManifestS3Uri\": 
\"s3://tanmcrae-greengrass-blog/manifests/blue_box_large_job.json\"\n", " }\n", " },\n", " \"DataAttributes\": {\n", " \"ContentClassifiers\": [\n", " \"FreeOfPersonallyIdentifiableInformation\",\n", " \"FreeOfAdultContent\"\n", " ]\n", " }\n", " },\n", " \"OutputConfig\": {\n", " \"S3OutputPath\": \"s3://tanmcrae-greengrass-blog/ground-truth-output/\"\n", " },\n", " \"HumanTaskConfig\": {\n", " \"AnnotationConsolidationConfig\": {\n", " \"AnnotationConsolidationLambdaArn\": \"arn:aws:lambda:us-east-1:432418664414:function:ACS-BoundingBox\"\n", " },\n", " \"PreHumanTaskLambdaArn\": \"arn:aws:lambda:us-east-1:432418664414:function:PRE-BoundingBox\",\n", " \"MaxConcurrentTaskCount\": 300,\n", " \"NumberOfHumanWorkersPerDataObject\": 1,\n", " \"TaskAvailabilityLifetimeInSeconds\": 43200,\n", " \"TaskDescription\": \"Dear Annotator, please draw a box around the yellow or blue storage box in the picture. Thank you!\",\n", " \"TaskKeywords\": [\n", " \"image\",\n", " \"object\",\n", " \"detection\",\n", " \"storage box\"\n", " ],\n", " \"TaskTimeLimitInSeconds\": 600,\n", " \"TaskTitle\": \"Draw a box around storage box in the picture\",\n", " \"UiConfig\": {\n", " \"UiTemplateS3Uri\": \"s3://tanmcrae-greengrass-blog/blue-box/instructions.template\"\n", " },\n", " \"PublicWorkforceTaskPrice\": {\n", " \"AmountInUsd\": {\n", " \"Dollars\": 0,\n", " \"Cents\": 3,\n", " \"TenthFractionsOfACent\": 6\n", " }\n", " },\n", " \"WorkteamArn\": \"arn:aws:sagemaker:us-east-1:394669845002:workteam/public-crowd/default\"\n", " },\n", " \"LabelingJobName\": \"blue-box-large-job-public\",\n", " \"RoleArn\": \"arn:aws:iam::854681337758:role/service-role/AmazonSageMaker-ExecutionRole-20190521T132559\",\n", " \"LabelAttributeName\": \"bb\",\n", " \"LabelCategoryConfigS3Uri\": \"s3://tanmcrae-greengrass-blog/ground-truth/blue-box/class_labels.json\"\n", "}\n" ] } ], "source": [ "print(json.dumps (ground_truth_request, indent =2 ))" ] }, { "cell_type": "code", "execution_count": 124, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'LabelingJobArn': 'arn:aws:sagemaker:us-east-1:854681337758:labeling-job/blue-box-large-job-public',\n", " 'ResponseMetadata': {'RequestId': 'adfaed82-9c2b-4c45-ac5c-b06dfb20572d',\n", " 'HTTPStatusCode': 200,\n", " 'HTTPHeaders': {'x-amzn-requestid': 'adfaed82-9c2b-4c45-ac5c-b06dfb20572d',\n", " 'content-type': 'application/x-amz-json-1.1',\n", " 'content-length': '100',\n", " 'date': 'Tue, 21 May 2019 21:42:16 GMT'},\n", " 'RetryAttempts': 0}}" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sagemaker_client = boto3.client('sagemaker')\n", "sagemaker_client.create_labeling_job(**ground_truth_request)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## look at output manifest" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Completed 15.7 KiB/15.7 KiB (261.1 KiB/s) with 1 file(s) remaining\r", "download: s3://tanmcrae-greengrass-blog/ground-truth-output/yellow-box-small-job-public/manifests/output/output.manifest to ./yellow-box-small-job-public.output.manifest\r\n" ] } ], "source": [ "job_name = 'yellow-box-small-job-public'\n", "OUTPUT_MANIFEST = 's3://{}/ground-truth-output/{}/manifests/output/output.manifest'.format(BUCKET, job_name)\n", "\n", "output_file = job_name+'.output.manifest'\n", "!aws s3 cp {OUTPUT_MANIFEST} {output_file}" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "with open(output_file, 'r') as f:\n", " output = [json.loads(line.strip()) for line in f.readlines()]" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "32" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(output)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { 
"kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 1 }