# Label your dataset with Amazon SageMaker Ground Truth and SageMaker Processing jobs

In [None]:
import boto3
import json
import numpy
import os
import sagemaker

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingOutput

# AWS region of the default boto3 session; later passed to the processing job.
region = boto3.Session().region_name

# SageMaker SDK session and the execution role used to launch the processing job.
sm_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# boto3 handles: SageMaker client for labeling-job lookups, S3 resource for downloads.
sm_client = boto3.client("sagemaker")
s3_resource = boto3.resource("s3")

### Create a labeling job in Amazon SageMaker Ground Truth

To train a custom YOLOv5 model, you first need to label your custom dataset. You can label an object detection dataset with Amazon SageMaker Ground Truth.

| ⚠️ WARNING: If you have already labeled an object detection dataset with Amazon SageMaker Ground Truth, you can skip to the "**Get Job Details and Labels**" section. |
| -- |

#### Create a Labeling Workforce

Follow the steps in the SageMaker Ground Truth documentation here: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-workforce-create-private-console.html#create-workforce-labeling-job


#### Create your bounding box labeling job

Follow the steps in the SageMaker Ground Truth documentation here: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-create-labeling-job-console.html

If using the AWS Console, you should create a labeling job with the following options:

1. Job name: Set any unique name for the job name, for example "Object-Detection-Example".
2. Leave the "I want to specify a label attribute..." option un-checked.
3. Input data setup: Pick "Automated data setup".
4. Input dataset location: Copy and paste the location of the single folder with your images in S3. Example: "s3://mybucket/raw_images".
5. Output dataset location: Choose "Same location as input dataset".
6. Data type: Choose "Image".
7. IAM Role: Create a new role and give access to the S3 bucket where your images are located, or any S3 bucket.
8. Now hit "Complete data setup" and wait for it to be ready.
9. Task category: Choose "Image" and select "Bounding box", then hit "Next".
10. Worker types: Select "Private" and choose your team for the "Private teams" option.
11. For the Bounding box labeling tool: Enter a description and instructions, and for the "Labels" section add the relevant labels for your job. 
12. Finally choose "Create".

### Get Job Details and Labels

Once you have finished labeling your images, let's retrieve the information we need to create our processing job, which will create the dataset in the format YOLOv5 expects.

In [None]:
# Name of the Ground Truth labeling job whose details are looked up below.
# Replace this value with the name you used for your labeling job.
groundtruth_job_name = "Object-Detection-Example"

In [None]:
# Ask SageMaker for the description of the labeling job named above.
response = sm_client.describe_labeling_job(LabelingJobName=groundtruth_job_name)

# Keep the two fields used later: the job status and the S3 URI of the
# label category configuration file.
labelingJobStatus = response["LabelingJobStatus"]
labelsListUri = response["LabelCategoryConfigS3Uri"]

print("Job Status: ", labelingJobStatus)
print("Labels Uri: ", labelsListUri)

### Get labels

We need to retrieve the list of labels from the labeling job's label category configuration file, which is stored in S3.

In [None]:
def split_s3_path(s3_path):
    """Split an S3 URI into its bucket name and object key.

    Args:
        s3_path: An S3 location such as ``s3://bucket/path/to/object``.
            The ``s3://`` scheme is optional.

    Returns:
        A ``(bucket, key)`` tuple. ``key`` is an empty string when the
        path names only a bucket.
    """
    scheme = "s3://"
    # Strip the scheme only when it is a prefix: a blanket str.replace would
    # also delete any "s3://" that happens to appear inside the object key.
    if s3_path.startswith(scheme):
        s3_path = s3_path[len(scheme):]
    # Everything before the first "/" is the bucket; the rest is the key.
    bucket, _, key = s3_path.partition("/")
    return bucket, key

def get_labels_list(labels_uri):
    """Return the label names stored in a JSON label file in S3.

    The object at ``labels_uri`` is downloaded to ``labels.json`` in the
    current working directory and parsed as JSON.

    Args:
        labels_uri: S3 URI of the JSON file. The file must contain a
            top-level ``"labels"`` list whose entries each have a
            ``"label"`` key.

    Returns:
        A list with the ``"label"`` value of every entry, in file order.

    Raises:
        KeyError: If the JSON document lacks the ``"labels"`` list or an
            entry lacks the ``"label"`` key.
    """
    bucket, key = split_s3_path(labels_uri)
    s3_resource.meta.client.download_file(bucket, key, 'labels.json')
    # JSON text is UTF-8; do not depend on the locale's default encoding,
    # which would garble non-ASCII label names on some platforms.
    with open('labels.json', encoding='utf-8') as f:
        data = json.load(f)
    return [label["label"] for label in data["labels"]]

In [None]:
# Download the label file referenced by the labeling job and list its label names.
labels = get_labels_list(labelsListUri)
print("Labels: ",labels)

### Create a SageMaker Processing Job

In [None]:
# Configure the scikit-learn processor that will run the dataset conversion.
# The Ground Truth job name and region are handed to the job through `env`;
# presumably code/preprocessing.py reads them -- verify against that script.
sklearn_processor = SKLearnProcessor(
    base_job_name="yolov5-process",
    framework_version="1.0-1",
    role=role,
    sagemaker_session=sm_session,
    instance_type="ml.c5.xlarge",
    instance_count=1,
    env={"gt_job_name": groundtruth_job_name, "region": region},
)

In [None]:
# Launch the processing job. It declares a single output named "train",
# sourced from /opt/ml/processing/output/train inside the container.
sklearn_processor.run(
    code="code/preprocessing.py",
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output/train",
            output_name="train",
        ),
    ],
)

In [None]:
# S3 URI of the first output (index 0) reported by the last job in the
# processor's job list; the run above declares only the "train" output.
dataset_s3_uri = sklearn_processor.jobs[-1].describe()["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]

| ⚠️ NOTE: The next cell prints the details you will need to train your models, based on the labeling job you completed. |
| -- |

In [None]:
# Summary of the two values produced by this notebook: the S3 location of the
# processed dataset and the list of label names.
print("Dataset S3 location: ", dataset_s3_uri)
print("Labels: ", labels)