{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create the Training datasets\n",
    "1. Upload your training datasets for annotation.  \n",
    "Here is an example CLI command that uploads images from the local *datasets* folder to **{S3_DATASET_BUCKET}**:  \n",
    "```aws s3 sync ./datasets/ s3://{S3_DATASET_BUCKET}/datasets/``` \n",
    "\n",
    "2. If you already have the manifest files, update the image file paths in the manifest file and upload it to S3.  \n",
    "Use the following commands to replace the placeholder **S3_BUCKET_NAME** with the real bucket name **{S3_DATASET_BUCKET}** and upload the manifest:  \n",
    "```sed -i -e \"s/S3_BUCKET_NAME/{S3_DATASET_BUCKET}/g\" ./output.manifest```  \n",
    "```aws s3 cp ./output.manifest s3://{S3_DATASET_BUCKET}/datasets/```\n",
    "\n",
    "3. Run the following notebook cells to create datasets in Amazon Rekognition from the uploaded manifest file.\n",
    "\n",
    "4. If no label manifest file is available, you can import the dataset images from the S3 bucket and label the training images by following the instructions in the Amazon Rekognition Custom Labels console. Detailed instructions can be found in this [document](https://docs.aws.amazon.com/rekognition/latest/customlabels-dg/creating-datasets.html).  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Upgrade the AWS SDK packages used below. %pip (rather than !pip) installs into\n",
    "# the environment of the running kernel. The package imported below is\n",
    "# `botocore`, so that is the name that must be installed.\n",
    "%pip install --upgrade boto3 botocore\n",
    "import boto3\n",
    "import argparse\n",
    "import logging\n",
    "import time\n",
    "import json\n",
    "from botocore.exceptions import ClientError\n",
    "\n",
    "# Module-level logger used by the helper functions defined in the cells below.\n",
    "logger = logging.getLogger(__name__)\n",
    "# Amazon Rekognition client created from the default boto3 configuration.\n",
    "rek_client = boto3.client('rekognition')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dataset(rek_client, project_arn, dataset_type, bucket, manifest_file):\n",
    "    \"\"\"\n",
    "    Creates an Amazon Rekognition Custom Labels dataset and waits until it is ready.\n",
    "\n",
    "    Starts dataset creation from a manifest file stored in S3, then polls\n",
    "    describe_dataset every 5 seconds while the status is CREATE_IN_PROGRESS.\n",
    "\n",
    "    :param rek_client: The Amazon Rekognition Custom Labels Boto3 client.\n",
    "    :param project_arn: The ARN of the project in which you want to create a dataset.\n",
    "    :param dataset_type: The type of the dataset that you want to create (train or test).\n",
    "        It is upper-cased before being sent to the service.\n",
    "    :param bucket: The S3 bucket that contains the manifest file.\n",
    "    :param manifest_file: The path and filename of the manifest file.\n",
    "    :return: The ARN of the created dataset.\n",
    "    :raises ClientError: If a Rekognition API call fails (logged, then re-raised).\n",
    "    :raises Exception: If creation ends in CREATE_FAILED or any other unexpected status.\n",
    "    \"\"\"\n",
    "\n",
    "    try:\n",
    "        # Create the dataset\n",
    "        logger.info(f\"Creating {dataset_type} dataset for project {project_arn}\")\n",
    "\n",
    "        dataset_type = dataset_type.upper()\n",
    "\n",
    "        # Build the request structure directly instead of concatenating and\n",
    "        # parsing a JSON string, so a bucket name or key containing quotes or\n",
    "        # backslashes cannot produce invalid JSON.\n",
    "        dataset_source = {\n",
    "            \"GroundTruthManifest\": {\n",
    "                \"S3Object\": {\"Bucket\": bucket, \"Name\": manifest_file}\n",
    "            }\n",
    "        }\n",
    "\n",
    "        response = rek_client.create_dataset(\n",
    "            ProjectArn=project_arn, DatasetType=dataset_type, DatasetSource=dataset_source\n",
    "        )\n",
    "\n",
    "        dataset_arn = response['DatasetArn']\n",
    "\n",
    "        logger.info(f\"dataset ARN: {dataset_arn}\")\n",
    "\n",
    "        # Poll until the dataset leaves the in-progress state.\n",
    "        while True:\n",
    "            dataset = rek_client.describe_dataset(DatasetArn=dataset_arn)\n",
    "            status = dataset['DatasetDescription']['Status']\n",
    "\n",
    "            if status == \"CREATE_IN_PROGRESS\":\n",
    "                logger.info(f\"Creating dataset: {dataset_arn} \")\n",
    "                time.sleep(5)\n",
    "                continue\n",
    "\n",
    "            if status == \"CREATE_COMPLETE\":\n",
    "                logger.info(f\"Dataset created: {dataset_arn}\")\n",
    "                return dataset_arn\n",
    "\n",
    "            # Any other status is terminal and treated as a failure.\n",
    "            # logger.error is used rather than logger.exception because no\n",
    "            # exception is being handled here, so there is no traceback to log.\n",
    "            if status == \"CREATE_FAILED\":\n",
    "                message = f\"Dataset creation failed: {status} : {dataset_arn}\"\n",
    "            else:\n",
    "                message = f\"Failed. Unexpected state for dataset creation: {status} : {dataset_arn}\"\n",
    "            logger.error(message)\n",
    "            raise Exception(message)\n",
    "\n",
    "    except ClientError as err:\n",
    "        logger.exception(f\"Couldn't create dataset: {err.response['Error']['Message']}\")\n",
    "        raise\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_model(rek_client, project_arn, version_name, bucket, manifest_file, output_folder, tag_key=None, tag_key_value=None):\n",
    "    \"\"\"\n",
    "    Trains an Amazon Rekognition Custom Labels model and waits for training to finish.\n",
    "\n",
    "    The manifest file is passed as the training data, TestingData is set to\n",
    "    AutoCreate, and the training output is written to the same bucket under\n",
    "    output_folder.\n",
    "\n",
    "    :param rek_client: The Amazon Rekognition Custom Labels Boto3 client.\n",
    "    :param project_arn: The ARN of the project in which you want to train a model.\n",
    "    :param version_name: A version for the model.\n",
    "    :param bucket: The S3 bucket that contains the manifest file and hosts the training output.\n",
    "    :param manifest_file: The path and filename of the training manifest file within bucket.\n",
    "    :param output_folder: The path for the training output within bucket.\n",
    "    :param tag_key: The name of a tag to attach to the model. Pass None to exclude.\n",
    "    :param tag_key_value: The value of the tag. Pass None to exclude.\n",
    "    :return: A tuple of (project version ARN, final status of the model version).\n",
    "    :raises ClientError: If a Rekognition API call fails (logged, then re-raised).\n",
    "    \"\"\"\n",
    "\n",
    "    try:\n",
    "        # Train the model\n",
    "        status = \"\"\n",
    "        logger.info(f\"training model version {version_name} for project {project_arn}\")\n",
    "\n",
    "        # Build the request structures as plain dicts instead of parsing\n",
    "        # concatenated JSON strings, so values containing quotes or backslashes\n",
    "        # cannot produce invalid JSON.\n",
    "        dataset_source = {\n",
    "            \"GroundTruthManifest\": {\n",
    "                \"S3Object\": {\"Bucket\": bucket, \"Name\": manifest_file}\n",
    "            }\n",
    "        }\n",
    "        output_config = {\"S3Bucket\": bucket, \"S3KeyPrefix\": output_folder}\n",
    "\n",
    "        # A tag is attached only when both the key and the value are supplied.\n",
    "        tags = {}\n",
    "        if tag_key is not None and tag_key_value is not None:\n",
    "            tags = {tag_key: tag_key_value}\n",
    "\n",
    "        response = rek_client.create_project_version(\n",
    "            ProjectArn=project_arn,\n",
    "            VersionName=version_name,\n",
    "            OutputConfig=output_config,\n",
    "            TrainingData={'Assets': [dataset_source]},\n",
    "            TestingData={'AutoCreate': True},\n",
    "            Tags=tags\n",
    "        )\n",
    "        project_version_arn = response['ProjectVersionArn']\n",
    "\n",
    "        logger.info(f\"Started training: {project_version_arn}\")\n",
    "\n",
    "        # Wait for the project version training to complete.\n",
    "        # NOTE(review): if the waiter ends without success it presumably raises its\n",
    "        # own error type, which the except clause below does not handle - confirm\n",
    "        # whether callers expect that.\n",
    "        training_waiter = rek_client.get_waiter('project_version_training_completed')\n",
    "        training_waiter.wait(ProjectArn=project_arn, VersionNames=[version_name])\n",
    "\n",
    "        # Get the completion status\n",
    "        describe_response = rek_client.describe_project_versions(\n",
    "            ProjectArn=project_arn, VersionNames=[version_name]\n",
    "        )\n",
    "        for model in describe_response['ProjectVersionDescriptions']:\n",
    "            status = model['Status']\n",
    "            logger.info(f\"Status: {status}\")\n",
    "            # Read the message defensively so a missing StatusMessage cannot\n",
    "            # abort an otherwise finished training run with a KeyError.\n",
    "            logger.info(f\"Message: {model.get('StatusMessage', '')}\")\n",
    "\n",
    "        logger.info(\"finished training\")\n",
    "\n",
    "        return project_version_arn, status\n",
    "\n",
    "    except ClientError as err:\n",
    "        logger.exception(f\"Couldn't train model: {err.response['Error']['Message']}\")\n",
    "        raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "project_name = 'MRE-workshop-project'\n",
    "# Placeholder: replace with the name of the S3 bucket that holds the uploaded datasets.\n",
    "DATA_BUCKET = 'S3_BUCKET_NAME'\n",
    "# S3 key of the manifest file. The upload command in the instructions above\n",
    "# copies output.manifest under the datasets/ prefix, so the key must match it.\n",
    "MANIFEST = 'datasets/output.manifest'\n",
    "# Create the Rekognition Custom Labels project and keep its ARN for the training cell.\n",
    "response = rek_client.create_project(ProjectName=project_name)\n",
    "project_arn = response['ProjectArn']\n",
    "# NOTE(review): dataset creation is left disabled here; train_model is given the\n",
    "# manifest directly as training data - confirm this is intended.\n",
    "#dataset_arn=create_dataset(rek_client, project_arn, 'TRAIN', DATA_BUCKET, MANIFEST)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Placeholders: set the model version name and the S3 key prefix for the training output.\n",
    "version_name = 'VERSION_NAME'\n",
    "OUTPUT = 'OUTPUT_FOLDER'\n",
    "\n",
    "# Start training and wait for it to finish; yields the model ARN and its final status.\n",
    "model_arn, status = train_model(\n",
    "    rek_client,\n",
    "    project_arn,\n",
    "    version_name,\n",
    "    bucket=DATA_BUCKET,\n",
    "    manifest_file=MANIFEST,\n",
    "    output_folder=OUTPUT,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Wait for training to finish\n",
    "When training is done, you will find the ARN for this model in the Amazon Rekognition Custom Labels console, and you can start or stop your model from there."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}