{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Custom Entity Recognizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.  Import libraries necessary for the notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "import boto3\n",
    "import json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.  Identify your account number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sts_client = boto3.client(\"sts\")\n",
    "account_id = sts_client.get_caller_identity()[\"Account\"]\n",
    "print(\"Your account id is {}\".format(account_id))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.  Create the bucket for the lab (should already exist from Lab1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bucket_name = \"comprehend-labs\" + account_id +  \"-2\"\n",
    "print (\"Bucket name used is \" + bucket_name)\n",
    "s3 = boto3.resource('s3')\n",
    "s3_client = boto3.client('s3')\n",
    "\n",
    "if (s3.Bucket(bucket_name).creation_date is None):\n",
    "    s3_client.create_bucket(Bucket=bucket_name)\n",
    "    print (\"Created bucket \" + bucket_name)\n",
    "else:\n",
    "    print (\"Bucket Exists\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.  Download the training data [entity list, docs], and the test data, then upload to the s3 bucket."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s3_client = boto3.client('s3')\n",
    "s3_entity_prefix = 'entity-training'\n",
    "host_name = 'http://d1fjxffqn7wkdo.cloudfront.net'\n",
    "!wget {host_name}/aws-offerings.csv\n",
    "response = s3_client.upload_file('./aws-offerings.csv', bucket_name, \"{}/aws-offerings.csv\".format(s3_entity_prefix))\n",
    "\n",
    "!wget {host_name}/aws-offerings-docs.txt\n",
    "response = s3_client.upload_file('./aws-offerings-docs.txt', bucket_name, \"{}/aws-offerings-docs.txt\".format(s3_entity_prefix))\n",
    "        \n",
    "!wget {host_name}/aws-offerings-test.txt\n",
    "response = s3_client.upload_file('./aws-offerings-test.txt', bucket_name, \"{}/aws-offerings-test.txt\".format(s3_entity_prefix))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.  Let's take a look at the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -20 aws-offerings.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -20 aws-offerings-docs.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.  Keep these outputs for the manual steps you're about to do. You can copy the outputs to a text doc locally (e.g., your laptop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Entity List Location:  s3://{}/{}/aws-offerings.csv\".format(bucket_name,s3_entity_prefix))\n",
    "print(\"Training Documents Location:  s3://{}/{}/aws-offerings-docs.txt\".format(bucket_name,s3_entity_prefix))\n",
    "print(\"Test Documents Location:  s3://{}/{}/aws-offerings-test.txt\".format(bucket_name,s3_entity_prefix))\n",
    "print(\"Bucket Path:  s3://{}\".format(bucket_name))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Now let's go back to the console and kick off the jobs manually"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## For extra credit, here are the steps to continue doing this in code"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Add IAM permissions to SageMaker\n",
    "For SageMaker to kick off trainig jobs, it needs the ability to pass a role to the Comprehend service.  \n",
    "In the IAM console, add the following policy to the role that the SageMaker notebook created."
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "{\n",
    "    \"Version\": \"2012-10-17\",\n",
    "    \"Statement\": [\n",
    "        {\n",
    "            \"Effect\": \"Allow\",\n",
    "            \"Action\": \"iam:PassRole\",\n",
    "            \"Resource\": \"*\",\n",
    "            \"Condition\": {\n",
    "                \"StringEquals\": {\n",
    "                    \"iam:PassedToService\": \"comprehend.amazonaws.com\"\n",
    "                }\n",
    "            }\n",
    "        }\n",
    "    ]\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Get the ARN for the role we created in the first Lab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#This name should match the name of the role that was created in the first lab.\n",
    "role_name_base = 'AmazonComprehendServiceRoleS3FullAccess-ComprehendLabs'\n",
    "prefix_random_numbers = '' #If you added random numbers to the end of the 'ComprehendLabs' prefix, put them here\n",
    "if not prefix_random_numbers:\n",
    "    role_name = \"{}{}\".format(role_name_base,prefix_random_numbers)\n",
    "else:\n",
    "    role_name = role_name_base\n",
    "iam_client = boto3.client(\"iam\")\n",
    "response = iam_client.get_role(\n",
    "    RoleName=role_name\n",
    ")\n",
    "comprehend_arn = response['Role']['Arn']\n",
    "print(\"The ARN for the role is {}\".format(comprehend_arn))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Start training job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "comprehend_client = boto3.client(\"comprehend\")\n",
    "response = comprehend_client.create_entity_recognizer(\n",
    "    RecognizerName=\"Recognizer-Name-Goes-Here-{}\".format(datetime.now()).replace(' ','-').replace(':','-').replace('.','-'),\n",
    "    LanguageCode=\"en\",\n",
    "    DataAccessRoleArn=comprehend_arn,\n",
    "    InputDataConfig={\n",
    "        \"EntityTypes\": [\n",
    "            {\n",
    "                'Type': \"AWS_OFFERING\"\n",
    "            }\n",
    "        ],\n",
    "        'EntityList': {\n",
    "            'S3Uri': \"s3://{}/{}/aws-offerings.csv\".format(bucket_name,s3_entity_prefix)\n",
    "        },\n",
    "        'Documents': {\n",
    "            'S3Uri': \"s3://{}/{}/aws-offerings-docs.txt\".format(bucket_name,s3_entity_prefix)\n",
    "        },\n",
    "        \n",
    "    }\n",
    ")\n",
    "recognizer_arn = response[\"EntityRecognizerArn\"]\n",
    "print(\"The ARN for the entity recognizer is {}\".format(recognizer_arn))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.  Check the status of the training job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = comprehend_client.describe_entity_recognizer(\n",
    "    EntityRecognizerArn=recognizer_arn\n",
    ")\n",
    "#The possible statuses for the custom entity recognizer are: 'SUBMITTED'|'TRAINING'|'DELETING'|'STOP_REQUESTED'|'STOPPED'|'IN_ERROR'|'TRAINED'\n",
    "print(\"The status of the custom entity recognizer is {}\".format(response['EntityRecognizerProperties']['Status']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5.  Lets look at how the training did"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = comprehend_client.describe_entity_recognizer(\n",
    "    EntityRecognizerArn=recognizer_arn\n",
    ")\n",
    "if response['EntityRecognizerProperties']['Status'] == 'TRAINED':\n",
    "    print(json.dumps(response['EntityRecognizerProperties']['RecognizerMetadata'], indent=2))\n",
    "else:\n",
    "    print (\"Training job has not completed yet.  Please wait to check training performance until it has.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.  Start a batch entity recognition job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = comprehend_client.describe_entity_recognizer(\n",
    "    EntityRecognizerArn=recognizer_arn\n",
    ")\n",
    "if response['EntityRecognizerProperties']['Status'] == 'TRAINED':\n",
    "    response = comprehend_client.start_entities_detection_job(\n",
    "        JobName='AWS_OFFERING-001',\n",
    "        EntityRecognizerArn=recognizer_arn,\n",
    "        LanguageCode=\"en\",\n",
    "        DataAccessRoleArn=comprehend_arn,\n",
    "        InputDataConfig={\n",
    "            'S3Uri': \"s3://{}/{}/aws-offerings-test.txt\".format(bucket_name,s3_entity_prefix),\n",
    "            'InputFormat': 'ONE_DOC_PER_LINE'\n",
    "        },\n",
    "        OutputDataConfig={\n",
    "            'S3Uri': \"s3://{}/{}/results/\".format(bucket_name,s3_entity_prefix)\n",
    "        }\n",
    "    )\n",
    "    job_id = response['JobId']\n",
    "else:\n",
    "    print (\"Training job has not completed yet.  Please wait to start batch entity recognitino job until it has.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7.  Check the status of the bacth transform job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = comprehend_client.describe_entities_detection_job(\n",
    "    JobId=job_id\n",
    ")\n",
    "print(\"The status of the batch entity detection job is {}\".format(response['EntitiesDetectionJobProperties']['JobStatus']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 8.  Download the output of the batch job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = comprehend_client.describe_entities_detection_job(\n",
    "    JobId=job_id\n",
    ")\n",
    "if response['EntitiesDetectionJobProperties']['JobStatus'] == \"COMPLETED\":\n",
    "    output_s3_uri = response['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri']\n",
    "    s3_key = output_s3_uri.replace(\"s3://{}/\".format(bucket_name),'')\n",
    "    s3.meta.client.download_file(bucket_name, s3_key, 'output.tar.gz')\n",
    "    !tar zxvf output.tar.gz\n",
    "else:\n",
    "    print(\"Batch transformation job not complete.  Please wait until this job is completed before attempting to view output.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 9.  Let's review the test data and the output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = comprehend_client.describe_entities_detection_job(\n",
    "    JobId=job_id\n",
    ")\n",
    "if response['EntitiesDetectionJobProperties']['JobStatus'] == \"COMPLETED\":\n",
    "    !head -20 aws-offerings-test.txt\n",
    "else:\n",
    "    print(\"Batch transformation job not complete.  Please wait until this job is completed before attempting to view output.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = comprehend_client.describe_entities_detection_job(\n",
    "    JobId=job_id\n",
    ")\n",
    "if response['EntitiesDetectionJobProperties']['JobStatus'] == \"COMPLETED\":\n",
    "    !cat output\n",
    "else:\n",
    "    print(\"Batch transformation job not complete.  Please wait until this job is completed before attempting to view output.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}