{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Objective: Create a FindMatches transform and set its data source to the data catalog entry created in Step 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AWS pip upgrade command \n",
      "\n",
      "pip3 install awscli --upgrade --user"
     ]
    }
   ],
   "source": [
    "#Prerequisites: \n",
    "#  1. Create Glue Dev Endpoint (G.2X), full S3 access\n",
    "#  2. Connect to that dev endpoint with your SageMaker frontend.\n",
    "#  3. Make sure that your Notebook's IAM role has S3 Write access if you will be using the terminal (S3FullAccess works)\n",
    "#  3b. Make sure that your Notebook's IAM role has the GlueServiceRole attached as well since we will be making some Glue calls\n",
    "#  4. Create a database for your files and edit the glue_database variable if different than 'reinvent-2019'\n",
    "#  5. All previous notebook steps\n",
    "#  6. Open up a terminal within Jupyter (New -> Terminal) to enter the CLI commands in this demo.\n",
    "\n",
    "# Required for now: your terminal window needs a new/current version of the AWS CLI.\n",
    "# Show a heading, a blank line, and then the pip command that upgrades it.\n",
    "print(\"AWS pip upgrade command \\n\", \"pip3 install awscli --upgrade --user\", sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import sys\n",
    "from awsglue.transforms import *\n",
    "from awsglue.utils import getResolvedOptions\n",
    "from pyspark.context import SparkContext\n",
    "from awsglue.context import GlueContext\n",
    "from awsglue.job import Job\n",
    "\n",
    "# Fetch the SparkContext via getOrCreate() and wrap it in a GlueContext.\n",
    "spark_context = SparkContext.getOrCreate()\n",
    "glueContext = GlueContext(spark_context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# TODO: Update with your own information, synchronize across notebooks.\n",
    "\n",
    "# S3 bucket and key prefix; the labelset output path below is built from these.\n",
    "my_s3_bucket = \"find-matches-demo\"\n",
    "project_prefix = \"scholarly_demo\"\n",
    "\n",
    "# Glue database and table passed to --input-record-tables when creating the transform.\n",
    "glue_database = \"reinvent-2019\"\n",
    "glue_table = 'dblp_scholar_records_jsonl'\n",
    "\n",
    "# AWS region identifier (previously misspelled 'use-east-1', which is not a valid region).\n",
    "region = 'us-east-1'\n",
    "\n",
    "# Role name passed to --role when creating the transform.\n",
    "glue_role = 'AWSGlueServiceRoleDefault'\n",
    "glue_source_crawler = project_prefix + \"_source_crawler\"\n",
    "\n",
    "# Transform name used at creation time, and the transform ID to fill in once it exists.\n",
    "transform_name = \"reinvent_2019_demo_transform\"\n",
    "transform_id = \"tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "## Now let's create an ML Transform and set its source to the data catalog entry we created in Step 1\n",
    "#\n",
    "# Option A: Use the Glue console, Jobs -> ML Transform -> Add Transform, follow wizard\n",
    "# Option B: Use client to do this automatically, as per below.\n",
    "# Option C: Create ML Transform with aws CLI client\n",
    "# Option D: Cloudformation\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Command to create ML Transform:\n",
      "\n",
      "aws glue create-ml-transform --name reinvent_2019_demo_transform --input-record-tables DatabaseName=reinvent-2019,TableName=dblp_scholar_records_jsonl --glue-version 1.0 --worker-type G.2X --number-of-workers 3 --role AWSGlueServiceRoleDefault --parameters TransformType=FIND_MATCHES,FindMatchesParameters=\\{PrimaryKeyColumnName=id,PrecisionRecallTradeoff=0.9,AccuracyCostTradeoff=1,EnforceProvidedLabels=false\\}"
     ]
    }
   ],
   "source": [
    "# Print the AWS CLI command that creates the FindMatches ML Transform from the\n",
    "# configured transform name, database, table, and role.\n",
    "print(\"Command to create ML Transform:\\n\")\n",
    "\n",
    "# The last fragment is a raw string: the backslashes in front of the braces must be\n",
    "# printed literally (presumably to escape them for the shell), and in a normal\n",
    "# string '\\{' and '\\}' are invalid escape sequences that newer Pythons warn about.\n",
    "print(f\"aws glue create-ml-transform --name {transform_name} \" +\n",
    "      f\"--input-record-tables DatabaseName={glue_database},TableName={glue_table} \" +\n",
    "      \"--glue-version 1.0 \" +\n",
    "      \"--worker-type G.2X \" +\n",
    "      \"--number-of-workers 3 \" +\n",
    "      f\"--role {glue_role} \" +\n",
    "      r\"--parameters TransformType=FIND_MATCHES,FindMatchesParameters=\\{PrimaryKeyColumnName=id,PrecisionRecallTradeoff=0.9,AccuracyCostTradeoff=1,EnforceProvidedLabels=false\\}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## TODO: Go back and add your transform ID to the custom variable block now and sync it across notebooks.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Command to run labeling set generation on the ML Transform:\n",
      "\n",
      "aws glue start-ml-labeling-set-generation-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --output-s3-path s3://find-matches-demo/scholarly_demo/labelsets"
     ]
    }
   ],
   "source": [
    "# Now, let's go ahead and use the ML Transform to generate a labeling set for you to provide labels to.\n",
    "# As before, use any of the standard options to create a labeling set including the console or the CLI command\n",
    "# below. In the Console, labelset generation can be found underneath the \"Teach\" menu for an ML Transform.\n",
    "# If you use the Console, please use the same output location as the sample AWS command below to follow this\n",
    "# demo.\n",
    "\n",
    "# Heading first, then the CLI call that starts labeling set generation for the\n",
    "# configured transform and writes to the labelsets path in the demo bucket.\n",
    "print(\"Command to run labeling set generation on the ML Transform:\\n\")\n",
    "\n",
    "print(\n",
    "    f\"aws glue start-ml-labeling-set-generation-task-run --transform-id {transform_id} \"\n",
    "    f\"--output-s3-path s3://{my_s3_bucket}/{project_prefix}/labelsets\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Command to check on the status of your labelset generation:\n",
      "\n",
      "aws glue get-ml-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --task-run-id tsk-a1e1e58b19f646be8dff852d8ab0035031d9af3c"
     ]
    }
   ],
   "source": [
    "# Labelset generation can take a while, so we can check on the progress of the transform in the console\n",
    "# or with the aws get-ml-task-run command as per below. \n",
    "\n",
    "# TODO: Set this variable with the results of your start-ml-labeling-set-generation-task-run command\n",
    "task_run_id = \"tsk-a1e1e58b19f646be8dff852d8ab0035031d9af3c\"\n",
    "\n",
    "# Heading first, then the status-check command for this transform / task-run pair.\n",
    "print(\"Command to check on the status of your labelset generation:\\n\")\n",
    "\n",
    "print(\n",
    "    f\"aws glue get-ml-task-run --transform-id {transform_id} \"\n",
    "    f\"--task-run-id {task_run_id}\"\n",
    ")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Sparkmagic (PySpark)",
   "language": "",
   "name": "pysparkkernel"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "python",
    "version": 2
   },
   "mimetype": "text/x-python",
   "name": "pyspark",
   "pygments_lexer": "python2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}