{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Objective: Create a FindMatches transform and set its data source to the data catalog entry created in Step 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AWS pip upgrade command \n",
      "\n",
      "pip3 install awscli --upgrade --user"
     ]
    }
   ],
   "source": [
    "# Prerequisites:\n",
    "# 1. Create Glue Dev Endpoint (G.2X), full S3 access\n",
    "# 2. Connect to that dev endpoint with your sagemaker frontend.\n",
    "# 3. Make sure that your Notebook's IAM role has S3 Write access if you will be using the terminal (S3FullAccess works)\n",
    "# 3b. Make sure that your Notebook's IAM role has the GlueServiceRole attached as well since we will be making some Glue calls\n",
    "# 4. Create a database for your files and edit the glue_database variable if different than 'reinvent-2019'\n",
    "# 5. All previous notebook steps\n",
    "# 6. Open up a terminal within Jupyter (New -> Terminal) to enter the CLI commands in this demo.\n",
    "\n",
    "# Currently required: You will need to install a new/current version of the aws cli in your terminal window:\n",
    "print(\"AWS pip upgrade command \\n\")\n",
    "print('pip3 install awscli --upgrade --user')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import sys\n",
    "from awsglue.transforms import *\n",
    "from awsglue.utils import getResolvedOptions\n",
    "from pyspark.context import SparkContext\n",
    "from awsglue.context import GlueContext\n",
    "from awsglue.job import Job\n",
    "\n",
    "# Reuse the running SparkContext if one exists, otherwise create one, and wrap it in a GlueContext.\n",
    "glueContext = GlueContext(SparkContext.getOrCreate())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# TODO: Update with your own information, synchronize across notebooks.\n",
    "\n",
    "my_s3_bucket = \"find-matches-demo\"\n",
    "project_prefix = \"scholarly_demo\"\n",
    "glue_database = \"reinvent-2019\"\n",
    "glue_table = 'dblp_scholar_records_jsonl'\n",
    "# AWS region code; the previous value 'use-east-1' was a typo and is not a valid region.\n",
    "region = 'us-east-1'\n",
    "glue_role = 'AWSGlueServiceRoleDefault'\n",
    "glue_source_crawler = project_prefix + \"_source_crawler\"\n",
    "transform_name = \"reinvent_2019_demo_transform\"\n",
    "transform_id = \"tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "## Now let's create an ML Transform and set its source to the data catalog entry we created in Step 1\n",
    "#\n",
    "# Option A: Use the Glue console, Jobs -> ML Transform -> Add Transform, follow wizard\n",
    "# Option B: Use client to do this automatically, as per below.\n",
    "# Option C: Create ML Transform with aws CLI client\n",
    "# Option D: Cloudformation\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Command to create ML Transform:\n",
      "\n",
      "aws glue create-ml-transform --name reinvent_2019_demo_transform --input-record-tables DatabaseName=reinvent-2019,TableName=dblp_scholar_records_jsonl --glue-version 1.0 --worker-type G.2X --number-of-workers 3 --role AWSGlueServiceRoleDefault --parameters TransformType=FIND_MATCHES,FindMatchesParameters=\\{PrimaryKeyColumnName=id,PrecisionRecallTradeoff=0.9,AccuracyCostTradeoff=1,EnforceProvidedLabels=false\\}"
     ]
    }
   ],
   "source": [
    "print(\"Command to create ML Transform:\\n\")\n",
    "\n",
    "# The braces in the printed command keep their leading backslash (presumably to stop shell brace expansion).\n",
    "# The backslash itself is escaped so Python does not warn about an invalid escape sequence.\n",
    "print(f\"aws glue create-ml-transform --name {transform_name} \" +\n",
    "      f\"--input-record-tables DatabaseName={glue_database},TableName={glue_table} \" +\n",
    "      \"--glue-version 1.0 \" +\n",
    "      \"--worker-type G.2X \" +\n",
    "      \"--number-of-workers 3 \" +\n",
    "      f\"--role {glue_role} \" +\n",
    "      \"--parameters TransformType=FIND_MATCHES,FindMatchesParameters=\\\\{PrimaryKeyColumnName=id,PrecisionRecallTradeoff=0.9,AccuracyCostTradeoff=1,EnforceProvidedLabels=false\\\\}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## TODO: Go back and add your transform ID to the custom variable block now and sync it across notebooks.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Command to run labeling set generation on the ML Transform:\n",
      "\n",
      "aws glue start-ml-labeling-set-generation-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --output-s3-path s3://find-matches-demo/scholarly_demo/labelsets"
     ]
    }
   ],
   "source": [
    "# Now, let's go ahead and use the ML Transform to generate a labeling set for you to provide labels to.\n",
    "# As before, use any of the standard options to create a labeling set, including the console or the CLI command\n",
    "# below. In the Console, labelset generation can be found underneath the \"Teach\" menu for an ML Transform.\n",
    "# If you use the Console, please use the same output location as the sample AWS command below to follow this\n",
    "# demo.\n",
    "\n",
    "print(\"Command to run labeling set generation on the ML Transform:\\n\")\n",
    "\n",
    "print(f\"aws glue start-ml-labeling-set-generation-task-run --transform-id {transform_id} \" +\n",
    "      f\"--output-s3-path s3://{my_s3_bucket}/{project_prefix}/labelsets\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Command to check on the status of your labelset generation:\n",
      "\n",
      "aws glue get-ml-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --task-run-id tsk-a1e1e58b19f646be8dff852d8ab0035031d9af3c"
     ]
    }
   ],
   "source": [
    "# Labelset generation can take a while, so we can check on the progress of the transform in the console\n",
    "# or with the aws glue get-ml-task-run command as per below.\n",
    "\n",
    "# TODO: Set this variable with the results of your start-ml-labeling-set-generation-task-run command\n",
    "task_run_id = \"tsk-a1e1e58b19f646be8dff852d8ab0035031d9af3c\"\n",
    "\n",
    "print(\"Command to check on the status of your labelset generation:\\n\")\n",
    "\n",
    "print(f\"aws glue get-ml-task-run --transform-id {transform_id} \" +\n",
    "      f\"--task-run-id {task_run_id}\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Sparkmagic (PySpark)",
   "language": "",
   "name": "pysparkkernel"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "python",
    "version": 2
   },
   "mimetype": "text/x-python",
   "name": "pyspark",
   "pygments_lexer": "python2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}