In [None]:
# Objective: Create a FindMatches transform and set its data source to the data catalog entry created in Step 1.

In [37]:
# Prerequisites:
# 1. Create a Glue Dev Endpoint (G.2X) with full S3 access.
# 2. Connect to that dev endpoint with your SageMaker frontend.
# 3. Make sure that your notebook's IAM role has S3 write access if you will be using the terminal
#    (S3FullAccess works).
# 3b. Make sure that your notebook's IAM role also has the GlueServiceRole attached, since we will be
#     making some Glue calls.
# 4. Create a database for your files and edit the glue_database variable if it differs from 'reinvent-2019'.
# 5. Complete all previous notebook steps.
# 6. Open a terminal within Jupyter (New -> Terminal) to enter the CLI commands in this demo.

# Currently required: install a new/current version of the AWS CLI in your terminal window.
# Heading and command are emitted by a single print call; sep="\n" reproduces the line break
# that two separate print calls would insert between them.
print("AWS pip upgrade command \n", "pip3 install awscli --upgrade --user", sep="\n")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

AWS pip upgrade command 

pip3 install awscli --upgrade --user

In [2]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Build the GlueContext on top of whatever SparkContext.getOrCreate() returns.
glueContext = GlueContext(SparkContext.getOrCreate())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [39]:
# TODO: Update with your own information, synchronize across notebooks.

# S3 bucket and key prefix used to build this demo's S3 paths
# (e.g. s3://<my_s3_bucket>/<project_prefix>/labelsets).
my_s3_bucket = "find-matches-demo"
project_prefix = "scholarly_demo"

# Glue Data Catalog database and table passed to create-ml-transform as its input record table.
glue_database = "reinvent-2019"
glue_table = 'dblp_scholar_records_jsonl'

# AWS region identifier. Was 'use-east-1', which is not a valid region name.
region = 'us-east-1'

# IAM role handed to `aws glue create-ml-transform --role`.
glue_role = 'AWSGlueServiceRoleDefault'

# Crawler name derived from the project prefix (presumably the crawler from an earlier
# notebook step -- confirm against that notebook).
glue_source_crawler = project_prefix + "_source_crawler"

# Name given to the ML transform when it is created.
transform_name = "reinvent_2019_demo_transform"

# ID of the created ML transform; replace with the ID of your own transform once it exists.
transform_id = "tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
## Now let's create an ML Transform and set its source to the data catalog entry we created in Step 1
#
# Option A: Use the Glue console, Jobs -> ML Transform -> Add Transform, follow wizard
# Option B: Use client to do this automatically, as per below.
# Option C: Create ML Transform with aws CLI client
# Option D: CloudFormation


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [38]:
print("Command to create ML Transform:\n")

# Print the `aws glue create-ml-transform` command for the reader to paste into a terminal.
# The braces around the FindMatchesParameters value must come out preceded by a literal
# backslash (presumably so the shell does not brace-expand the comma-separated list --
# confirm for your shell). They are written as "\\{" / "\\}" because "\{" is an invalid
# escape sequence in a normal string literal and triggers a Deprecation/SyntaxWarning on
# current Python versions. The printed text is unchanged.
# Adjacent literals are joined by implicit concatenation, so no line can silently lose a "+".
print(
    f"aws glue create-ml-transform --name {transform_name} "
    f"--input-record-tables DatabaseName={glue_database},TableName={glue_table} "
    "--glue-version 1.0 "
    "--worker-type G.2X "
    "--number-of-workers 3 "
    f"--role {glue_role} "
    "--parameters TransformType=FIND_MATCHES,"
    "FindMatchesParameters=\\{PrimaryKeyColumnName=id,PrecisionRecallTradeoff=0.9,"
    "AccuracyCostTradeoff=1,EnforceProvidedLabels=false\\}"
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Command to create ML Transform:

aws glue create-ml-transform --name reinvent_2019_demo_transform --input-record-tables DatabaseName=reinvent-2019,TableName=dblp_scholar_records_jsonl --glue-version 1.0 --worker-type G.2X --number-of-workers 3 --role AWSGlueServiceRoleDefault --parameters TransformType=FIND_MATCHES,FindMatchesParameters=\{PrimaryKeyColumnName=id,PrecisionRecallTradeoff=0.9,AccuracyCostTradeoff=1,EnforceProvidedLabels=false\}

In [None]:
## TODO: Go back and add your transform ID to the custom variable block now and sync it across notebooks.


In [49]:
# Next, use the ML Transform to generate a labeling set for you to provide labels to.
# As before, any of the standard options works for creating a labeling set, including the
# console or the CLI command printed below. In the console, labelset generation is found
# under the "Teach" menu of an ML Transform. If you use the console, please use the same
# output location as the sample AWS command below so the rest of this demo lines up.

print("Command to run labeling set generation on the ML Transform:\n")

# Assemble the command from its space-separated parts.
print(" ".join([
    "aws glue start-ml-labeling-set-generation-task-run",
    f"--transform-id {transform_id}",
    f"--output-s3-path s3://{my_s3_bucket}/{project_prefix}/labelsets",
]))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Command to run labeling set generation on the ML Transform:

aws glue start-ml-labeling-set-generation-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --output-s3-path s3://find-matches-demo/scholarly_demo/labelsets

In [50]:
# Labelset generation can take a while; progress can be checked in the console
# or with the `aws glue get-ml-task-run` command printed below.

# TODO: Set this variable from the output of your start-ml-labeling-set-generation-task-run command
task_run_id = "tsk-a1e1e58b19f646be8dff852d8ab0035031d9af3c"

print("Command to check on the status of your labelset generation:\n")

# Assemble the command from its space-separated parts.
print(" ".join([
    "aws glue get-ml-task-run",
    f"--transform-id {transform_id}",
    f"--task-run-id {task_run_id}",
]))



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Command to check on the status of your labelset generation:

aws glue get-ml-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --task-run-id tsk-a1e1e58b19f646be8dff852d8ab0035031d9af3c