In [13]:
# Prerequisites:
# 1. Create a Glue Dev Endpoint (G.2X) with full S3 access.
# 2. Connect to that dev endpoint with your SageMaker frontend.
# 3. Make sure your notebook's IAM role has S3 write access if you will be using the terminal
#    (S3FullAccess works).
# 3b. Make sure your notebook's IAM role also has the GlueServiceRole attached, since we will be
#     making some Glue calls.
# 4. Create a database for your files and edit the glue_database variable if it differs from
#    'reinvent-2019'.
# 5. Complete all previous notebook steps.
# 6. Open a terminal within Jupyter (New -> Terminal) to enter the CLI commands in this demo.

# Currently required: install a new/current version of the AWS CLI in your terminal window.
# The command is only printed here; run it yourself in the terminal.
awscli_upgrade_command = 'pip3 install awscli --upgrade --user'
print("AWS pip upgrade command \n")
print(awscli_upgrade_command)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

AWS pip upgrade command 

pip3 install awscli --upgrade --user

In [14]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Reuse the active SparkContext if there is one (otherwise create it), then wrap it in a GlueContext.
spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# TODO: Update with your own information, and keep these values synchronized across notebooks.

# S3 location: the bucket and key prefix are combined into the s3:// paths used for the label files.
my_s3_bucket = "find-matches-demo"
project_prefix = "scholarly_demo"

# Glue catalog / account settings.
glue_database = "reinvent-2019"
glue_table = 'dblp_scholar_records_jsonl'
# Fixed typo: this was 'use-east-1', which is not a valid AWS region name.
region = 'us-east-1'
glue_role = 'AWSGlueServiceRoleDefault'
glue_source_crawler = project_prefix + "_source_crawler"

# ML Transform identifiers; transform_id is interpolated into the `aws glue ...` commands printed below.
transform_name = "reinvent_2019_demo_transform"
transform_id = "tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Next we train this ML Transform with some labels. You can either use labels you filled in by hand
# in the labeling set generated in the previous step, or use the existing labels that ship with the
# scholarly dataset. Below we walk through ingesting and training on the labels provided with the
# scholarly dataset; the steps are the same as if we had labeled the matches ourselves by filling
# in the missing label values in the generated labeling sets.

# Public source object and the destination inside your own bucket/prefix.
public_labels_uri = "s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/labels/dblp_scholar_labels_350.csv"
own_labels_uri = f"s3://{my_s3_bucket}/{project_prefix}/labels/dblp_scholar_labels_350.csv"

# The copy command is only printed; run it in the terminal.
print("Command to download the labels from the scholarly dataset into your own s3 bucket: \n")
print(f"aws s3 cp {public_labels_uri} {own_labels_uri}")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Command to download the labels from the scholarly dataset into your own s3 bucket: 

aws s3 cp s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/labels/dblp_scholar_labels_350.csv s3://find-matches-demo/scholarly_demo/labels/dblp_scholar_labels_350.csv

In [17]:
# Read the copied labels CSV into a Spark DataFrame so we can inspect what it contains.

labels_csv_uri = f"s3://{my_s3_bucket}/{project_prefix}/labels/dblp_scholar_labels_350.csv"

# Same reader settings as before, expressed as a builder chain: comma-separated CSV,
# first row is the header, column types inferred from the data.
labels_df = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(labels_csv_uri)
)

# Show the rows, truncating each string value to 17 characters.
labels_df.show(truncate=17)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+-----+-----------------+-----------------+-----------------+-----------------+------+-------+
|labeling_set_id|label| id| title| authors| venue| year| source|
+---------------+-----+-----------------+-----------------+-----------------+-----------------+------+-------+
| 0| 0|conf_sigmod_Ab...|Visual COKO: a...|D Abadi, M Che...|SIGMOD Conference|2002.0| DBLP|
| 0| 0| f2Lea-RN8dsJ|Visual COKO: a...| DJ Abadi|SIGMOD Confere...|2002.0|Scholar|
| 0| 1|conf_sigmod_Ab...|Aurora: A Data...|D Abadi, D Car...|SIGMOD Conference|2003.0| DBLP|
| 0| 1| eBnT7lhV2LwJ|Aurora: A Data...|D Abadi, D Car...|Proceedings of...| null|Scholar|
| 0| 1|journals_vldb_...|Aurora: a new ...|D Abadi, D Car...| VLDB J.|2003.0| DBLP|
| 0| 2| AxpQwgyRyLgJ|Active XML Doc...|S Abiteboul, A...| ACM SIGMOD,| null|Scholar|
| 0| 2|conf_sigmod_Ab...|Dynamic XML do...|S Abiteboul, A...|SIGMOD Conference|2003.0| DBLP|
| 0| 2| Rjb06zlxbLIJ|Dynamic XML do...|S Abiteboul, A...|SIGMOD Confere...|2003.0|Scholar|
|

In [20]:
# Number of rows in the labels DataFrame loaded above (displayed as the cell output).
labels_df.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

352

In [18]:
# The labels look reasonable, so let's send them to the transform.
# The import command is only printed; run it in the terminal.

uploaded_labels_uri = f"s3://{my_s3_bucket}/{project_prefix}/labels/dblp_scholar_labels_350.csv"

print("Command to upload the labels from the your own s3 bucket and send them to the Transform: \n")
print(f"aws glue start-import-labels-task-run --transform-id {transform_id} --input-s3-path {uploaded_labels_uri}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Command to upload the labels from the your own s3 bucket and send them to the Transform: 

aws glue start-import-labels-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --input-s3-path s3://find-matches-demo/scholarly_demo/labels/dblp_scholar_labels_350.csv

In [19]:
# After the import succeeds (it should be relatively quick), the ML Transform has a new label count
# associated with it. Check that it now reports the expected label count.
# The command is only printed; run it in the terminal.

describe_transform_command = f"aws glue get-ml-transform --transform-id {transform_id} "

print("Command to get the ML Transform information, including label count: \n")
print(describe_transform_command)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Command to get the ML Transform information, including label count: 

aws glue get-ml-transform --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7

In [21]:
# Now let's get a very rough estimate of the quality. To do this we start an "ML Evaluation Task Run",
# which uses a portion of held-out data (unseen by the ML model) to estimate the quality of the model.
# Note that this does not estimate any quality loss due to the other stages of the algorithm
# (candidate generation, clustering, and match enforcement), but it does give us some feedback that
# we are set up correctly, plus a general idea of the matching quality with the current labels.
# If the number isn't high enough, it can typically be improved by adding more labeled data.
# The command is only printed; run it in the terminal.

start_evaluation_command = f"aws glue start-ml-evaluation-task-run --transform-id {transform_id} "

print("Command to start an ML Evaluation Task Run: \n")
print(start_evaluation_command)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Command to start an ML Evaluation Task Run: 

aws glue start-ml-evaluation-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7

In [23]:
# Time to see how it went. The summary metrics for an ML transform are easy to view from the console,
# or we can look at the results of the last evaluation task run.

# TODO: Update this task run ID with the value returned by the previous evaluation task run call.
task_run_id = "tsk-d7df4407ca5ea5ea538e75ab9bc37dbb0d58d23b"

# Command that looks up the evaluation task run itself (printed only; run it in the terminal).
task_run_lookup_command = f"aws glue get-ml-task-run --transform-id {transform_id} --task-run-id {task_run_id}"
print(task_run_lookup_command)

# Once the task run has succeeded, the transform description also carries the metrics.
print("(After Success): Command to get the ML Transform information, including metrics: \n")
print(f"aws glue get-ml-transform --transform-id {transform_id} ")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

aws glue get-ml-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --task-run-id tsk-d7df4407ca5ea5ea538e75ab9bc37dbb0d58d23b
(After Success): Command to get the ML Transform information, including metrics: 

aws glue get-ml-transform --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7

In [None]:
# If everything looks good, we're all set and can move on to the final stage: matching!