In [1]:
# Prerequisites:
#   1. Create a Glue Dev Endpoint (G.2X) with full S3 access.
#   2. Connect to that dev endpoint with your SageMaker frontend.
#   3. Make sure that your notebook's IAM role has S3 write access if you will be using the terminal
#      (S3FullAccess works).
#   3b. Make sure that your notebook's IAM role also has the GlueServiceRole attached, since we will be
#       making some Glue calls.
#   4. Create a database for your files, and edit the glue_database variable if it differs from 'reinvent-2019'.
#   5. Complete all previous notebook steps.
#   6. Open a terminal within Jupyter (New -> Terminal) to enter the CLI commands in this demo.

# Currently required: install a new/current version of the AWS CLI in your terminal window.
# A single print call emits the heading, a blank line, and then the command to copy.
print("AWS pip upgrade command \n", 'pip3 install awscli --upgrade --user', sep="\n")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
8,application_1574712114143_0009,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

AWS pip upgrade command 

pip3 install awscli --upgrade --user

In [2]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import col
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglueml.transforms import FindMatches

# Obtain the session's SparkContext (creating one only if none exists yet) and wrap it in a GlueContext,
# which the later cells use to read from the Glue Data Catalog.
spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# TODO: Update with your own information, and keep these values synchronized across the demo notebooks.

my_s3_bucket = "find-matches-demo"
project_prefix = "scholarly_demo"
glue_database = "reinvent-2019"
glue_table = 'dblp_scholar_records_jsonl'
# Fixed: this was 'use-east-1', which is not a valid AWS region identifier.
region = 'us-east-1'
glue_role = 'AWSGlueServiceRoleDefault'
# The crawler name is derived from the project prefix.
glue_source_crawler = project_prefix + "_source_crawler"
transform_name = "reinvent_2019_demo_transform"
# ID of the FindMatches ML Transform that this notebook applies to the source table.
transform_id = "tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# At this point we should have a trained Matching ML Transform, along with a rough estimate of the quality
# of that matcher's ML model. Let's put it to work and do some actual matching.
#
# In production, FindMatches Transforms are executed by creating a Glue Job and then calling the Transform within
# that Glue Job. You can create such a Glue Job fairly easily with the "Create Job" wizard in the Glue Console.
# However, if you are running this notebook while connected to a Glue Dev Endpoint, you can also simply execute
# the matcher inline, which is what we do below.

# Read the source table through the Glue Data Catalog.
source_data = glueContext.create_dynamic_frame.from_catalog(
    database=glue_database,
    table_name=glue_table,
)

# Apply the transform identified by transform_id to the source records.
matched_data = FindMatches.apply(
    frame=source_data,
    transformId=transform_id,
)

# Optional: persist the result instead of only displaying it, e.g.
#   glueContext.write_dynamic_frame.from_options(
#       frame=matched_data, connection_type="s3",
#       connection_options={"path": "s3://<your-bucket>/<output-prefix>"},
#       format="csv", transformation_ctx="datasink2")

# Preview the matched records.
matched_data.toDF().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------------+--------------------+--------------------+------+-------+--------+
|                  id|               title|             authors|               venue|  year| source|match_id|
+--------------------+--------------------+--------------------+--------------------+------+-------+--------+
|    conf_vldb_LuSL95|NeuroRule: A Conn...|H Lu, R Setiono, ...|                VLDB|1995.0|   DBLP|      26|
|journals_sigmod_C...|An Active Functio...| M Cilia, A Buchmann|       SIGMOD Record|2002.0|   DBLP|      29|
|conf_sigmod_Altin...|DBCache: database...|M Altinel, Q Luo,...|   SIGMOD Conference|2002.0|   DBLP|     474|
|conf_sigmod_Runde...|Evolvable View En...|E Rundensteiner, ...|   SIGMOD Conference|1999.0|   DBLP|     964|
|conf_vldb_Vassalo...|Describing and Us...|V Vassalos, Y Pap...|                VLDB|1997.0|   DBLP|    1677|
|conf_vldb_Gibbons...|Fast Incremental ...|P Gibbons, Y Mati...|                VLDB|1997.0|   DBLP|    1697|
|conf_sigm

In [9]:
# Show the matched records ordered by match ID, so that records sharing a match_id appear together.
# truncate=200 widens the per-column truncation limit to 200 characters.
matched_data.toDF().orderBy("match_id").show(truncate=200)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------------------------+------+-------+--------+
|                                                                                                                               id|                                                                                    title|                                           authors|                                                               venue|  year| source|match_id|
+---------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+-----------------------------------------------

In [6]:
# Count the records in each match_id cluster and list the largest clusters first -- a quick sanity check
# in case we're worried about over-matching.
(
    matched_data.toDF()
    .groupBy("match_id")
    .count()
    .orderBy(col("count").desc())
    .show()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+-----+
|match_id|count|
+--------+-----+
|    7200|   26|
|      19|   24|
|    3140|   21|
|    6910|   19|
|     565|   16|
|     553|   15|
|    1437|   15|
|    1419|   14|
|     226|   14|
|    1954|   14|
|    1409|   14|
|   10838|   14|
|    9262|   13|
|    1675|   13|
|    1133|   13|
|    1543|   12|
|     810|   12|
|   11489|   11|
|    1212|   11|
|    1704|   11|
+--------+-----+
only showing top 20 rows

In [7]:
# That biggest cluster looks a little suspicious, so let's pull up its records:

matched_data.toDF().where(col('match_id') == '7200').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+--------------------+--------------------+--------------------+----+-------+--------+
|          id|               title|             authors|               venue|year| source|match_id|
+------------+--------------------+--------------------+--------------------+----+-------+--------+
|5bt5DO88llIJ|The R*-Tree: An E...|N Beckermann, H K...|Proc. ACM SIGMOD ...|null|Scholar|    7200|
|rqZOBLs-GwkJ|The gih-tree: an ...|N Beckmann, HP Kr...|  Proceedings of ACM|null|Scholar|    7200|
|He1UKqU3NSkJ|andB. Seeger. The...|N Beckmann, HP Kr...|Proc. SIGMOD Conf...|null|Scholar|    7200|
|R94wj3KT1Q8J|The R*-tree: An E...|N Beckmann, HP Kr...|Proc. 1990 ACM SI...|null|Scholar|    7200|
|8EInhbOVFgMJ|The R*-Tree: An E...|          N BACKMANN|         ACM Sigmod,|null|Scholar|    7200|
|2BYfXDOWCuAJ|The R*-tree: An E...|NBHP Kriegel, R S...|Proceedings of th...|null|Scholar|    7200|
|gZPjGlHNMXsJ|The R√¢??-tree: a...|N Beckmann, HP Kr...|    Proc. ACM SIGMOD|null|Scholar|    7200|


In [8]:
# Where to go from here?
#
# * Improve metrics by adding additional labels either manually or by generating additional labelling sets and
#   filling out their label columns and uploading.
# * Tweak the desired precisionRecallTradeoff or the accuracyCostTradeoff to match your business case
# * Consider enforcing matches if you have some troublesome matches that the ML isn't quite learning and you need
#   to guarantee output on particular matches (there is some performance penalty)
# * Integrate into your standard ETL pipeline by calling the matcher from a standard Glue Job.

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…