In [1]:
# Objective: Download, prepare, and explore the data sources that we will be integrating with FindMatches

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9,application_1574712114143_0010,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
#Prerequisites: 
#  1. Create Glue Dev Endpoint (G.2X), full S3 access
#  2. Connect to that dev endpoint ith your sagemaker frontend.
#  3. Make sure that your Notebook's IAM role has S3 Write access if you will be using the terminal (S3FullAccess works)
#  3b. Make sure that your Notebook's IAM role has the GlueServiceRole attached as well since we will be making some Glue calls
#  4. Create a database for your files and edit the glue_database variable if different than 'reinvent-2019'
#  5. All previous notebook steps
#  6. Open up a terminal within Jupyter (New -> Terminal) to enter the CLI commands in this demo.

#Currently required: You will need to install a new/current version of the aws cli in your terminal window:
print("AWS pip upgrade command \n")
print('pip3 install awscli --upgrade --user')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

AWS pip upgrade command 

pip3 install awscli --upgrade --user

In [3]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

glueContext = GlueContext(SparkContext.getOrCreate())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
#TODO: Update with your own information, synchronize across notebooks.

my_s3_bucket = "find-matches-demo"
project_prefix = "scholarly_demo"
glue_database = "reinvent-2019"
glue_table = 'dblp_scholar_records_jsonl'
region = 'use-east-1'
glue_role = 'AWSGlueServiceRoleDefault'
glue_source_crawler = project_prefix + "_source_crawler"
transform_name = "reinvent_2019_demo_transform"
transform_id=  "tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
glue_source_crawler

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'scholarly_demo_source_crawler'

In [6]:
print ("Command to download the source records into your own s3 bucket: \n")
print ("aws s3 cp " + 
      "s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/records/dblp_scholar_records.jsonl " + 
      "s3://" + my_s3_bucket + "/" + project_prefix + "/source/dblp_scholar_records.jsonl")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Command to download the source records into your own s3 bucket: 

aws s3 cp s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/records/dblp_scholar_records.jsonl s3://find-matches-demo/scholarly_demo/source/dblp_scholar_records.jsonl

In [33]:
# Create a crawler and run it against the file to load the data reference into the Glue/LF Data Catalog
# This is easy to do in the AWS Console, or you can also do this via AWS CLI as per below.

s3_targets = {
        'S3Targets': [
            {
                'Path': "s3://" + my_s3_bucket + "/" + project_prefix + "/source/dblp_scholar_records.jsonl",
            },
        ],
    }

print("CLI command to create the crawler\n")
print(f"aws glue create-crawler --name {glue_source_crawler} --role {glue_role} " +
      f'--database-name {glue_database} '
      '--targets \'{"S3Targets": [{"Path": "s3://'+my_s3_bucket+'/'+project_prefix+'/source/dblp_scholar_records.jsonl"}]}\'')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

CLI command to run the crawler

aws glue create-crawler --name scholarly_demo_source_crawler --role AWSGlueServiceRoleDefault --database-name reinvent-2019 --targets '{"S3Targets": [{"Path": "s3://find-matches-demo/scholarly_demo/source/dblp_scholar_records.jsonl"}]}'

In [34]:
# Run the crawler

print("CLI command to run the crawler\n")
print(f"aws glue start-crawler --name {glue_source_crawler}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

CLI command to run the crawler

aws glue start-crawler --name scholarly_demo_source_crawler

In [35]:
# Wait for crawl to finish

print("CLI command to check on the crawler status so we can wait until it finishes\n")
print(f"aws glue get-crawler --name {glue_source_crawler}")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

CLI command to check on the crawler status so we can wait until it finishes

aws glue get-crawler --name scholarly_demo_source_crawler

In [36]:
# Take a look at the table schema for a sanity check: 
import pprint 

response = client.get_table(
    DatabaseName=glue_database,
    Name='dblp_scholar_records_jsonl'
)

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(response['Table']['StorageDescriptor']['Columns'])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[   {'Name': 'id', 'Type': 'string'},
    {'Name': 'title', 'Type': 'string'},
    {'Name': 'authors', 'Type': 'string'},
    {'Name': 'venue', 'Type': 'string'},
    {'Name': 'year', 'Type': 'double'},
    {'Name': 'source', 'Type': 'string'}]

In [37]:
#Looking good, so let's take a look at the actual data:

source = glueContext.create_dynamic_frame.from_catalog(database=glue_database, table_name="dblp_scholar_records_jsonl").toDF()
print (f"Source dataset length: {source.count()}")
source.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Source dataset length: 66879
+--------------------+--------------------+--------------------+-------------+------+------+
|                  id|               title|             authors|        venue|  year|source|
+--------------------+--------------------+--------------------+-------------+------+------+
|conf_vldb_Rusinki...|Towards a Coopera...|M Rusinkiewicz, W...|         VLDB|1995.0|  DBLP|
|journals_sigmod_E...|SQL/XML is Making...|A Eisenberg, J Me...|SIGMOD Record|2002.0|  DBLP|
|conf_vldb_AmmannJR95|Using Formal Meth...|P Ammann, S Jajod...|         VLDB|1995.0|  DBLP|
|journals_sigmod_L...|      Editor's Notes|               L Liu|SIGMOD Record|2002.0|  DBLP|
|journals_sigmod_H...|Report on the ACM...|                null|         null|2002.0|  DBLP|
|conf_vldb_Ferrand...|Schema and Databa...|F Ferrandina, T M...|         VLDB|1995.0|  DBLP|
|conf_vldb_Subieta...|Procedures in Obj...|K Subieta, Y Kamb...|         VLDB|1995.0|  DBLP|
|journals_sigmod_B...|Phoenix Project: ..

In [38]:
#Look at some details specifically from from Scholar

print ("Scholar dataset length: " + str(source.filter(source.source == 'Scholar').count()) );
source.filter(source.source == 'Scholar').sample(False,.01).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Scholar dataset length: 64263
+--------------------+--------------------+--------------------+--------------------+------+-------+
|                  id|               title|             authors|               venue|  year| source|
+--------------------+--------------------+--------------------+--------------------+------+-------+
|        OOZb-r3GUTMJ|Genera crustaceor...|        PA Latreille|            Tomus I,|  null|Scholar|
|        9F4uvDaInLAJ|Towards Generic S...|M Garschhammer, R...|Seattle, Washingt...|  null|Scholar|
|        z0OXMVQB5pIJ|Implicit Stereoty...|MR Banaji, C Hard...|JOURNAL OF PERSON...|1993.0|Scholar|
|        Bn5h0IbqbNwJ|i2i Trust in E-co...|  JS Olson, GM Olson| COMMUNICATIONS-ACM,|2000.0|Scholar|
|        mrxAHi3pQgAJ|Systematics of th...|RE Jenkins, BJ Fr...|         Unpublished|  null|Scholar|
|url:http:__portal...|A content based i...|M Uysal, F Yarman...|Proceedings of th...|2004.0|Scholar|
|        x8s3QpPxgGYJ|PROMISE: peer-to-...|M Hefeeda, A Habi.

In [39]:
#Look at some details specifically from from DBLP

print ("DBLP dataset length: " + str(source.filter(source.source == 'DPLP').count()) );
source.filter(source.source == 'DBLP').sample(False,.01).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DBLP dataset length: 0
+--------------------+--------------------+--------------------+-----------------+------+------+
|                  id|               title|             authors|            venue|  year|source|
+--------------------+--------------------+--------------------+-----------------+------+------+
|conf_vldb_MohaniaS94|Some Issues in De...|  M Mohania, N Sarda|             VLDB|1994.0|  DBLP|
| conf_vldb_SetzerZ94|New Concurrency C...|  V Setzer, A Zisman|             VLDB|1994.0|  DBLP|
|journals_sigmod_S...|Using Unknowns to...|Y Saygin, V Veryk...|    SIGMOD Record|2001.0|  DBLP|
|conf_sigmod_WangW...|Clustering by pat...|H Wang, W Wang, J...|SIGMOD Conference|2002.0|  DBLP|
|conf_sigmod_Cresc...|RoadRunner: autom...|V Crescenzi, G Me...|SIGMOD Conference|2002.0|  DBLP|
|journals_sigmod_P...|Database Research...|D Phatak, N Sarda...|    SIGMOD Record|1996.0|  DBLP|
| conf_sigmod_YuMWL01|Efficient and Eff...|C Yu, W Meng, W W...|SIGMOD Conference|2001.0|  DBLP|
|conf_s