{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", "
<table>\n", "<tr><th>ID</th><th>YARN Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th><th>Current session?</th></tr>\n",
"<tr><td>9</td><td>application_1574712114143_0010</td><td>pyspark</td><td>idle</td><td>Link</td><td>Link</td><td></td></tr>\n", "</table>
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Objective: Download, prepare, and explore the data sources that we will be integrating with FindMatches" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "AWS pip upgrade command \n", "\n", "pip3 install awscli --upgrade --user" ] } ], "source": [ "#Prerequisites: \n", "# 1. Create Glue Dev Endpoint (G.2X), full S3 access\n", "# 2. Connect to that dev endpoint ith your sagemaker frontend.\n", "# 3. Make sure that your Notebook's IAM role has S3 Write access if you will be using the terminal (S3FullAccess works)\n", "# 3b. Make sure that your Notebook's IAM role has the GlueServiceRole attached as well since we will be making some Glue calls\n", "# 4. Create a database for your files and edit the glue_database variable if different than 'reinvent-2019'\n", "# 5. All previous notebook steps\n", "# 6. 
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import sys\n", "from awsglue.transforms import *\n", "from awsglue.utils import getResolvedOptions\n", "from pyspark.context import SparkContext\n", "from awsglue.context import GlueContext\n", "from awsglue.job import Job\n", "\n", "glueContext = GlueContext(SparkContext.getOrCreate())" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#TODO: Update with your own information, synchronize across notebooks.\n", "\n", "my_s3_bucket = \"find-matches-demo\"\n", "project_prefix = \"scholarly_demo\"\n", "glue_database = \"reinvent-2019\"\n", "glue_table = 'dblp_scholar_records_jsonl'\n", "region = 'us-east-1'\n", "glue_role = 'AWSGlueServiceRoleDefault'\n", "glue_source_crawler = project_prefix + \"_source_crawler\"\n", "transform_name = \"reinvent_2019_demo_transform\"\n", "transform_id = \"tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "'scholarly_demo_source_crawler'" ] } ], "source": [ "glue_source_crawler" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Command to copy the source records into your own S3 bucket: \n", "\n", "aws s3 cp s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/records/dblp_scholar_records.jsonl s3://find-matches-demo/scholarly_demo/source/dblp_scholar_records.jsonl" ] } ], "source": [ "print(\"Command to copy the source records into your own S3 bucket: \\n\")\n", "print(\"aws s3 cp \" + \n", "      \"s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/records/dblp_scholar_records.jsonl \" + \n", "      \"s3://\" + my_s3_bucket + \"/\" + project_prefix + \"/source/dblp_scholar_records.jsonl\")" ] },
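{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The Glue CLI commands printed below can also be issued directly from this notebook\n", "# with boto3. A minimal sketch, assuming the endpoint's role carries the Glue and S3\n", "# permissions from the prerequisites; this 'client' is also used by the schema\n", "# sanity check further down.\n", "import boto3\n", "\n", "client = boto3.client('glue', region_name=region)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": {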
"application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CLI command to run the crawler\n", "\n", "aws glue create-crawler --name scholarly_demo_source_crawler --role AWSGlueServiceRoleDefault --database-name reinvent-2019 --targets '{\"S3Targets\": [{\"Path\": \"s3://find-matches-demo/scholarly_demo/source/dblp_scholar_records.jsonl\"}]}'" ] } ], "source": [ "# Create a crawler and run it against the file to load the data reference into the Glue/LF Data Catalog\n", "# This is easy to do in the AWS Console, or you can also do this via AWS CLI as per below.\n", "\n", "s3_targets = {\n", " 'S3Targets': [\n", " {\n", " 'Path': \"s3://\" + my_s3_bucket + \"/\" + project_prefix + \"/source/dblp_scholar_records.jsonl\",\n", " },\n", " ],\n", " }\n", "\n", "print(\"CLI command to create the crawler\\n\")\n", "print(f\"aws glue create-crawler --name {glue_source_crawler} --role {glue_role} \" +\n", " f'--database-name {glue_database} '\n", " '--targets \\'{\"S3Targets\": [{\"Path\": \"s3://'+my_s3_bucket+'/'+project_prefix+'/source/dblp_scholar_records.jsonl\"}]}\\'')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CLI command to run the crawler\n", "\n", "aws glue start-crawler --name scholarly_demo_source_crawler" ] } ], "source": [ "# Run the crawler\n", "\n", "print(\"CLI command to run the crawler\\n\")\n", "print(f\"aws glue start-crawler --name {glue_source_crawler}\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CLI command to check on the crawler status so we can wait until it finishes\n", "\n", "aws glue get-crawler --name scholarly_demo_source_crawler" ] } ], "source": [ "# Wait for crawl to finish\n", "\n", "print(\"CLI command to check on the crawler status so we can wait until it finishes\\n\")\n", "print(f\"aws glue get-crawler --name {glue_source_crawler}\")\n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[ {'Name': 'id', 'Type': 'string'},\n", " {'Name': 'title', 'Type': 'string'},\n", " {'Name': 'authors', 'Type': 'string'},\n", " {'Name': 'venue', 'Type': 'string'},\n", " {'Name': 'year', 'Type': 'double'},\n", " {'Name': 
'source', 'Type': 'string'}]" ] } ], "source": [ "# Take a look at the table schema for a sanity check (uses the boto3 Glue client created above):\n", "import pprint\n", "\n", "response = client.get_table(\n", "    DatabaseName=glue_database,\n", "    Name=glue_table\n", ")\n", "\n", "pp = pprint.PrettyPrinter(indent=4)\n", "pp.pprint(response['Table']['StorageDescriptor']['Columns'])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Source dataset length: 66879\n", "+--------------------+--------------------+--------------------+-------------+------+------+\n", "| id| title| authors| venue| year|source|\n", "+--------------------+--------------------+--------------------+-------------+------+------+\n", "|conf_vldb_Rusinki...|Towards a Coopera...|M Rusinkiewicz, W...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_E...|SQL/XML is Making...|A Eisenberg, J Me...|SIGMOD Record|2002.0| DBLP|\n", "|conf_vldb_AmmannJR95|Using Formal Meth...|P Ammann, S Jajod...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_L...| Editor's Notes| L Liu|SIGMOD Record|2002.0| DBLP|\n", "|journals_sigmod_H...|Report on the ACM...| null| null|2002.0| DBLP|\n", "|conf_vldb_Ferrand...|Schema and Databa...|F Ferrandina, T M...| VLDB|1995.0| DBLP|\n", "|conf_vldb_Subieta...|Procedures in Obj...|K Subieta, Y Kamb...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_B...|Phoenix Project: ...| R Barga, D Lomet|SIGMOD Record|2002.0| DBLP|\n", "|journals_sigmod_O...|Mining the World ...| null| null|2002.0| DBLP|\n", "| conf_vldb_MoserKK95|L/MRP: A Buffer M...|F Moser, A Kraiss...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_K...|Report on the NSF...| null| null|2002.0| DBLP|\n", "|conf_vldb_Chaudhu...|Retrieval of Comp...|S Chaudhuri, S Gh...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_K...|MPEG-7 and Multim...| H Kosch|SIGMOD Record|2002.0| DBLP|\n", "| conf_vldb_LuTD95|The Fittest Survi...| H Lu, K Tan, S Dao| VLDB|1995.0| DBLP|\n", "|journals_sigmod_R...|Reminiscences on ...| null| null|2002.0| DBLP|\n", "|conf_vldb_TreschPL95|Type Classificati...|M Tresch, N Palme...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_G...|Data Mining: Prac...| J Geller|SIGMOD Record|2002.0| DBLP|\n", "| conf_vldb_MehtaD95|Managing Intra-op...| M Mehta, D DeWitt| VLDB|1995.0| DBLP|\n", "|journals_sigmod_C...|Advances in Datab...|A Caplinskas, J E...|SIGMOD Record|2002.0| DBLP|\n", "|conf_vldb_SrikantA95|Mining Generalize...|R Srikant, R Agrawal| VLDB|1995.0| DBLP|\n", "+--------------------+--------------------+--------------------+-------------+------+------+\n", "only showing top 20 rows" ] } ], "source": [ "#Looking good, so let's take a look at the actual data:\n", "\n", "source = glueContext.create_dynamic_frame.from_catalog(database=glue_database, table_name=glue_table).toDF()\n", "print(f\"Source dataset length: {source.count()}\")\n", "source.show()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, 
"output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Scholar dataset length: 64263\n", "+--------------------+--------------------+--------------------+--------------------+------+-------+\n", "| id| title| authors| venue| year| source|\n", "+--------------------+--------------------+--------------------+--------------------+------+-------+\n", "| OOZb-r3GUTMJ|Genera crustaceor...| PA Latreille| Tomus I,| null|Scholar|\n", "| 9F4uvDaInLAJ|Towards Generic S...|M Garschhammer, R...|Seattle, Washingt...| null|Scholar|\n", "| z0OXMVQB5pIJ|Implicit Stereoty...|MR Banaji, C Hard...|JOURNAL OF PERSON...|1993.0|Scholar|\n", "| Bn5h0IbqbNwJ|i2i Trust in E-co...| JS Olson, GM Olson| COMMUNICATIONS-ACM,|2000.0|Scholar|\n", "| mrxAHi3pQgAJ|Systematics of th...|RE Jenkins, BJ Fr...| Unpublished| null|Scholar|\n", "|url:http:__portal...|A content based i...|M Uysal, F Yarman...|Proceedings of th...|2004.0|Scholar|\n", "| x8s3QpPxgGYJ|PROMISE: peer-to-...|M Hefeeda, A Habi...|Proceedings of AC...|2003.0|Scholar|\n", "| -ZAKITqaNZUJ|SLAB: A software-...| JD Miller| null| null|Scholar|\n", "| hAiGk_XI3B4J|Guest Editor's In...| D Hix| IEEE Software,|1989.0|Scholar|\n", "| GCVMmOSZn0AJ|Pat Selinger Spea...|P Selinger, M Win...| SIGMOD Record,| null|Scholar|\n", "| 0V8E_ij2SpYJ|Real-time numeric...| F Blais, M Rioux| SIGNAL PROC.,|1986.0|Scholar|\n", "| 8Ha8o2X9l2IJ|JavaScript 1.2 Ad...| S Mintert| null| null|Scholar|\n", "| a2wqpKma5J4J|Chain : Operator ...|B Babcock, S Babu...| SIGMOD Conference,|2003.0|Scholar|\n", "| dvjQs4Y6H_gJ|Preliminary resea...| GB Teague, C Macias| null|1990.0|Scholar|\n", "| kwyhOo5rlJ8J|Web-based casinos...| K Nash| null| null|Scholar|\n", "| WyCQmiQwZbMJ|Smith. BL (1990)....|F Gabelnick, J Ma...|New Directions in...| null|Scholar|\n", "|url:http:__portal...|Threading Stories...| X Wu|Proceedings of th...|2005.0|Scholar|\n", "| JVGBOCQFD04J|ICAME Conference ...| F Varret√¢?¬¶| null|1998.0|Scholar|\n", "| rtmpVZ4_bXMJ| Portal page bonanza| ZD Publishing| PC Magazine,| null|Scholar|\n", "| 9cyBWYOQlAoJ|IT Changes Hit Su...| ML Songini| Computerworld,| null|Scholar|\n", "+--------------------+--------------------+--------------------+--------------------+------+-------+\n", "only showing top 20 rows" ] } ], "source": [ "#Look at some details specifically from from Scholar\n", "\n", "print (\"Scholar dataset length: \" + str(source.filter(source.source == 'Scholar').count()) );\n", "source.filter(source.source == 'Scholar').sample(False,.01).show()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "DBLP dataset length: 0\n", "+--------------------+--------------------+--------------------+-----------------+------+------+\n", "| id| title| authors| venue| year|source|\n", "+--------------------+--------------------+--------------------+-----------------+------+------+\n", "|conf_vldb_MohaniaS94|Some Issues in De...| M Mohania, N Sarda| VLDB|1994.0| DBLP|\n", "| conf_vldb_SetzerZ94|New Concurrency C...| V Setzer, A Zisman| VLDB|1994.0| DBLP|\n", "|journals_sigmod_S...|Using Unknowns to...|Y Saygin, V Veryk...| SIGMOD Record|2001.0| DBLP|\n", 
"|conf_sigmod_WangW...|Clustering by pat...|H Wang, W Wang, J...|SIGMOD Conference|2002.0| DBLP|\n", "|conf_sigmod_Cresc...|RoadRunner: autom...|V Crescenzi, G Me...|SIGMOD Conference|2002.0| DBLP|\n", "|journals_sigmod_P...|Database Research...|D Phatak, N Sarda...| SIGMOD Record|1996.0| DBLP|\n", "| conf_sigmod_YuMWL01|Efficient and Eff...|C Yu, W Meng, W W...|SIGMOD Conference|2001.0| DBLP|\n", "|conf_sigmod_Davul...|A Layered Archite...|H Davulcu, J Frei...|SIGMOD Conference|1999.0| DBLP|\n", "|conf_sigmod_Hidber99|Online Associatio...| C Hidber|SIGMOD Conference|1999.0| DBLP|\n", "| conf_vldb_TanEO01|Efficient Progres...| K Tan, P Eng, B Ooi| VLDB|2001.0| DBLP|\n", "|conf_sigmod_Godfr...|Secure and Portab...|M Godfrey, T Mayr...|SIGMOD Conference|1998.0| DBLP|\n", "|conf_vldb_AmbiteK...|The WorlInfo Assi...|J Ambite, C Knobl...| VLDB|2001.0| DBLP|\n", "|conf_vldb_LabioYC...|Performance Issue...|W Labio, J Yang, ...| VLDB|2000.0| DBLP|\n", "|conf_vldb_Deutsch...|Physical Data Ind...|A Deutsch, L Popa...| VLDB|1999.0| DBLP|\n", "|conf_vldb_ChenCFG...|High Level Indexi...|W Chen, J Chow, Y...| VLDB|1999.0| DBLP|\n", "| conf_sigmod_DanS95|An Online Video P...| A Dan, D Sitaram|SIGMOD Conference|1995.0| DBLP|\n", "|conf_vldb_Abitebo...|XML Repository an...|S Abiteboul, V Ag...| VLDB|1999.0| DBLP|\n", "|conf_vldb_Alsabti...|A One-Pass Algori...|K Alsabti, S Rank...| VLDB|1997.0| DBLP|\n", "|conf_vldb_AshwinR...|Garbage Collectio...|S Ashwin, P Roy, ...| VLDB|1997.0| DBLP|\n", "|conf_vldb_Srivast...|Answering Queries...|D Srivastava, S D...| VLDB|1996.0| DBLP|\n", "+--------------------+--------------------+--------------------+-----------------+------+------+\n", "only showing top 20 rows" ] } ], "source": [ "#Look at some details specifically from from DBLP\n", "\n", "print (\"DBLP dataset length: \" + str(source.filter(source.source == 'DPLP').count()) );\n", "source.filter(source.source == 'DBLP').sample(False,.01).show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Sparkmagic (PySpark)", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 2 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python2" } }, "nbformat": 4, "nbformat_minor": 2 }