{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
8application_1574712114143_0009pysparkidleLinkLink
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "AWS pip upgrade command \n", "\n", "pip3 install awscli --upgrade --user" ] } ], "source": [ "#Prerequisites: \n", "# 1. Create Glue Dev Endpoint (G.2X), full S3 access\n", "# 2. Connect to that dev endpoint with your SageMaker frontend.\n", "# 3. Make sure that your Notebook's IAM role has S3 Write access if you will be using the terminal (S3FullAccess works)\n", "# 3b. Make sure that your Notebook's IAM role has the GlueServiceRole attached as well since we will be making some Glue calls\n", "# 4. Create a database for your files and edit the glue_database variable if different than 'reinvent-2019'\n", "# 5. All previous notebook steps\n", "# 6. 
Open up a terminal within Jupyter (New -> Terminal) to enter the CLI commands in this demo.\n", "\n", "#Currently required: You will need to install a new/current version of the aws cli in your terminal window:\n", "print(\"AWS pip upgrade command \\n\")\n", "print('pip3 install awscli --upgrade --user')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import sys\n", "from awsglue.transforms import *\n", "from awsglue.utils import getResolvedOptions\n", "from pyspark.context import SparkContext\n", "from pyspark.sql.functions import col\n", "from awsglue.context import GlueContext\n", "from awsglue.job import Job\n", "from awsglueml.transforms import FindMatches\n", "\n", "glueContext = GlueContext(SparkContext.getOrCreate())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#TODO: Update with your own information, synchronize across notebooks.\n", "\n", "my_s3_bucket = \"find-matches-demo\"\n", "project_prefix = \"scholarly_demo\"\n", "glue_database = \"reinvent-2019\"\n", "glue_table = 'dblp_scholar_records_jsonl'\n", "region = 'us-east-1'\n", "glue_role = 'AWSGlueServiceRoleDefault'\n", "glue_source_crawler = project_prefix + \"_source_crawler\"\n", "transform_name = \"reinvent_2019_demo_transform\"\n", "transform_id= \"tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7\"" ] }, { 
"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+--------------------+--------------------+------+-------+--------+\n", "| id| title| authors| venue| year| source|match_id|\n", "+--------------------+--------------------+--------------------+--------------------+------+-------+--------+\n", "| conf_vldb_LuSL95|NeuroRule: A Conn...|H Lu, R Setiono, ...| VLDB|1995.0| DBLP| 26|\n", "|journals_sigmod_C...|An Active Functio...| M Cilia, A Buchmann| SIGMOD Record|2002.0| DBLP| 29|\n", "|conf_sigmod_Altin...|DBCache: database...|M Altinel, Q Luo,...| SIGMOD Conference|2002.0| DBLP| 474|\n", "|conf_sigmod_Runde...|Evolvable View En...|E Rundensteiner, ...| SIGMOD Conference|1999.0| DBLP| 964|\n", "|conf_vldb_Vassalo...|Describing and Us...|V Vassalos, Y Pap...| VLDB|1997.0| DBLP| 1677|\n", "|conf_vldb_Gibbons...|Fast Incremental ...|P Gibbons, Y Mati...| VLDB|1997.0| DBLP| 1697|\n", "|conf_sigmod_Singh...|Relaxed Transacti...|M Singh, C Tomlin...| SIGMOD Conference|1994.0| DBLP| 1806|\n", "|conf_vldb_Raghavan96|Loading the Data ...| V Raghavan| VLDB|1996.0| DBLP| 1950|\n", "|conf_vldb_BergerB...|Xcerpt and visXce...|S Berger, F Bry, ...| VLDB|2003.0| DBLP| 2040|\n", "|conf_vldb_Ludasch...|A Transducer-Base...|B Lud‰scher, P Mu...| VLDB|2002.0| DBLP| 2214|\n", "|journals_tods_Jag...|The INCINERATE Da...| H Jagadish|ACM Trans. 
Databa...|1995.0| DBLP| 2250|\n", "|journals_vldb_Mih...|Locating and acce...|G Mihaila, L Rasc...| VLDB J.|2002.0| DBLP| 2453|\n", "|journals_vldb_Kel...|A Predicate-based...| A Keller, J Basu| VLDB J.|1996.0| DBLP| 2509|\n", "|journals_vldb_Har...|Join Algorithm Co...|E Harris, K Ramam...| VLDB J.|1996.0| DBLP| 2529|\n", "| fp7YtnQpu9EJ|DYNAMIC PERFORMAN...| YS Wang| null| null|Scholar| 2927|\n", "| 5lw-_yAlz-oJ|Local Corresponde...|JK Guo, D Doerman...| null| null|Scholar| 3091|\n", "| 8g5MKy4ZkfwJ|A generalized mod...| SK Gadia, CS Yeung|Proceedings of th...|1988.0|Scholar| 3506|\n", "| RTHoDofpcw4J|Groupware: Let√¢?...|L Lindop, T Relph...|PC Magazine, August,| null|Scholar| 3764|\n", "| gwaG9siVZwMJ|Customer Service ...| PF Harrison|BT Technology Jou...|1997.0|Scholar| 4590|\n", "| gH_HDDJghbAJ|Incremental Maint...| J Su| SIGMOD Record,|2000.0|Scholar| 270|\n", "+--------------------+--------------------+--------------------+--------------------+------+-------+--------+\n", "only showing top 20 rows" ] } ], "source": [ "# Alright, at this point we should have a trained Matching ML Transform and even have a rough estimate of the quality\n", "# of that matcher's ML model. Let's put it to work and do some actual matching.\n", "# \n", "# In production, FindMatches Transforms are executed by creating a Glue Job and then calling the Transform within that \n", "# Glue Job. You can create a Glue Job to call this transform fairly easily by using the \"Create Job\" wizard in the\n", "# Glue Console. 
However, if you are running this notebook while connected to a Glue Dev Endpoint, you can also simply\n", "# execute the matcher inline, which we do below:\n", "\n", "\n", "source_data = glueContext.create_dynamic_frame.from_catalog(database = glue_database, table_name = glue_table)\n", "\n", "matched_data = FindMatches.apply(frame = source_data, transformId = transform_id)\n", "\n", "#datasink2 = glueContext.write_dynamic_frame.from_options(frame = findmatches1, connection_type = \"s3\", connection_options = {\"path\": \"s3://find-matches-debug/Scholar/output\"}, format = \"csv\", transformation_ctx = \"datasink2\")\n", "\n", "matched_data.toDF().show()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+---------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------------------------+------+-------+--------+\n", "| id| title| authors| venue| year| source|match_id|\n", "+---------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------------------------+------+-------+--------+\n", "| conf_vldb_RusinkiewiczKTWM95| Towards a Cooperative Transaction Model - The Cooperative Activity 
Model| M Rusinkiewicz, W Klas, T Tesch, J W‰sch, P Muth| VLDB|1995.0| DBLP| 0|\n", "| T2fm7Wb1ak4J| Towards a Cooperative Transaction Model-The Cooperative Activity Model| M Rusinkiewicz, W Klas, T Tesch, J Waesch, P Muth|PROCEEDINGS OF THE INTERNATIONAL CONFERENCE ON VERY LARGE …,|1995.0|Scholar| 0|\n", "| journals_sigmod_EisenbergM02| SQL/XML is Making Good Progress| A Eisenberg, J Melton| SIGMOD Record|2002.0| DBLP| 1|\n", "| wgK6p4mDSIMJ| SQL/XML is Making Good Progress| A Eisenberg, J Melton| SIGMOD Record,|2002.0|Scholar| 1|\n", "| x-H7BqZ0Hw8J| Using Formal Methods to Reason about Semantics-Based Decompositions of Transactions| P Ammann, S Jajodia, I Ray|PROCEEDINGS OF THE INTERNATIONAL CONFERENCE ON VERY LARGE …,|1995.0|Scholar| 2|\n", "| conf_vldb_AmmannJR95| Using Formal Methods to Reason about Semantics-Based Decompositions of Transactions| P Ammann, S Jajodia, I Ray| VLDB|1995.0| DBLP| 2|\n", "| journals_sigmod_Liu02| Editor's Notes| L Liu| SIGMOD Record|2002.0| DBLP| 3|\n", "| journals_sigmod_Liu02b| Editor's Notes| L Liu| SIGMOD Record|2002.0| DBLP| 3|\n", "| journals_sigmod_Liu02a| Editor's Notes| L Liu| SIGMOD Record|2002.0| DBLP| 3|\n", "| journals_sigmod_Liu02c| Editor's Notes| L Liu| SIGMOD Record|2002.0| DBLP| 3|\n", "|url:http:__portal.acm.org_ft_gateway.cfm%3Fid%3D584455%26type%3Dpdf%26dl%3DGUIDE%26dl%3DACM%26CFID%3D11111111%26CFTOKEN%3D2222222|Report on the ACM Fourth International Workshop on Data Warehousing and OLAP (DOLAP 2001)| J Hammer| SIGMOD Record,|2002.0|Scholar| 4|\n", "| journals_sigmod_Hammer02|Report on the ACM Fourth International Workshop on Data Warehousing and OLAP (DOLAP 2001)| null| null|2002.0| DBLP| 4|\n", "| conf_vldb_FerrandinaMZFM95| Schema and Database Evolution in the O2 Object Database System|F Ferrandina, T Meyer, R Zicari, G Ferran, J Madec| VLDB|1995.0| DBLP| 5|\n", "| 9rofzgQ6HtcJ| Schema and Database Evolution in the O√Ǭæ Object Database System|F Ferrandina, T Meyer, R Zicari, G Ferran, J Madec| Proc. 
of the 21st Int√¢??l Conf. on Very Large Databases (| null|Scholar| 5|\n", "| gEFY87Ma0XUJ| Procedures in Object-Oriented Query Languages| K Subieta, Y Kambayashi, J Leszczylowski|PROCEEDINGS OF THE INTERNATIONAL CONFERENCE ON VERY LARGE …,|1995.0|Scholar| 6|\n", "| conf_vldb_SubietaKL95| Procedures in Object-Oriented Query Languages| K Subieta, Y Kambayashi, J Leszczylowski| VLDB|1995.0| DBLP| 6|\n", "| journals_sigmod_BargaL02| Phoenix Project: Fault-Tolerant Applications| R Barga, D Lomet| SIGMOD Record|2002.0| DBLP| 7|\n", "| QTzV3iNq2O8J| Phoenix Project: Fault-Tolerant Applications| R Barga, D Lomet| SIGMOD Record,|2002.0|Scholar| 7|\n", "| fY3kkkzBLw8J| Phoenix Project: Fault Tolerant Applications| D Lomet, R Barga| SIGMOD Record,| null|Scholar| 7|\n", "| journals_sigmod_Ouksel02| Mining the World Wide Web: An Information Search Approach - Book Review| null| null|2002.0| DBLP| 8|\n", "+---------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------------------------+------+-------+--------+\n", "only showing top 20 rows" ] } ], "source": [ "# Let's take a look at the output sorted by match ID\n", "matched_data.toDF().sort(col(\"match_id\")).show(truncate=200)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+--------+-----+\n", "|match_id|count|\n", "+--------+-----+\n", "| 7200| 26|\n", "| 19| 24|\n", "| 3140| 21|\n", "| 
6910| 19|\n", "| 565| 16|\n", "| 553| 15|\n", "| 1437| 15|\n", "| 1419| 14|\n", "| 226| 14|\n", "| 1954| 14|\n", "| 1409| 14|\n", "| 10838| 14|\n", "| 9262| 13|\n", "| 1675| 13|\n", "| 1133| 13|\n", "| 1543| 12|\n", "| 810| 12|\n", "| 11489| 11|\n", "| 1212| 11|\n", "| 1704| 11|\n", "+--------+-----+\n", "only showing top 20 rows" ] } ], "source": [ "#How about we take a look and see what the biggest clusters are, in case we're worried about over matching:\n", "\n", "matched_data.toDF().groupBy(col(\"match_id\")).count().sort(col(\"count\").desc()).show()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+------------+--------------------+--------------------+--------------------+----+-------+--------+\n", "| id| title| authors| venue|year| source|match_id|\n", "+------------+--------------------+--------------------+--------------------+----+-------+--------+\n", "|5bt5DO88llIJ|The R*-Tree: An E...|N Beckermann, H K...|Proc. ACM SIGMOD ...|null|Scholar| 7200|\n", "|rqZOBLs-GwkJ|The gih-tree: an ...|N Beckmann, HP Kr...| Proceedings of ACM|null|Scholar| 7200|\n", "|He1UKqU3NSkJ|andB. Seeger. The...|N Beckmann, HP Kr...|Proc. SIGMOD Conf...|null|Scholar| 7200|\n", "|R94wj3KT1Q8J|The R*-tree: An E...|N Beckmann, HP Kr...|Proc. 1990 ACM SI...|null|Scholar| 7200|\n", "|8EInhbOVFgMJ|The R*-Tree: An E...| N BACKMANN| ACM Sigmod,|null|Scholar| 7200|\n", "|2BYfXDOWCuAJ|The R*-tree: An E...|NBHP Kriegel, R S...|Proceedings of th...|null|Scholar| 7200|\n", "|gZPjGlHNMXsJ|The R√¢??-tree: a...|N Beckmann, HP Kr...| Proc. 
ACM SIGMOD|null|Scholar| 7200|\n", "|Vi1lYeDZOI0J|The R√¢??-tree: A...|N Beckmann, HP Kr...|Proceedings ACM S...|null|Scholar| 7200|\n", "|y2oalMrypiUJ|1990 . The R -tre...|N Beckmann, HP Kr...|Proc. ACM SIGMOD ...|null|Scholar| 7200|\n", "|w-jrGadOafcJ|The R* tree: an e...|N Beckmann, HP Kr...|ACM SIGMOD Conf. ...|null|Scholar| 7200|\n", "|reFDs4Up9ksJ|The R -tree: An e...|N Beckmann, HP Kr...|… Interna...|null|Scholar| 7200|\n", "|9FO_pFxsTg8J|newblock The R*-T...|N Beckmann, HP Kr...| Proc. 1990|null|Scholar| 7200|\n", "|2Yy0kQZvT0EJ|The R-tree: AnEf ...|N Beckmann, HP Kr...|Proc. of the ACM ...|null|Scholar| 7200|\n", "|63mcayOUdEoJ|The r*-tree: An e...| N Katayama, S Satoh| Proc. of SIGMOD,|null|Scholar| 7200|\n", "|ybAqpWVtoTUJ|The R*-tree: An E...|N Beckmann, HP Kr...| ACM-SIGMOD|null|Scholar| 7200|\n", "|fa67VqqHqZwJ|The R -tree: An e...|N Beckmann, HP Kr...|Proc. 1990 ACM-SI...|null|Scholar| 7200|\n", "|cNFeeVXbJDYJ|The R*-tree: An E...|N Bechmann, HP Kr...| ACM SIGMOD,|null|Scholar| 7200|\n", "|EnycLFKkzpUJ|The R*-tree: An E...|T Brinkhoff, HP K...| Proc. ACM SIGMOD,|null|Scholar| 7200|\n", "|yzAMAexU8AAJ|Bernhard √¢??The ...|N Beckmann, HP Kr...|Proc. ACM-SIGMOD ...|null|Scholar| 7200|\n", "|vK2DhXE23DEJ|others. 
The R*-tr...| N Beckmann|Proceedings of th...|null|Scholar| 7200|\n", "+------------+--------------------+--------------------+--------------------+----+-------+--------+\n", "only showing top 20 rows" ] } ], "source": [ "#That big cluster looks a little suspicious, let's check it out:\n", "\n", "matched_data.toDF().filter(col('match_id') == '7200').show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Where to go from here?\n", "#\n", "# * Improve metrics by adding additional labels either manually or by generating additional labelling sets and\n", "# filling out their label columns and uploading.\n", "# * Tweak the desired precisionRecallTradeoff or the accuracyCostTradeoff to match your business case\n", "# * Consider enforcing matches if you have some troublesome matches that the ML isn't quite learning and you need\n", "# to guarantee output on particular matches (there is some performance penalty)\n", "# * Integrate into your standard ETL pipeline by calling the matcher from a standard Glue Job." ] } ], "metadata": { "kernelspec": { "display_name": "Sparkmagic (PySpark)", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 2 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python2" } }, "nbformat": 4, "nbformat_minor": 2 }