{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting Spark application\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<tr><th>ID</th><th>YARN Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th><th>Current session?</th></tr><tr><td>8</td><td>application_1574712114143_0009</td><td>pyspark</td><td>idle</td><td><a target=\"_blank\" href=\"http://ip-172-32-22-241.ec2.internal:20888/proxy/application_1574712114143_0009/\">Link</a></td><td><a target=\"_blank\" href=\"http://ip-172-32-22-202.ec2.internal:8042/node/containerlogs/container_1574712114143_0009_01_000001/livy\">Link</a></td><td>✔</td></tr></table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SparkSession available as 'spark'.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AWS pip upgrade command \n",
      "\n",
      "pip3 install awscli --upgrade --user"
     ]
    }
   ],
   "source": [
    "#Prerequisites: \n",
    "#  1. Create Glue Dev Endpoint (G.2X), full S3 access\n",
    "#  2. Connect to that dev endpoint ith your sagemaker frontend.\n",
    "#  3. Make sure that your Notebook's IAM role has S3 Write access if you will be using the terminal (S3FullAccess works)\n",
    "#  3b. Make sure that your Notebook's IAM role has the GlueServiceRole attached as well since we will be making some Glue calls\n",
    "#  4. Create a database for your files and edit the glue_database variable if different than 'reinvent-2019'\n",
    "#  5. All previous notebook steps\n",
    "#  6. Open up a terminal within Jupyter (New -> Terminal) to enter the CLI commands in this demo.\n",
    "\n",
    "#Currently required: You will need to install a new/current version of the aws cli in your terminal window:\n",
    "print(\"AWS pip upgrade command \\n\")\n",
    "print('pip3 install awscli --upgrade --user')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import sys\n",
    "from awsglue.transforms import *\n",
    "from awsglue.utils import getResolvedOptions\n",
    "from pyspark.context import SparkContext\n",
    "from pyspark.sql.functions import col\n",
    "from awsglue.context import GlueContext\n",
    "from awsglue.job import Job\n",
    "from awsglueml.transforms import FindMatches\n",
    "\n",
    "glueContext = GlueContext(SparkContext.getOrCreate())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#TODO: Update with your own information, synchronize across notebooks.\n",
    "\n",
    "my_s3_bucket = \"find-matches-demo\"\n",
    "project_prefix = \"scholarly_demo\"\n",
    "glue_database = \"reinvent-2019\"\n",
    "glue_table = 'dblp_scholar_records_jsonl'\n",
    "region = 'use-east-1'\n",
    "glue_role = 'AWSGlueServiceRoleDefault'\n",
    "glue_source_crawler = project_prefix + \"_source_crawler\"\n",
    "transform_name = \"reinvent_2019_demo_transform\"\n",
    "transform_id=  \"tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+------+-------+--------+\n",
      "|                  id|               title|             authors|               venue|  year| source|match_id|\n",
      "+--------------------+--------------------+--------------------+--------------------+------+-------+--------+\n",
      "|    conf_vldb_LuSL95|NeuroRule: A Conn...|H Lu, R Setiono, ...|                VLDB|1995.0|   DBLP|      26|\n",
      "|journals_sigmod_C...|An Active Functio...| M Cilia, A Buchmann|       SIGMOD Record|2002.0|   DBLP|      29|\n",
      "|conf_sigmod_Altin...|DBCache: database...|M Altinel, Q Luo,...|   SIGMOD Conference|2002.0|   DBLP|     474|\n",
      "|conf_sigmod_Runde...|Evolvable View En...|E Rundensteiner, ...|   SIGMOD Conference|1999.0|   DBLP|     964|\n",
      "|conf_vldb_Vassalo...|Describing and Us...|V Vassalos, Y Pap...|                VLDB|1997.0|   DBLP|    1677|\n",
      "|conf_vldb_Gibbons...|Fast Incremental ...|P Gibbons, Y Mati...|                VLDB|1997.0|   DBLP|    1697|\n",
      "|conf_sigmod_Singh...|Relaxed Transacti...|M Singh, C Tomlin...|   SIGMOD Conference|1994.0|   DBLP|    1806|\n",
      "|conf_vldb_Raghavan96|Loading the Data ...|          V Raghavan|                VLDB|1996.0|   DBLP|    1950|\n",
      "|conf_vldb_BergerB...|Xcerpt and visXce...|S Berger, F Bry, ...|                VLDB|2003.0|   DBLP|    2040|\n",
      "|conf_vldb_Ludasch...|A Transducer-Base...|B Lud‰scher, P Mu...|                VLDB|2002.0|   DBLP|    2214|\n",
      "|journals_tods_Jag...|The INCINERATE Da...|          H Jagadish|ACM Trans. Databa...|1995.0|   DBLP|    2250|\n",
      "|journals_vldb_Mih...|Locating and acce...|G Mihaila, L Rasc...|             VLDB J.|2002.0|   DBLP|    2453|\n",
      "|journals_vldb_Kel...|A Predicate-based...|    A Keller, J Basu|             VLDB J.|1996.0|   DBLP|    2509|\n",
      "|journals_vldb_Har...|Join Algorithm Co...|E Harris, K Ramam...|             VLDB J.|1996.0|   DBLP|    2529|\n",
      "|        fp7YtnQpu9EJ|DYNAMIC PERFORMAN...|             YS Wang|                null|  null|Scholar|    2927|\n",
      "|        5lw-_yAlz-oJ|Local Corresponde...|JK Guo, D Doerman...|                null|  null|Scholar|    3091|\n",
      "|        8g5MKy4ZkfwJ|A generalized mod...|  SK Gadia, CS Yeung|Proceedings of th...|1988.0|Scholar|    3506|\n",
      "|        RTHoDofpcw4J|Groupware: Let√¢?...|L Lindop, T Relph...|PC Magazine, August,|  null|Scholar|    3764|\n",
      "|        gwaG9siVZwMJ|Customer Service ...|         PF Harrison|BT Technology Jou...|1997.0|Scholar|    4590|\n",
      "|        gH_HDDJghbAJ|Incremental Maint...|                J Su|      SIGMOD Record,|2000.0|Scholar|     270|\n",
      "+--------------------+--------------------+--------------------+--------------------+------+-------+--------+\n",
      "only showing top 20 rows"
     ]
    }
   ],
   "source": [
    "# Alright, at this point we should have a trained Matching ML Transform and even have a rough estimate of the quality\n",
    "# of that matcher's ML model. Let's put it to work and do some actual matching.\n",
    "# \n",
    "# In production, FindMatches Transforms are executed by create a Glue Job and then calling the Transform within that \n",
    "# Glue Job. You can create a Glue Job to call this transform fairly easily by using the \"Create Job\" wizard in the\n",
    "# Glue Console. However, if you are running this notebook while connected to a Glue Dev Endpoint, you can also simply\n",
    "# execute the matcher inline, which we do below:\n",
    "\n",
    "\n",
    "source_data = glueContext.create_dynamic_frame.from_catalog(database = glue_database, table_name = glue_table)\n",
    "\n",
    "matched_data = FindMatches.apply(frame = source_data, transformId = transform_id)\n",
    "\n",
    "#datasink2 = glueContext.write_dynamic_frame.from_options(frame = findmatches1, connection_type = \"s3\", connection_options = {\"path\": \"s3://find-matches-debug/Scholar/output\"}, format = \"csv\", transformation_ctx = \"datasink2\")\n",
    "\n",
    "matched_data.toDF().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------------------------+------+-------+--------+\n",
      "|                                                                                                                               id|                                                                                    title|                                           authors|                                                               venue|  year| source|match_id|\n",
      "+---------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------------------------+------+-------+--------+\n",
      "|                                                                                                     conf_vldb_RusinkiewiczKTWM95|                 Towards a Cooperative Transaction Model - The Cooperative Activity Model|  M Rusinkiewicz, W Klas, T Tesch, J W‰sch, P Muth|                                                                VLDB|1995.0|   DBLP|       0|\n",
      "|                                                                                                                     T2fm7Wb1ak4J|                   Towards a Cooperative Transaction Model-The Cooperative Activity Model| M Rusinkiewicz, W Klas, T Tesch, J Waesch, P Muth|PROCEEDINGS OF THE INTERNATIONAL CONFERENCE ON VERY LARGE  &hellip;,|1995.0|Scholar|       0|\n",
      "|                                                                                                     journals_sigmod_EisenbergM02|                                                          SQL/XML is Making Good Progress|                             A Eisenberg, J Melton|                                                       SIGMOD Record|2002.0|   DBLP|       1|\n",
      "|                                                                                                                     wgK6p4mDSIMJ|                                                          SQL/XML is Making Good Progress|                             A Eisenberg, J Melton|                                                      SIGMOD Record,|2002.0|Scholar|       1|\n",
      "|                                                                                                                     x-H7BqZ0Hw8J|      Using Formal Methods to Reason about Semantics-Based Decompositions of Transactions|                        P Ammann, S Jajodia, I Ray|PROCEEDINGS OF THE INTERNATIONAL CONFERENCE ON VERY LARGE  &hellip;,|1995.0|Scholar|       2|\n",
      "|                                                                                                             conf_vldb_AmmannJR95|      Using Formal Methods to Reason about Semantics-Based Decompositions of Transactions|                        P Ammann, S Jajodia, I Ray|                                                                VLDB|1995.0|   DBLP|       2|\n",
      "|                                                                                                            journals_sigmod_Liu02|                                                                           Editor's Notes|                                             L Liu|                                                       SIGMOD Record|2002.0|   DBLP|       3|\n",
      "|                                                                                                           journals_sigmod_Liu02b|                                                                           Editor's Notes|                                             L Liu|                                                       SIGMOD Record|2002.0|   DBLP|       3|\n",
      "|                                                                                                           journals_sigmod_Liu02a|                                                                           Editor's Notes|                                             L Liu|                                                       SIGMOD Record|2002.0|   DBLP|       3|\n",
      "|                                                                                                           journals_sigmod_Liu02c|                                                                           Editor's Notes|                                             L Liu|                                                       SIGMOD Record|2002.0|   DBLP|       3|\n",
      "|url:http:__portal.acm.org_ft_gateway.cfm%3Fid%3D584455%26type%3Dpdf%26dl%3DGUIDE%26dl%3DACM%26CFID%3D11111111%26CFTOKEN%3D2222222|Report on the ACM Fourth International Workshop on Data Warehousing and OLAP (DOLAP 2001)|                                          J Hammer|                                                      SIGMOD Record,|2002.0|Scholar|       4|\n",
      "|                                                                                                         journals_sigmod_Hammer02|Report on the ACM Fourth International Workshop on Data Warehousing and OLAP (DOLAP 2001)|                                              null|                                                                null|2002.0|   DBLP|       4|\n",
      "|                                                                                                       conf_vldb_FerrandinaMZFM95|                           Schema and Database Evolution in the O2 Object Database System|F Ferrandina, T Meyer, R Zicari, G Ferran, J Madec|                                                                VLDB|1995.0|   DBLP|       5|\n",
      "|                                                                                                                     9rofzgQ6HtcJ|                        Schema and Database Evolution in the O√Ç¬æ Object Database System|F Ferrandina, T Meyer, R Zicari, G Ferran, J Madec|          Proc. of the 21st Int√¢??l Conf. on Very Large Databases (|  null|Scholar|       5|\n",
      "|                                                                                                                     gEFY87Ma0XUJ|                                            Procedures in Object-Oriented Query Languages|          K Subieta, Y Kambayashi, J Leszczylowski|PROCEEDINGS OF THE INTERNATIONAL CONFERENCE ON VERY LARGE  &hellip;,|1995.0|Scholar|       6|\n",
      "|                                                                                                            conf_vldb_SubietaKL95|                                            Procedures in Object-Oriented Query Languages|          K Subieta, Y Kambayashi, J Leszczylowski|                                                                VLDB|1995.0|   DBLP|       6|\n",
      "|                                                                                                         journals_sigmod_BargaL02|                                             Phoenix Project: Fault-Tolerant Applications|                                  R Barga, D Lomet|                                                       SIGMOD Record|2002.0|   DBLP|       7|\n",
      "|                                                                                                                     QTzV3iNq2O8J|                                             Phoenix Project: Fault-Tolerant Applications|                                  R Barga, D Lomet|                                                      SIGMOD Record,|2002.0|Scholar|       7|\n",
      "|                                                                                                                     fY3kkkzBLw8J|                                             Phoenix Project: Fault Tolerant Applications|                                  D Lomet, R Barga|                                                      SIGMOD Record,|  null|Scholar|       7|\n",
      "|                                                                                                         journals_sigmod_Ouksel02|                  Mining the World Wide Web: An Information Search Approach - Book Review|                                              null|                                                                null|2002.0|   DBLP|       8|\n",
      "+---------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+--------------------------------------------------+--------------------------------------------------------------------+------+-------+--------+\n",
      "only showing top 20 rows"
     ]
    }
   ],
   "source": [
    "# Let's take a look at the output sorted by match ID\n",
    "matched_data.toDF().sort(col(\"match_id\")).show(truncate=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+-----+\n",
      "|match_id|count|\n",
      "+--------+-----+\n",
      "|    7200|   26|\n",
      "|      19|   24|\n",
      "|    3140|   21|\n",
      "|    6910|   19|\n",
      "|     565|   16|\n",
      "|     553|   15|\n",
      "|    1437|   15|\n",
      "|    1419|   14|\n",
      "|     226|   14|\n",
      "|    1954|   14|\n",
      "|    1409|   14|\n",
      "|   10838|   14|\n",
      "|    9262|   13|\n",
      "|    1675|   13|\n",
      "|    1133|   13|\n",
      "|    1543|   12|\n",
      "|     810|   12|\n",
      "|   11489|   11|\n",
      "|    1212|   11|\n",
      "|    1704|   11|\n",
      "+--------+-----+\n",
      "only showing top 20 rows"
     ]
    }
   ],
   "source": [
    "#How about we take a look and see what the biggest clusters are, in case we're worried about over matching:\n",
    "\n",
    "matched_data.toDF().groupBy(col(\"match_id\")).count().sort(col(\"count\").desc()).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+--------------------+--------------------+--------------------+----+-------+--------+\n",
      "|          id|               title|             authors|               venue|year| source|match_id|\n",
      "+------------+--------------------+--------------------+--------------------+----+-------+--------+\n",
      "|5bt5DO88llIJ|The R*-Tree: An E...|N Beckermann, H K...|Proc. ACM SIGMOD ...|null|Scholar|    7200|\n",
      "|rqZOBLs-GwkJ|The gih-tree: an ...|N Beckmann, HP Kr...|  Proceedings of ACM|null|Scholar|    7200|\n",
      "|He1UKqU3NSkJ|andB. Seeger. The...|N Beckmann, HP Kr...|Proc. SIGMOD Conf...|null|Scholar|    7200|\n",
      "|R94wj3KT1Q8J|The R*-tree: An E...|N Beckmann, HP Kr...|Proc. 1990 ACM SI...|null|Scholar|    7200|\n",
      "|8EInhbOVFgMJ|The R*-Tree: An E...|          N BACKMANN|         ACM Sigmod,|null|Scholar|    7200|\n",
      "|2BYfXDOWCuAJ|The R*-tree: An E...|NBHP Kriegel, R S...|Proceedings of th...|null|Scholar|    7200|\n",
      "|gZPjGlHNMXsJ|The R√¢??-tree: a...|N Beckmann, HP Kr...|    Proc. ACM SIGMOD|null|Scholar|    7200|\n",
      "|Vi1lYeDZOI0J|The R√¢??-tree: A...|N Beckmann, HP Kr...|Proceedings ACM S...|null|Scholar|    7200|\n",
      "|y2oalMrypiUJ|1990 . The R -tre...|N Beckmann, HP Kr...|Proc. ACM SIGMOD ...|null|Scholar|    7200|\n",
      "|w-jrGadOafcJ|The R* tree: an e...|N Beckmann, HP Kr...|ACM SIGMOD Conf. ...|null|Scholar|    7200|\n",
      "|reFDs4Up9ksJ|The R -tree: An e...|N Beckmann, HP Kr...|&hellip;  Interna...|null|Scholar|    7200|\n",
      "|9FO_pFxsTg8J|newblock The R*-T...|N Beckmann, HP Kr...|          Proc. 1990|null|Scholar|    7200|\n",
      "|2Yy0kQZvT0EJ|The R-tree: AnEf ...|N Beckmann, HP Kr...|Proc. of the ACM ...|null|Scholar|    7200|\n",
      "|63mcayOUdEoJ|The r*-tree: An e...| N Katayama, S Satoh|    Proc. of SIGMOD,|null|Scholar|    7200|\n",
      "|ybAqpWVtoTUJ|The R*-tree: An E...|N Beckmann, HP Kr...|          ACM-SIGMOD|null|Scholar|    7200|\n",
      "|fa67VqqHqZwJ|The R -tree: An e...|N Beckmann, HP Kr...|Proc. 1990 ACM-SI...|null|Scholar|    7200|\n",
      "|cNFeeVXbJDYJ|The R*-tree: An E...|N Bechmann, HP Kr...|         ACM SIGMOD,|null|Scholar|    7200|\n",
      "|EnycLFKkzpUJ|The R*-tree: An E...|T Brinkhoff, HP K...|   Proc. ACM SIGMOD,|null|Scholar|    7200|\n",
      "|yzAMAexU8AAJ|Bernhard √¢??The ...|N Beckmann, HP Kr...|Proc. ACM-SIGMOD ...|null|Scholar|    7200|\n",
      "|vK2DhXE23DEJ|others. The R*-tr...|          N Beckmann|Proceedings of th...|null|Scholar|    7200|\n",
      "+------------+--------------------+--------------------+--------------------+----+-------+--------+\n",
      "only showing top 20 rows"
     ]
    }
   ],
   "source": [
    "#That big cluster look looks a little suspicious, let's check it out:\n",
    "\n",
    "matched_data.toDF().filter(col('match_id') == '7200').show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Where to go from here?\n",
    "#\n",
    "# * Improve metrics by adding additional labels either manually or by generating additional labelling sets and\n",
    "#   filling out their label columns and uploading.\n",
    "# * Tweak the desired precisionRecallTradeoff or the costPerfomanceTradeoff to match your business case\n",
    "# * Consider enforicng matches if you have some troublesome matches that the ML isn't quite learning and you need\n",
    "#   to guarantee output on particular matches (there is some performance penalty)\n",
    "# * Integrate into your standard ETL pipeline by calling the matcher from a standard Glue Job."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Sparkmagic (PySpark)",
   "language": "",
   "name": "pysparkkernel"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "python",
    "version": 2
   },
   "mimetype": "text/x-python",
   "name": "pyspark",
   "pygments_lexer": "python2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}