{ "cells": [ { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "AWS pip upgrade command \n", "\n", "pip3 install awscli --upgrade --user" ] } ], "source": [ "#Prerequisites: \n", "# 1. Create Glue Dev Endpoint (G.2X), full S3 access\n", "# 2. Connect to that dev endpoint ith your sagemaker frontend.\n", "# 3. Make sure that your Notebook's IAM role has S3 Write access if you will be using the terminal (S3FullAccess works)\n", "# 3b. Make sure that your Notebook's IAM role has the GlueServiceRole attached as well since we will be making some Glue calls\n", "# 4. Create a database for your files and edit the glue_database variable if different than 'reinvent-2019'\n", "# 5. All previous notebook steps\n", "# 6. Open up a terminal within Jupyter (New -> Terminal) to enter the CLI commands in this demo.\n", "\n", "#Currently required: You will need to install a new/current version of the aws cli in your terminal window:\n", "print(\"AWS pip upgrade command \\n\")\n", "print('pip3 install awscli --upgrade --user')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import sys\n", "from awsglue.transforms import *\n", "from awsglue.utils import getResolvedOptions\n", "from pyspark.context import SparkContext\n", "from awsglue.context import GlueContext\n", "from awsglue.job import Job\n", "\n", "glueContext = GlueContext(SparkContext.getOrCreate())" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#TODO: Update with your own information, synchronize across notebooks.\n", "\n", "my_s3_bucket = \"find-matches-demo\"\n", "project_prefix = \"scholarly_demo\"\n", "glue_database = \"reinvent-2019\"\n", "glue_table = 'dblp_scholar_records_jsonl'\n", "region = 'use-east-1'\n", "glue_role = 'AWSGlueServiceRoleDefault'\n", "glue_source_crawler = project_prefix + \"_source_crawler\"\n", "transform_name = \"reinvent_2019_demo_transform\"\n", "transform_id= \"tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7\"" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Command to download the labels from the scholarly dataset into your own s3 bucket: \n", "\n", "aws s3 cp 
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Command to download the labels from the scholarly dataset into your own S3 bucket: \n", "\n", "aws s3 cp s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/labels/dblp_scholar_labels_350.csv s3://find-matches-demo/scholarly_demo/labels/dblp_scholar_labels_350.csv" ] } ], "source": [ "# Let's go ahead and train this ML Transform using some labels. You can use labels that you have\n", "# filled in manually on the labeling sets generated in the previous step, or you can use the existing\n", "# labels provided with the scholarly dataset. Below, we'll walk through ingesting the provided labels\n", "# and training on them; the steps would be the same if we had labeled the matches ourselves by filling\n", "# in the missing label values in the generated labeling sets.\n", "\n", "print(\"Command to download the labels from the scholarly dataset into your own S3 bucket: \\n\")\n", "print(\"aws s3 cp \" + \n", "      \"s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/labels/dblp_scholar_labels_350.csv \" + \n", "      \"s3://\" + my_s3_bucket + \"/\" + project_prefix + \"/labels/dblp_scholar_labels_350.csv\")\n" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+---------------+-----+-----------------+-----------------+-----------------+-----------------+------+-------+\n", "|labeling_set_id|label|               id|            title|          authors|            venue|  year| source|\n", "+---------------+-----+-----------------+-----------------+-----------------+-----------------+------+-------+\n", "|              0|    0|conf_sigmod_Ab...|Visual COKO: a...|D Abadi, M Che...|SIGMOD Conference|2002.0|   DBLP|\n", "|              0|    0|     f2Lea-RN8dsJ|Visual COKO: a...|         DJ Abadi|SIGMOD Confere...|2002.0|Scholar|\n", "|              0|    1|conf_sigmod_Ab...|Aurora: A Data...|D Abadi, D Car...|SIGMOD Conference|2003.0|   DBLP|\n", "|              0|    1|     eBnT7lhV2LwJ|Aurora: A Data...|D Abadi, D Car...|Proceedings of...|  null|Scholar|\n", "|              0|    1|journals_vldb_...|Aurora: a new ...|D Abadi, D Car...|          VLDB J.|2003.0|   DBLP|\n", "|              0|    2|     AxpQwgyRyLgJ|Active XML Doc...|S Abiteboul, A...|      ACM SIGMOD,|  null|Scholar|\n", "|              0|    2|conf_sigmod_Ab...|Dynamic XML do...|S Abiteboul, A...|SIGMOD Conference|2003.0|   DBLP|\n", "|              0|    2|     Rjb06zlxbLIJ|Dynamic XML do...|S Abiteboul, A...|SIGMOD Confere...|2003.0|Scholar|\n", "|              0|    3|conf_sigmod_Ab...|A Database Int...|S Abiteboul, S...|SIGMOD Conference|1995.0|   DBLP|\n", "|              0|    3|     DakOA4Ew-poJ|A Database Int...|S Abiteboul, S...|Proc. ACM SIGM...|  null|Scholar|\n", "|              0|    3|     cu9DXtjeF24J|A database int...|S Abiteboul, S...|Proceedings of...|1995.0|Scholar|\n", "|              1|    0|     WWaxLMIptTMJ|Self-tuning Hi...|A Aboulnaga, S...|SIGMOD Confere...|1999.0|Scholar|\n", "|              1|    0|     xnDzelm2t1QJ|Self-tuning Hi...|AA AC, S Chaud...|Proceedings of...|  null|Scholar|\n", "|              1|    0|conf_sigmod_Ab...|Self-tuning Hi...|A Aboulnaga, S...|SIGMOD Conference|1999.0|   DBLP|\n", "|              1|    1|     rO4-yA3tlfoJ|Broadcast Disk...|S Acharya, R A...|SIGMOD Confere...|1995.0|Scholar|\n", "|              1|    1|     2fO9nM2FoGYJ|Broadcast Disk...|S Acharya, R A...|      ACM SIGMOD,|  null|Scholar|\n", "|              1|    1|     y74PIIvb_GMJ|Broadcast Disk...|S Achary, M Fr...|Proceedings of...|  null|Scholar|\n", "|              1|    2|     umGxlKvzLoUJ|Pull for Data ...|S Acharya, M F...|Proc. ACM SIGM...|  null|Scholar|\n", "|              1|    2|     9OEjtHuRUzsJ|Balancing Push...|S Acharya, M F...|SIGMOD Confere...|1997.0|Scholar|\n", "|              1|    2|     qdCTR7WpuFUJ|Balancing push...|M Franklin, S ...|Proceedings of...|  null|Scholar|\n", "+---------------+-----+-----------------+-----------------+-----------------+-----------------+------+-------+\n", "\n", "only showing top 20 rows" ] } ], "source": [ "# Load those labels into a Spark DataFrame so we can see what's involved:\n", "\n", "labels_df = spark.read.load(\"s3://\" + my_s3_bucket + \"/\" + project_prefix + \"/labels/dblp_scholar_labels_350.csv\",\n", "                            format=\"csv\", sep=\",\", inferSchema=\"true\", header=\"true\")\n", "\n", "labels_df.show(truncate=17)" ] },
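{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional sanity check (a sketch, not part of the original walkthrough): in FindMatches label files,\n", "# rows sharing a labeling_set_id were presented together for labeling, and rows within a set that\n", "# share a label value are considered matches. A quick aggregation shows how the labeled rows break down:\n", "labels_df.groupBy(\"labeling_set_id\", \"label\").count().orderBy(\"labeling_set_id\", \"label\").show(10)" ] },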
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "352" ] } ], "source": [ "labels_df.count()" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Command to upload the labels from your own S3 bucket and send them to the Transform: \n", "\n", "aws glue start-import-labels-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --input-s3-path s3://find-matches-demo/scholarly_demo/labels/dblp_scholar_labels_350.csv" ] } ], "source": [ "# Looks fairly reasonable, right? Let's go ahead and send these labels to the model.\n", "\n", "print(\"Command to upload the labels from your own S3 bucket and send them to the Transform: \\n\")\n", "print(\"aws glue start-import-labels-task-run \" + \n", "      f\"--transform-id {transform_id} \" + \n", "      \"--input-s3-path s3://\" + my_s3_bucket + \"/\" + project_prefix + \"/labels/dblp_scholar_labels_350.csv\")" ] },
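{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alternative to the CLI command above (a sketch, not part of the original flow): the same label\n", "# import can be started from this notebook with boto3, assuming the dev endpoint's role has the\n", "# Glue permissions from the prerequisites.\n", "import boto3\n", "\n", "glue = boto3.client(\"glue\", region_name=region)\n", "response = glue.start_import_labels_task_run(\n", "    TransformId=transform_id,\n", "    InputS3Path=\"s3://\" + my_s3_bucket + \"/\" + project_prefix + \"/labels/dblp_scholar_labels_350.csv\",\n", "    ReplaceAllLabels=False)  # False appends to any labels the transform already has\n", "print(response[\"TaskRunId\"])" ] },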
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Command to get the ML Transform information, including label count: \n", "\n", "aws glue get-ml-transform --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7" ] } ], "source": [ "# Once this import succeeds (it should be relatively quick), the ML Transform will have a new label count associated\n", "# with it. Let's just check to make sure it has the expected label count now:\n", "\n", "print(\"Command to get the ML Transform information, including label count: \\n\")\n", "print(\"aws glue get-ml-transform \" + \n", "      f\"--transform-id {transform_id}\")" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Command to start an ML Evaluation Task Run: \n", "\n", "aws glue start-ml-evaluation-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7" ] } ], "source": [ "# Looks good! Let's go ahead and get a rough estimate of the quality. To do this, we can start an\n", "# \"ML Evaluation Task Run\", which uses a portion of held-out data (unseen by the ML model) to estimate\n", "# the quality of the ML model. Note that this does not estimate any quality losses due to the other stages\n", "# of the algorithm (candidate generation, clustering, and match enforcement), but it can confirm that we\n", "# are set up correctly and give a general idea of the quality of the matching with the current labels. If\n", "# this number isn't high enough, it can typically be improved by adding more labeled data.\n", "\n", "print(\"Command to start an ML Evaluation Task Run: \\n\")\n", "print(\"aws glue start-ml-evaluation-task-run \" + \n", "      f\"--transform-id {transform_id}\")" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "aws glue get-ml-task-run --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7 --task-run-id tsk-d7df4407ca5ea5ea538e75ab9bc37dbb0d58d23b\n", "(After Success): Command to get the ML Transform information, including metrics: \n", "\n", "aws glue get-ml-transform --transform-id tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7" ] } ], "source": [ "# Let's see how it worked. You can view the summary metrics for an ML Transform easily from the console, or we can\n", "# also take a look at the results of the last evaluation task run:\n", "\n", "# TODO: Update this task run ID with the return value from the previous evaluation task run call:\n", "\n", "task_run_id = \"tsk-d7df4407ca5ea5ea538e75ab9bc37dbb0d58d23b\"\n", "\n", "print(f\"aws glue get-ml-task-run --transform-id {transform_id} \" +\n", "      f\"--task-run-id {task_run_id}\")\n", "\n", "print(\"(After Success): Command to get the ML Transform information, including metrics: \\n\")\n", "print(\"aws glue get-ml-transform \" + \n", "      f\"--transform-id {transform_id}\")" ] },
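{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A sketch of checking the evaluation from boto3 instead of the CLI (illustrative, not part of the\n", "# original demo): poll the task run until it finishes, then read the transform's estimated quality\n", "# metrics from the GetMLTransform response.\n", "import time\n", "import boto3\n", "\n", "glue = boto3.client(\"glue\", region_name=region)\n", "while True:\n", "    status = glue.get_ml_task_run(TransformId=transform_id, TaskRunId=task_run_id)[\"Status\"]\n", "    if status in (\"SUCCEEDED\", \"FAILED\", \"STOPPED\", \"TIMEOUT\"):\n", "        break\n", "    time.sleep(30)\n", "\n", "metrics = glue.get_ml_transform(TransformId=transform_id)[\"EvaluationMetrics\"][\"FindMatchesMetrics\"]\n", "print(\"Precision:\", metrics[\"Precision\"], \"Recall:\", metrics[\"Recall\"], \"F1:\", metrics[\"F1\"])" ] },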
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# If everything looks good, we're set and we can move on to the final stage, matching!" ] } ], "metadata": { "kernelspec": { "display_name": "Sparkmagic (PySpark)", "language": "python", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 2 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python2" } }, "nbformat": 4, "nbformat_minor": 2 }