{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting Spark application\n" ] }, { "data": { "text/html": [ "\n", "
<table>\n", "<tr><th>ID</th><th>YARN Application ID</th><th>Kind</th><th>State</th><th>Spark UI</th><th>Driver log</th><th>Current session?</th></tr>\n",
"<tr><td>9</td><td>application_1574712114143_0010</td><td>pyspark</td><td>idle</td><td>Link</td><td>Link</td><td></td></tr>\n", "</table>
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "SparkSession available as 'spark'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Objective: Download, prepare, and explore the data sources that we will be integrating with FindMatches" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "AWS pip upgrade command \n", "\n", "pip3 install awscli --upgrade --user" ] } ], "source": [ "#Prerequisites: \n", "# 1. Create Glue Dev Endpoint (G.2X), full S3 access\n", "# 2. Connect to that dev endpoint ith your sagemaker frontend.\n", "# 3. Make sure that your Notebook's IAM role has S3 Write access if you will be using the terminal (S3FullAccess works)\n", "# 3b. Make sure that your Notebook's IAM role has the GlueServiceRole attached as well since we will be making some Glue calls\n", "# 4. Create a database for your files and edit the glue_database variable if different than 'reinvent-2019'\n", "# 5. All previous notebook steps\n", "# 6. 
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import sys\n", "from awsglue.transforms import *\n", "from awsglue.utils import getResolvedOptions\n", "from pyspark.context import SparkContext\n", "from awsglue.context import GlueContext\n", "from awsglue.job import Job\n", "\n", "glueContext = GlueContext(SparkContext.getOrCreate())" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#TODO: Update with your own information, synchronize across notebooks.\n", "\n", "my_s3_bucket = \"find-matches-demo\"\n", "project_prefix = \"scholarly_demo\"\n", "glue_database = \"reinvent-2019\"\n", "glue_table = 'dblp_scholar_records_jsonl'\n", "region = 'us-east-1'\n", "glue_role = 'AWSGlueServiceRoleDefault'\n", "glue_source_crawler = project_prefix + \"_source_crawler\"\n", "transform_name = \"reinvent_2019_demo_transform\"\n", "transform_id = \"tfm-810e6f50ff6e74964b5990ab354398b9bbd113e7\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "'scholarly_demo_source_crawler'" ] } ], "source": [ "glue_source_crawler" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Command to copy the source records into your own S3 bucket: \n", "\n", "aws s3 cp s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/records/dblp_scholar_records.jsonl s3://find-matches-demo/scholarly_demo/source/dblp_scholar_records.jsonl" ] } ], "source": [ "print(\"Command to copy the source records into your own S3 bucket: \\n\")\n", "print(\"aws s3 cp \" + \n", "      \"s3://ml-transforms-public-datasets-us-east-1/dblp-scholar/records/dblp_scholar_records.jsonl \" + \n", "      \"s3://\" + my_s3_bucket + \"/\" + project_prefix + \"/source/dblp_scholar_records.jsonl\")" ] },
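{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The Glue CLI commands printed below can also be issued directly from this notebook\n", "# with boto3. A minimal sketch, assuming the endpoint's role carries the Glue and S3\n", "# permissions from the prerequisites; this 'client' is also used by the schema\n", "# sanity check further down.\n", "import boto3\n", "\n", "client = boto3.client('glue', region_name=region)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": {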
"application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CLI command to run the crawler\n", "\n", "aws glue create-crawler --name scholarly_demo_source_crawler --role AWSGlueServiceRoleDefault --database-name reinvent-2019 --targets '{\"S3Targets\": [{\"Path\": \"s3://find-matches-demo/scholarly_demo/source/dblp_scholar_records.jsonl\"}]}'" ] } ], "source": [ "# Create a crawler and run it against the file to load the data reference into the Glue/LF Data Catalog\n", "# This is easy to do in the AWS Console, or you can also do this via AWS CLI as per below.\n", "\n", "s3_targets = {\n", " 'S3Targets': [\n", " {\n", " 'Path': \"s3://\" + my_s3_bucket + \"/\" + project_prefix + \"/source/dblp_scholar_records.jsonl\",\n", " },\n", " ],\n", " }\n", "\n", "print(\"CLI command to create the crawler\\n\")\n", "print(f\"aws glue create-crawler --name {glue_source_crawler} --role {glue_role} \" +\n", " f'--database-name {glue_database} '\n", " '--targets \\'{\"S3Targets\": [{\"Path\": \"s3://'+my_s3_bucket+'/'+project_prefix+'/source/dblp_scholar_records.jsonl\"}]}\\'')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CLI command to run the crawler\n", "\n", "aws glue start-crawler --name scholarly_demo_source_crawler" ] } ], "source": [ "# Run the crawler\n", "\n", "print(\"CLI command to run the crawler\\n\")\n", "print(f\"aws glue start-crawler --name {glue_source_crawler}\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CLI command to check on the crawler status so we can wait until it finishes\n", "\n", "aws glue get-crawler --name scholarly_demo_source_crawler" ] } ], "source": [ "# Wait for crawl to finish\n", "\n", "print(\"CLI command to check on the crawler status so we can wait until it finishes\\n\")\n", "print(f\"aws glue get-crawler --name {glue_source_crawler}\")\n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[ {'Name': 'id', 'Type': 'string'},\n", " {'Name': 'title', 'Type': 'string'},\n", " {'Name': 'authors', 'Type': 'string'},\n", " {'Name': 'venue', 'Type': 'string'},\n", " {'Name': 'year', 'Type': 'double'},\n", " {'Name': 
'source', 'Type': 'string'}]" ] } ], "source": [ "# Take a look at the table schema for a sanity check (uses the boto3 Glue client created above):\n", "import pprint\n", "\n", "response = client.get_table(\n", "    DatabaseName=glue_database,\n", "    Name=glue_table\n", ")\n", "\n", "pp = pprint.PrettyPrinter(indent=4)\n", "pp.pprint(response['Table']['StorageDescriptor']['Columns'])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Source dataset length: 66879\n", "+--------------------+--------------------+--------------------+-------------+------+------+\n", "| id| title| authors| venue| year|source|\n", "+--------------------+--------------------+--------------------+-------------+------+------+\n", "|conf_vldb_Rusinki...|Towards a Coopera...|M Rusinkiewicz, W...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_E...|SQL/XML is Making...|A Eisenberg, J Me...|SIGMOD Record|2002.0| DBLP|\n", "|conf_vldb_AmmannJR95|Using Formal Meth...|P Ammann, S Jajod...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_L...| Editor's Notes| L Liu|SIGMOD Record|2002.0| DBLP|\n", "|journals_sigmod_H...|Report on the ACM...| null| null|2002.0| DBLP|\n", "|conf_vldb_Ferrand...|Schema and Databa...|F Ferrandina, T M...| VLDB|1995.0| DBLP|\n", "|conf_vldb_Subieta...|Procedures in Obj...|K Subieta, Y Kamb...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_B...|Phoenix Project: ...| R Barga, D Lomet|SIGMOD Record|2002.0| DBLP|\n", "|journals_sigmod_O...|Mining the World ...| null| null|2002.0| DBLP|\n", "| conf_vldb_MoserKK95|L/MRP: A Buffer M...|F Moser, A Kraiss...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_K...|Report on the NSF...| null| null|2002.0| DBLP|\n", "|conf_vldb_Chaudhu...|Retrieval of Comp...|S Chaudhuri, S Gh...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_K...|MPEG-7 and Multim...| H Kosch|SIGMOD Record|2002.0| DBLP|\n", "| conf_vldb_LuTD95|The Fittest Survi...| H Lu, K Tan, S Dao| VLDB|1995.0| DBLP|\n", "|journals_sigmod_R...|Reminiscences on ...| null| null|2002.0| DBLP|\n", "|conf_vldb_TreschPL95|Type Classificati...|M Tresch, N Palme...| VLDB|1995.0| DBLP|\n", "|journals_sigmod_G...|Data Mining: Prac...| J Geller|SIGMOD Record|2002.0| DBLP|\n", "| conf_vldb_MehtaD95|Managing Intra-op...| M Mehta, D DeWitt| VLDB|1995.0| DBLP|\n", "|journals_sigmod_C...|Advances in Datab...|A Caplinskas, J E...|SIGMOD Record|2002.0| DBLP|\n", "|conf_vldb_SrikantA95|Mining Generalize...|R Srikant, R Agrawal| VLDB|1995.0| DBLP|\n", "+--------------------+--------------------+--------------------+-------------+------+------+\n", "only showing top 20 rows" ] } ], "source": [ "#Looking good, so let's take a look at the actual data:\n", "\n", "source = glueContext.create_dynamic_frame.from_catalog(database=glue_database, table_name=glue_table).toDF()\n", "print(f\"Source dataset length: {source.count()}\")\n", "source.show()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, 
"output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Scholar dataset length: 64263\n", "+--------------------+--------------------+--------------------+--------------------+------+-------+\n", "| id| title| authors| venue| year| source|\n", "+--------------------+--------------------+--------------------+--------------------+------+-------+\n", "| OOZb-r3GUTMJ|Genera crustaceor...| PA Latreille| Tomus I,| null|Scholar|\n", "| 9F4uvDaInLAJ|Towards Generic S...|M Garschhammer, R...|Seattle, Washingt...| null|Scholar|\n", "| z0OXMVQB5pIJ|Implicit Stereoty...|MR Banaji, C Hard...|JOURNAL OF PERSON...|1993.0|Scholar|\n", "| Bn5h0IbqbNwJ|i2i Trust in E-co...| JS Olson, GM Olson| COMMUNICATIONS-ACM,|2000.0|Scholar|\n", "| mrxAHi3pQgAJ|Systematics of th...|RE Jenkins, BJ Fr...| Unpublished| null|Scholar|\n", "|url:http:__portal...|A content based i...|M Uysal, F Yarman...|Proceedings of th...|2004.0|Scholar|\n", "| x8s3QpPxgGYJ|PROMISE: peer-to-...|M Hefeeda, A Habi...|Proceedings of AC...|2003.0|Scholar|\n", "| -ZAKITqaNZUJ|SLAB: A software-...| JD Miller| null| null|Scholar|\n", "| hAiGk_XI3B4J|Guest Editor's In...| D Hix| IEEE Software,|1989.0|Scholar|\n", "| GCVMmOSZn0AJ|Pat Selinger Spea...|P Selinger, M Win...| SIGMOD Record,| null|Scholar|\n", "| 0V8E_ij2SpYJ|Real-time numeric...| F Blais, M Rioux| SIGNAL PROC.,|1986.0|Scholar|\n", "| 8Ha8o2X9l2IJ|JavaScript 1.2 Ad...| S Mintert| null| null|Scholar|\n", "| a2wqpKma5J4J|Chain : Operator ...|B Babcock, S Babu...| SIGMOD Conference,|2003.0|Scholar|\n", "| dvjQs4Y6H_gJ|Preliminary resea...| GB Teague, C Macias| null|1990.0|Scholar|\n", "| kwyhOo5rlJ8J|Web-based casinos...| K Nash| null| null|Scholar|\n", "| WyCQmiQwZbMJ|Smith. BL (1990)....|F Gabelnick, J Ma...|New Directions in...| null|Scholar|\n", "|url:http:__portal...|Threading Stories...| X Wu|Proceedings of th...|2005.0|Scholar|\n", "| JVGBOCQFD04J|ICAME Conference ...| F Varret√¢?¬¶| null|1998.0|Scholar|\n", "| rtmpVZ4_bXMJ| Portal page bonanza| ZD Publishing| PC Magazine,| null|Scholar|\n", "| 9cyBWYOQlAoJ|IT Changes Hit Su...| ML Songini| Computerworld,| null|Scholar|\n", "+--------------------+--------------------+--------------------+--------------------+------+-------+\n", "only showing top 20 rows" ] } ], "source": [ "#Look at some details specifically from from Scholar\n", "\n", "print (\"Scholar dataset length: \" + str(source.filter(source.source == 'Scholar').count()) );\n", "source.filter(source.source == 'Scholar').sample(False,.01).show()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "DBLP dataset length: 0\n", "+--------------------+--------------------+--------------------+-----------------+------+------+\n", "| id| title| authors| venue| year|source|\n", "+--------------------+--------------------+--------------------+-----------------+------+------+\n", "|conf_vldb_MohaniaS94|Some Issues in De...| M Mohania, N Sarda| VLDB|1994.0| DBLP|\n", "| conf_vldb_SetzerZ94|New Concurrency C...| V Setzer, A Zisman| VLDB|1994.0| DBLP|\n", "|journals_sigmod_S...|Using Unknowns to...|Y Saygin, V Veryk...| SIGMOD Record|2001.0| DBLP|\n", 
"|conf_sigmod_WangW...|Clustering by pat...|H Wang, W Wang, J...|SIGMOD Conference|2002.0| DBLP|\n", "|conf_sigmod_Cresc...|RoadRunner: autom...|V Crescenzi, G Me...|SIGMOD Conference|2002.0| DBLP|\n", "|journals_sigmod_P...|Database Research...|D Phatak, N Sarda...| SIGMOD Record|1996.0| DBLP|\n", "| conf_sigmod_YuMWL01|Efficient and Eff...|C Yu, W Meng, W W...|SIGMOD Conference|2001.0| DBLP|\n", "|conf_sigmod_Davul...|A Layered Archite...|H Davulcu, J Frei...|SIGMOD Conference|1999.0| DBLP|\n", "|conf_sigmod_Hidber99|Online Associatio...| C Hidber|SIGMOD Conference|1999.0| DBLP|\n", "| conf_vldb_TanEO01|Efficient Progres...| K Tan, P Eng, B Ooi| VLDB|2001.0| DBLP|\n", "|conf_sigmod_Godfr...|Secure and Portab...|M Godfrey, T Mayr...|SIGMOD Conference|1998.0| DBLP|\n", "|conf_vldb_AmbiteK...|The WorlInfo Assi...|J Ambite, C Knobl...| VLDB|2001.0| DBLP|\n", "|conf_vldb_LabioYC...|Performance Issue...|W Labio, J Yang, ...| VLDB|2000.0| DBLP|\n", "|conf_vldb_Deutsch...|Physical Data Ind...|A Deutsch, L Popa...| VLDB|1999.0| DBLP|\n", "|conf_vldb_ChenCFG...|High Level Indexi...|W Chen, J Chow, Y...| VLDB|1999.0| DBLP|\n", "| conf_sigmod_DanS95|An Online Video P...| A Dan, D Sitaram|SIGMOD Conference|1995.0| DBLP|\n", "|conf_vldb_Abitebo...|XML Repository an...|S Abiteboul, V Ag...| VLDB|1999.0| DBLP|\n", "|conf_vldb_Alsabti...|A One-Pass Algori...|K Alsabti, S Rank...| VLDB|1997.0| DBLP|\n", "|conf_vldb_AshwinR...|Garbage Collectio...|S Ashwin, P Roy, ...| VLDB|1997.0| DBLP|\n", "|conf_vldb_Srivast...|Answering Queries...|D Srivastava, S D...| VLDB|1996.0| DBLP|\n", "+--------------------+--------------------+--------------------+-----------------+------+------+\n", "only showing top 20 rows" ] } ], "source": [ "#Look at some details specifically from from DBLP\n", "\n", "print (\"DBLP dataset length: \" + str(source.filter(source.source == 'DPLP').count()) );\n", "source.filter(source.source == 'DBLP').sample(False,.01).show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Sparkmagic (PySpark)", "language": "", "name": "pysparkkernel" }, "language_info": { "codemirror_mode": { "name": "python", "version": 2 }, "mimetype": "text/x-python", "name": "pyspark", "pygments_lexer": "python2" } }, "nbformat": 4, "nbformat_minor": 2 }