{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Knowledge Graph"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Transform JSON to RDF format"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the functions that transform each JSON document into RDF according to your graph model. The code below implements the transformation for the following model: [link pic]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
    "%pip install rdflib\n",
    "%pip install tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import rdflib\n",
    "import urllib\n",
    "import uuid\n",
    "import logging\n",
    "import re\n",
    "\n",
    "from hashlib import md5\n",
    "\n",
    "from rdflib import URIRef, Literal, ConjunctiveGraph\n",
    "from rdflib.namespace import RDF, RDFS, XSD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root of every IRI built in this notebook.\n",
    "URL = 'http://example.com'\n",
    "\n",
    "# ---------------- Neptune namespaces ----------------\n",
    "DATA_GRAPH = URL + '/graph/data'\n",
    "\n",
    "RELATION = URL + '/relationship'  # predicates linking node -> node\n",
    "RESOURCE = URL + '/resource'  # prefix of node identifiers\n",
    "PROPERTY = URL + '/property/'  # predicates linking node -> literal\n",
    "ENTITY_TYPE = URL + '/entity-type/'  # attached to every node with rdf.type\n",
    "TYPE = URL + '/type'  # attached to product property node (material/colour/..)\n",
    "\n",
    "# ---------------- Node namespaces ----------------\n",
    "DOC_NODE = RESOURCE + '/doc/'\n",
    "NESTED_DOC_NODE = RESOURCE + '/nested-doc/'\n",
    "\n",
    "# Predicate from a document node to each of its nested-property nodes.\n",
    "HAS_NESTED_PROPERTY = RELATION + '/has-nested-property'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_hash(value):\n",
    "    \"\"\"Return the hexadecimal MD5 digest of the lower-cased string form of `value`.\"\"\"\n",
    "    normalized = str(value).lower()\n",
    "    digest = md5(normalized.encode())\n",
    "    return digest.hexdigest()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_url_encode(txt):\n",
    "    \"\"\"Percent-encode `txt` so it can be embedded in a URI.\n",
    "\n",
    "    Args:\n",
    "        txt: value to encode. Anything that is not str or bytes (e.g. a numeric id)\n",
    "            is converted with str() first.\n",
    "\n",
    "    Returns:\n",
    "        The encoded string; '/' is left as-is (the default of urllib.parse.quote).\n",
    "    \"\"\"\n",
    "    # `import urllib` alone does not guarantee that the `parse` submodule is loaded,\n",
    "    # so import the function explicitly.\n",
    "    from urllib.parse import quote\n",
    "\n",
    "    if not isinstance(txt, (str, bytes)):\n",
    "        txt = str(txt)\n",
    "    return quote(txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _slugify(name):\n",
    "    \"\"\"Lower-case `name` and turn every run of non-alphanumeric characters into one '-'.\"\"\"\n",
    "    return re.sub('[^A-Za-z0-9]+', ' ', name).strip().lower().replace(' ', '-')\n",
    "\n",
    "\n",
    "def json2rdf(doc):\n",
    "    \"\"\"Serialize one JSON document as N-Triples text.\n",
    "\n",
    "    The document becomes a node `DOC_NODE + <url-encoded id>` typed as\n",
    "    `ENTITY_TYPE + 'doc'`. Every non-dict field is attached as a string literal\n",
    "    under `PROPERTY + <slugified field name>`. Every entry of a dict-valued field\n",
    "    is attached through add_nested_property, once per element when the entry is a\n",
    "    list, set or tuple.\n",
    "\n",
    "    Args:\n",
    "        doc: dict with at least an 'id' key.\n",
    "\n",
    "    Returns:\n",
    "        The triples of the document as an N-Triples string.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if `doc` has no 'id' key.\n",
    "    \"\"\"\n",
    "    g = ConjunctiveGraph()\n",
    "\n",
    "    doc_id = get_url_encode(doc['id'])\n",
    "    doc_id_uri = URIRef(f'{DOC_NODE}{doc_id}')\n",
    "    g.add((doc_id_uri, RDF.type, URIRef(f'{ENTITY_TYPE}doc')))\n",
    "\n",
    "    # Dict-valued fields are set aside and expanded after the flat properties.\n",
    "    nested_fields = []\n",
    "    for field, field_value in doc.items():\n",
    "        if isinstance(field_value, dict):\n",
    "            nested_fields.append(field_value)\n",
    "            continue\n",
    "        predicate = URIRef(f'{PROPERTY}{_slugify(field)}')\n",
    "        g.add((doc_id_uri, predicate, Literal(str(field_value))))\n",
    "\n",
    "    for nested in nested_fields:\n",
    "        for key, value in nested.items():\n",
    "            key = _slugify(key)\n",
    "            # isinstance rather than an exact type match, so tuples and\n",
    "            # list/set subclasses are expanded element by element as well.\n",
    "            elements = value if isinstance(value, (list, set, tuple)) else [value]\n",
    "            for element in elements:\n",
    "                add_nested_property(g, doc_id_uri, key, element)\n",
    "\n",
    "    serialized = g.serialize(format='ntriples')\n",
    "    # rdflib < 6 returns bytes here, rdflib >= 6 returns str.\n",
    "    if isinstance(serialized, bytes):\n",
    "        serialized = serialized.decode()\n",
    "    return serialized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_nested_property(g, doc_id_uri, key, value):\n",
    "    \"\"\"Link a document node to a nested-property node built from `key` and `value`.\n",
    "\n",
    "    The node URI is `NESTED_DOC_NODE + key + '/' + get_hash(value text)`.\n",
    "    Four triples are added to `g`: the document -> node link, the node's\n",
    "    rdfs:label (string form of `value`), its rdf:type, and its TYPE category.\n",
    "\n",
    "    Args:\n",
    "        g: graph that receives the triples.\n",
    "        doc_id_uri: URIRef of the owning document node.\n",
    "        key: property name, inserted verbatim into the node and category URIs.\n",
    "        value: property value; its string form is hashed and used as the label.\n",
    "    \"\"\"\n",
    "    node = URIRef(f'{NESTED_DOC_NODE}{key}/{get_hash(format(value))}')\n",
    "\n",
    "    new_triples = (\n",
    "        (doc_id_uri, URIRef(HAS_NESTED_PROPERTY), node),\n",
    "        (node, RDFS.label, Literal(str(value))),\n",
    "        (node, RDF.type, URIRef(f'{ENTITY_TYPE}nested-property')),\n",
    "        (node, URIRef(TYPE), URIRef(f'{TYPE}/{key}')),\n",
    "    )\n",
    "    for triple in new_triples:\n",
    "        g.add(triple)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Apply transformation to NERC output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Specify the S3 bucket and key of the NERC model output. The cells below transform that output from JSON to RDF and store the result in the same bucket under the `neptune/` prefix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "# tqdm.auto picks the notebook widget when it is available and falls back to a text bar otherwise.\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "# Client used to read the NERC output from S3 and to write the RDF result back.\n",
    "s3_client = boto3.client('s3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform(obj, triplets = ''):\n",
    "    \"\"\"Convert every JSON item of an S3 object into N-Triples text.\n",
    "\n",
    "    Args:\n",
    "        obj: mapping whose 'Body' entry has a read() method returning UTF-8 encoded JSON\n",
    "            (e.g. the result of s3_client.get_object); the JSON is iterated item by item.\n",
    "        triplets: previously produced text; it is placed after the new triples.\n",
    "\n",
    "    Returns:\n",
    "        The json2rdf output of each item joined with newlines, last item first,\n",
    "        followed by `triplets`. `triplets` is returned unchanged when there are no items.\n",
    "    \"\"\"\n",
    "    items = json.loads(obj['Body'].read().decode('utf-8'))\n",
    "\n",
    "    # Build the chunks first and join once: prepending to a growing string inside\n",
    "    # the loop copies the whole accumulated text on every iteration.\n",
    "    chunks = [json2rdf(item) for item in tqdm(items)]\n",
    "    if not chunks:\n",
    "        return triplets\n",
    "\n",
    "    # Reversed so the layout is the same as prepending each item in turn.\n",
    "    return '\\n'.join([*reversed(chunks), f'{triplets}'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "\n",
    "# The NERC output is read from the SageMaker session's default bucket.\n",
    "sagemaker_session = sagemaker.Session()\n",
    "\n",
    "BUCKET = sagemaker_session.default_bucket()\n",
    "PREFIX = 'graph-nerc-blog'  # key prefix inside the bucket; feel free to change this"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from an empty accumulator so that re-running this cell never appends to old output.\n",
    "triplets = ''\n",
    "\n",
    "bucket = BUCKET\n",
    "key = PREFIX + '/data_with_entities.json'\n",
    "\n",
    "obj = s3_client.get_object(Bucket=bucket, Key=key)\n",
    "\n",
    "triplets = transform(obj, triplets)\n",
    "\n",
    "# Swap only the trailing extension: str.replace would also rewrite a '.json'\n",
    "# that happens to occur earlier in the key (for example inside PREFIX).\n",
    "if key.endswith('.json'):\n",
    "    output_key = 'neptune/' + key[:-len('.json')] + '.nt'\n",
    "else:\n",
    "    output_key = 'neptune/' + key\n",
    "\n",
    "# Encode explicitly so the uploaded body is UTF-8 bytes.\n",
    "s3_client.put_object(Body=triplets.encode('utf-8'), Bucket=bucket, Key=output_key)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Load Data to Neptune"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `%load` magic used below is only supported in the classic Jupyter Notebook interface, not in JupyterLab."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Use SPARQL to query the graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%sparql \n",
    "\n",
    "SELECT ?s ?p ?o\n",
    "WHERE {\n",
    "    ?s a <http://example.com/entity-type/doc> ;\n",
    "       ?p ?o .\n",
    "}\n",
    "LIMIT 1000"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_tensorflow_p36",
   "language": "python",
   "name": "conda_tensorflow_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}