{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Knowledge Graph"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Transform JSON to RDF format"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set up a transformation function that converts each JSON object to RDF according to your graph model. The code below implements the transformation for the following model: [link pic]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
    "%pip install rdflib\n",
    "%pip install tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import rdflib\n",
    "# Import the submodule explicitly: a bare `import urllib` does not\n",
    "# guarantee that `urllib.parse` is loaded.\n",
    "import urllib.parse\n",
    "import uuid\n",
    "import logging\n",
    "import re\n",
    "\n",
    "from hashlib import md5\n",
    "\n",
    "from rdflib import URIRef, Literal, ConjunctiveGraph\n",
    "from rdflib.namespace import RDF, RDFS, XSD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "URL = 'http://example.com'\n",
    "\n",
    "## ****************** NEPTUNE NAMESPACE ********************\n",
    "DATA_GRAPH = f'{URL}/graph/data'\n",
    "\n",
    "RELATION = f'{URL}/relationship'     # node -> node\n",
    "RESOURCE = f'{URL}/resource'         # node/id\n",
    "PROPERTY = f'{URL}/property/'        # node -> literal\n",
    "ENTITY_TYPE = f'{URL}/entity-type/'  # attached to every node with rdf.type\n",
    "TYPE = f'{URL}/type'                 # attached to product property node (material/colour/..)\n",
    "\n",
    "## ****************** NODES NAMESPACE ********************\n",
    "DOC_NODE = f'{RESOURCE}/doc/'\n",
    "NESTED_DOC_NODE = f'{RESOURCE}/nested-doc/'\n",
    "\n",
    "HAS_NESTED_PROPERTY = f'{RELATION}/has-nested-property'  # product -> properties"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_hash(value):\n",
    "    \"\"\"Return the hex MD5 digest of ``str(value)`` after lower-casing it.\n",
    "\n",
    "    Lower-casing makes the digest case-insensitive, so 'Red' and 'red'\n",
    "    hash to the same value. MD5 serves only as a compact, deterministic\n",
    "    identifier here, not as a security measure.\n",
    "    \"\"\"\n",
    "    return md5(str(value).lower().encode()).hexdigest()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def 
get_url_encode(txt):\n",
    "    \"\"\"Percent-encode ``txt`` so it can be embedded in a URI.\n",
    "\n",
    "    Values that are neither ``str`` nor ``bytes`` (e.g. integer ids) are\n",
    "    converted with ``str`` first, because ``quote`` raises TypeError on them.\n",
    "    \"\"\"\n",
    "    # Local import: a bare `import urllib` does not guarantee the\n",
    "    # `urllib.parse` submodule is loaded.\n",
    "    from urllib.parse import quote\n",
    "\n",
    "    if not isinstance(txt, (str, bytes)):\n",
    "        txt = str(txt)\n",
    "    return quote(txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _normalise_name(name):\n",
    "    \"\"\"Lower-case ``name`` and turn each run of non-alphanumerics into '-'.\"\"\"\n",
    "    return re.sub('[^A-Za-z0-9]+', ' ', name).strip().lower().replace(' ', '-')\n",
    "\n",
    "\n",
    "def json2rdf(doc):\n",
    "    \"\"\"Convert one JSON document into an N-Triples string.\n",
    "\n",
    "    Args:\n",
    "        doc: dict with at least an \"id\" key. Values that are dicts are\n",
    "            expanded into nested-property nodes; every other value becomes\n",
    "            a string literal attached directly to the document node.\n",
    "\n",
    "    Returns:\n",
    "        str: the document's triples serialized as N-Triples.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if ``doc`` has no \"id\" key.\n",
    "    \"\"\"\n",
    "    g = ConjunctiveGraph()\n",
    "\n",
    "    doc_id_uri = URIRef(f'{DOC_NODE}{get_url_encode(doc[\"id\"])}')\n",
    "    g.add((doc_id_uri, RDF.type, URIRef(f\"{ENTITY_TYPE}doc\")))\n",
    "\n",
    "    nested_fields = []\n",
    "    for field, field_value in doc.items():\n",
    "        if isinstance(field_value, dict):\n",
    "            # Expanded below, after the plain properties have been added.\n",
    "            nested_fields.append(field)\n",
    "            continue\n",
    "        predicate = URIRef(f\"{PROPERTY}{_normalise_name(field)}\")\n",
    "        g.add((doc_id_uri, predicate, Literal(str(field_value))))\n",
    "\n",
    "    for field in nested_fields:\n",
    "        for key, value in doc[field].items():\n",
    "            key = _normalise_name(key)\n",
    "            # A collection yields one nested node per element.\n",
    "            elements = value if isinstance(value, (list, set)) else [value]\n",
    "            for element in elements:\n",
    "                add_nested_property(g, doc_id_uri, key, element)\n",
    "\n",
    "    serialized = g.serialize(format='ntriples')\n",
    "    # rdflib < 6 returns bytes here, rdflib >= 6 returns str; accept both.\n",
    "    return serialized.decode() if isinstance(serialized, bytes) else serialized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_nested_property(g, doc_id_uri, key, value):\n",
    "    \"\"\"Link the document to a node that represents ``value``.\n",
    "\n",
    "    The node URI is built from ``key`` and the case-insensitive hash of\n",
    "    ``value``, so equal values under the same key map to the same node.\n",
    "\n",
    "    Args:\n",
    "        g: rdflib graph that receives the triples (mutated in place).\n",
    "        doc_id_uri: URIRef of the owning document node.\n",
    "        key: normalised property name, used in the node URI and type URI.\n",
    "        value: the property value; stored as a string label on the node.\n",
    "    \"\"\"\n",
    "    value_node_uri = URIRef(f\"{NESTED_DOC_NODE}{key}/{get_hash(value)}\")\n",
    "\n",
    "    # document -> nested node\n",
    "    g.add((doc_id_uri, URIRef(HAS_NESTED_PROPERTY), value_node_uri))\n",
    "\n",
    "    # Describe the nested node: readable label, entity type, property kind.\n",
    "    g.add((value_node_uri, RDFS.label, Literal(str(value))))\n",
    "    g.add((value_node_uri, RDF.type, URIRef(f\"{ENTITY_TYPE}nested-property\")))\n",
    "    g.add((value_node_uri, URIRef(TYPE), URIRef(f'{TYPE}/{key}')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 
Apply transformation to NERC output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Specify the bucket and key of the NERC model output. The cells below transform it from JSON to RDF (N-Triples) and store the result in the same bucket under the `neptune/` prefix in S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "s3_client = boto3.client('s3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform(obj, triplets = ''):\n",
    "    \"\"\"Convert every document of an S3 JSON object into N-Triples text.\n",
    "\n",
    "    Args:\n",
    "        obj: response of ``s3_client.get_object``; its ``Body`` is read,\n",
    "            decoded as UTF-8 and parsed as JSON (expected: a list of\n",
    "            documents accepted by ``json2rdf``).\n",
    "        triplets: existing N-Triples text placed after the new triples.\n",
    "\n",
    "    Returns:\n",
    "        str: the triples of each document (last document first), joined\n",
    "        by newlines and followed by the initial ``triplets``.\n",
    "    \"\"\"\n",
    "    items = json.loads(obj['Body'].read().decode('utf-8'))\n",
    "\n",
    "    item_triplets = [json2rdf(item) for item in tqdm(items)]\n",
    "\n",
    "    # Join once instead of rebuilding the whole string per item (quadratic).\n",
    "    # Reversed so the output matches the former prepend-in-a-loop order.\n",
    "    return '\\n'.join(item_triplets[::-1] + [triplets])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "\n",
    "# Taking the output of the NERC\n",
    "sagemaker_session = sagemaker.Session()\n",
    "\n",
    "# We'll be using the sagemaker default bucket\n",
    "BUCKET = sagemaker_session.default_bucket()\n",
    "PREFIX = 'graph-nerc-blog'  # Feel free to change this"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "triplets = ''\n",
    "\n",
    "bucket = BUCKET\n",
    "key = PREFIX + '/data_with_entities.json'\n",
    "\n",
    "obj = s3_client.get_object(Bucket=bucket, Key=key)\n",
    "\n",
    "triplets = transform(obj, triplets)\n",
    "\n",
    "# Upload explicit UTF-8 bytes next to the input, under the neptune/ prefix.\n",
    "s3_client.put_object(\n",
    "    Body=triplets.encode('utf-8'),\n",
    "    Bucket=bucket,\n",
    "    Key=f\"neptune/{key.replace('.json', '.nt')}\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Load Data to Neptune"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This functionality is only supported in Jupyter, not Jupyter Lab."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. 
Use SPARQL to query the graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%sparql\n",
    "\n",
    "SELECT ?s ?p ?o\n",
    "WHERE {\n",
    "    ?s a <http://example.com/entity-type/doc> ;\n",
    "       ?p ?o .\n",
    "}\n",
    "LIMIT 1000"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_tensorflow_p36",
   "language": "python",
   "name": "conda_tensorflow_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}