# Knowledge Graph

## 1. Transform JSON to RDF format

Define a transformation function that converts each JSON object into RDF according to your graph model. The code below implements the transformation for the following model: [link pic]

In [None]:
!pip install rdflib
!pip install tqdm

In [None]:
import json
import rdflib
import urllib
import uuid
import logging
import re

from hashlib import md5

from rdflib import URIRef, Literal, ConjunctiveGraph
from rdflib.namespace import RDF, RDFS, XSD

In [None]:
URL = 'http://example.com'

# ---------------- Neptune namespace ----------------
# Every IRI used in the graph is rooted at URL.
DATA_GRAPH = URL + '/graph/data'

RELATION = URL + '/relationship'  # base for predicates linking node -> node
RESOURCE = URL + '/resource'  # base for node identifiers
PROPERTY = URL + '/property/'  # base for predicates linking node -> literal
ENTITY_TYPE = URL + '/entity-type/'  # base for the rdf:type value given to each node
TYPE = URL + '/type'  # predicate (and IRI base) naming the kind of a nested-property node

# ---------------- Node namespaces ----------------
DOC_NODE = RESOURCE + '/doc/'
NESTED_DOC_NODE = RESOURCE + '/nested-doc/'

HAS_NESTED_PROPERTY = RELATION + '/has-nested-property'  # document node -> nested-property node

In [None]:
def get_hash(value):
    """Return the hex MD5 digest of the lowercased string form of ``value``.

    Lowercasing first makes the digest case-insensitive, so values that
    differ only in letter case hash to the same identifier.
    """
    normalized = str(value).lower()
    digest = md5(normalized.encode())
    return digest.hexdigest()

In [None]:
def get_url_encode(txt):
    """Percent-encode ``txt`` so it can be embedded in a URI path.

    Args:
        txt: The value to encode. ``str`` and ``bytes`` are passed to
            ``urllib.parse.quote`` unchanged; any other type (for example
            a numeric id loaded from JSON) is converted with ``str`` first.

    Returns:
        The percent-encoded string. ``/`` is left unescaped, which is the
        default behaviour of ``urllib.parse.quote``.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import quote

    if not isinstance(txt, (str, bytes)):
        txt = str(txt)
    return quote(txt)

In [None]:
def _normalize_name(name):
    """Turn a JSON key into a lowercase, hyphen-separated URI segment."""
    spaced = re.sub(r'[^A-Za-z0-9]+', ' ', name)
    return spaced.strip().lower().replace(' ', '-')


def json2rdf(doc):
    """Convert one JSON document into N-Triples.

    The document becomes a node typed ``{ENTITY_TYPE}doc``. Every top-level
    field whose value is not a dict is stringified and attached to that node
    as a literal under ``{PROPERTY}<normalized field name>``. For every
    dict-valued field, each entry is attached through
    ``add_nested_property``; list or set values produce one nested node per
    element.

    Args:
        doc: A dict containing at least an ``"id"`` key.

    Returns:
        The graph serialized as an N-Triples string.

    Raises:
        KeyError: If ``doc`` has no ``"id"`` key.
    """
    g = ConjunctiveGraph()

    doc_id = get_url_encode(doc["id"])
    doc_id_uri = URIRef(f'{DOC_NODE}{doc_id}')

    g.add((doc_id_uri, RDF.type, URIRef(f"{ENTITY_TYPE}doc")))

    # First pass: literal properties. Dict-valued fields are set aside and
    # expanded into nested nodes afterwards.
    nested_fields = []
    for field, raw_value in doc.items():
        if isinstance(raw_value, dict):
            nested_fields.append(field)
            continue
        predicate = URIRef(f"{PROPERTY}{_normalize_name(field)}")
        g.add((doc_id_uri, predicate, Literal(str(raw_value))))

    # Second pass: one nested-property node per (key, element) pair.
    for field in nested_fields:
        for key, value in doc[field].items():
            key = _normalize_name(key)
            elements = value if isinstance(value, (list, set)) else [value]
            for element in elements:
                add_nested_property(g, doc_id_uri, key, element)

    # rdflib >= 6 returns ``str`` from serialize(); older releases return
    # ``bytes``. Decode only when needed so both keep working.
    serialized = g.serialize(format='ntriples')
    if isinstance(serialized, bytes):
        serialized = serialized.decode()
    return serialized

In [None]:
def add_nested_property(g, doc_id_uri, key, value):
    """Attach ``value`` to a document as a separate nested-property node.

    The node URI is ``{NESTED_DOC_NODE}{key}/<hash of value>``, so equal
    values under the same key map to the same node.

    Args:
        g: Graph the triples are added to (must provide ``add``).
        doc_id_uri: URI of the document node that owns the property.
        key: Name of the property; used in the node URI and in its type IRI.
        value: Property value; its string form becomes the node's label.

    Returns:
        None. ``g`` is modified in place.
    """
    digest = get_hash(f"{value}")
    nested_uri = URIRef(f"{NESTED_DOC_NODE}{key}/{digest}")

    triples = (
        # document -> nested node
        (doc_id_uri, URIRef(HAS_NESTED_PROPERTY), nested_uri),
        # human-readable value of the nested node
        (nested_uri, RDFS.label, Literal(str(value))),
        # generic entity type shared by all nested nodes
        (nested_uri, RDF.type, URIRef(f"{ENTITY_TYPE}nested-property")),
        # specific kind of the nested node, derived from the key
        (nested_uri, URIRef(TYPE), URIRef(f'{TYPE}/{key}')),
    )
    for triple in triples:
        g.add(triple)

## 2. Apply transformation to NERC output

Specify the S3 bucket and key of the NERC model output. The cells below transform it from JSON to RDF and store the result in the same bucket under the `neptune/` prefix.

In [None]:
import boto3
from tqdm.notebook import tqdm

# Shared S3 client, used below to download the NERC output and upload the generated triples.
s3_client = boto3.client('s3')

In [None]:
def transform(obj, triplets=''):
    """Convert a JSON payload of documents into one N-Triples string.

    Args:
        obj: Mapping whose ``'Body'`` entry has a ``read()`` method returning
            UTF-8 encoded JSON (a sequence of documents accepted by
            ``json2rdf``).
        triplets: Previously generated triples to keep; they are placed
            after the newly generated ones.

    Returns:
        The triples of every document, last document first, each followed
        by a newline, with the incoming ``triplets`` at the end.
    """
    items = json.loads(obj['Body'].read().decode('utf-8'))

    chunks = [json2rdf(item) for item in tqdm(items)]

    # Each document's triples go in front of what was accumulated so far,
    # so the output is in reverse document order with ``triplets`` last.
    # A single join avoids re-copying the growing string on every item.
    chunks.reverse()
    chunks.append(triplets)
    return '\n'.join(chunks)

In [None]:
import sagemaker

# Locate the NERC output: it is read from the SageMaker session's default bucket.
sagemaker_session = sagemaker.Session()

# We'll be using the SageMaker default bucket
BUCKET = sagemaker_session.default_bucket()
PREFIX = 'graph-nerc-blog' # Key prefix of the NERC output file; feel free to change this

In [None]:
# Source object: the NERC output stored under PREFIX in the default bucket.
bucket = BUCKET
key = f'{PREFIX}/data_with_entities.json'

# Download the JSON and convert every document to N-Triples,
# starting from an empty accumulator.
obj = s3_client.get_object(Bucket=bucket, Key=key)
triplets = transform(obj, '')

# Upload next to the source, under the "neptune/" prefix with an .nt extension.
output_key = 'neptune/' + key.replace('.json', '.nt')
s3_client.put_object(Body=triplets, Bucket=bucket, Key=output_key)

## 3. Load Data to Neptune

The `%load` magic used below is only supported in Jupyter, not JupyterLab.

In [None]:
%load

## 4. Use SPARQL to query the graph

In [None]:
%%sparql

# Return up to 1000 predicate/object pairs attached to document nodes.
SELECT ?s ?p ?o
WHERE {
    ?s a <http://example.com/entity-type/doc> ;
       ?p ?o .
}
LIMIT 1000