# Neptune MultiModel (Ask KG Data Product Questions)
This notebook shows the movie example from my talk/blog post on using Amazon Neptune to help model a multimodel database solution.

The overall flow is discussed in the blog post (TBD) and in the repo https://github.com/aws-samples/amazon-neptune-ontology-example-blog/blob/main/multimodel/README.md. Read that first to understand what we're trying to accomplish.

Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. SPDX-License-Identifier: MIT-0

Begin by setting up. Run the next cell to instruct the notebook to get Neptune data from the S3 bucket provisioned for you.

In [None]:
import os
import subprocess

# Ask a shell to load ~/.bashrc and echo the two bucket variables, one per line.
# The pipe is opened in a `with` block so it is closed once its output is read.
with os.popen("source ~/.bashrc ; echo $STAGE_BUCKET; echo $M2C_ANALYSIS_BUCKET") as stream:
    lines = stream.read().split("\n")

# The first echoed line is the value of $STAGE_BUCKET.
STAGING_BUCKET = lines[0]
if not STAGING_BUCKET:
    # Later cells build s3:// URLs from this value, so flag an empty one right away.
    print("Warn: STAGE_BUCKET is empty; check that ~/.bashrc exports it")
STAGING_BUCKET

## Obtain UML files
We use UML to draw our data products, their implementations, and their relationships. 

Run the next cell to get a copy of those UML models in the notebook instance.

In [None]:
%%bash -s "$STAGING_BUCKET"

# $1 is the single argument passed to this script on the magic line above.
# Create the two working folders if they do not exist yet.
mkdir -p uml
mkdir -p mmgen
cd uml
# Recursively copy everything under the bucket's uml/ prefix into the current folder (uml/).
aws s3 cp s3://$1/uml . --recursive


## Extract data products/impl from UML files
UML is represented in XML Metadata Interchange (XMI) form. Let's extract the main details from those files into Python data structures. 

Run the next cell to extract the details we need from UML.

In [None]:
import sys
import json
import xml.etree.ElementTree as ET

# XML namespaces used while parsing XMI (prefix -> namespace URI)
NS = dict(
    uml="http://www.omg.org/spec/UML/20131001",
    xmi="http://www.omg.org/spec/XMI/20131001",
    MMProfile="http://www.magicdraw.com/schemas/MMProfile.xmi",
)
# Fully qualified names of the xmi:id and xmi:idref attributes ("{namespace}local" form)
XMI_ID = "{%s}id" % NS["xmi"]
XMI_IDREF = "{%s}idref" % NS["xmi"]

def get_attrib(elem, name):
    """Return attribute `name` of XML element `elem`, or "" when it is absent.

    `elem` may be None (for example the result of a `find` that matched
    nothing); "" is returned in that case too, so a missing child element is
    treated the same way as a missing attribute.
    """
    if elem is None:
        return ""
    return elem.attrib.get(name, "")
 
def add_tagval(stereotype, tag, val):
    """Append `val` to the list stored at stereotype['tags'][tag], creating the list on first use."""
    stereotype['tags'].setdefault(tag, []).append(val)

def get_tags(elem, tags, stereotype):
    """Record in `stereotype` the tagged values that `elem` carries for each name in `tags`.

    For every tag name, a non-empty XML attribute of that name is recorded
    first, followed by the text of each direct child element of that name.
    """
    for tag_name in tags:
        attr_value = get_attrib(elem, tag_name)
        if attr_value:
            add_tagval(stereotype, tag_name, attr_value)
        for child in elem.findall(tag_name):
            add_tagval(stereotype, tag_name, child.text)

# Extraction results keyed by UML file name; each call to extract() stores one entry.
umlextract = {}

def extract(filename):
    """Parse uml/<filename> (XMI) and store what was found in umlextract[filename].

    The stored entry contains:
      packages  - package id -> {id, name, parent, stereotypes}
      imports   - package-import id -> {id, name, parent, href, stereotypes}
      classes   - class id -> {id, name, parent, usages, stereotypes, properties};
                  'isProduct' / 'isImpl' are set to True on classes carrying the
                  DataProduct / DataProductImpl stereotype
      datatypes - datatype id -> {id, name}
      enums     - enumeration id -> {id, name, literals}
      usages    - usage id -> id of the client (source) class
      props     - property id -> id of the owning class
    """
    path = f"uml/{filename}"
    packages = {}
    classes = {}
    usages = {}
    props = {}
    imports = {}
    datatypes = {}
    enums = {}

    print("Parsing " + path)
    tree = ET.parse(path)

    # Child -> parent lookup built in one pass, instead of one XPath search per element.
    parent_of = {child: parent for parent in tree.getroot().iter() for child in parent}

    def attr_of(node, name):
        """get_attrib() that also accepts a missing (None) node."""
        return get_attrib(node, name) if node is not None else ""

    def parent_id_of(node):
        """xmi:id of the element directly containing `node`, or "" when there is none."""
        return attr_of(parent_of.get(node), XMI_ID)

    # packages
    for elem in tree.findall("uml:Model//packagedElement[@xmi:type='uml:Package']", NS):
        pkg_id = get_attrib(elem, XMI_ID)
        packages[pkg_id] = {'id': pkg_id, 'name': get_attrib(elem, 'name'),
                            'parent': parent_id_of(elem), 'stereotypes': []}

    # package imports
    for elem in tree.findall("uml:Model//packageImport[@xmi:type='uml:PackageImport']", NS):
        import_id = get_attrib(elem, XMI_ID)
        href = attr_of(elem.find("importedPackage", NS), "href")
        imports[import_id] = {'id': import_id, 'name': get_attrib(elem, 'name'),
                              'parent': parent_id_of(elem), 'href': href, 'stereotypes': []}

    # classes
    for elem in tree.findall("uml:Model//packagedElement[@xmi:type='uml:Class']", NS):
        class_id = get_attrib(elem, XMI_ID)
        class_attribs = {}
        for a in elem.findall("ownedAttribute", NS):
            aid = get_attrib(a, XMI_ID)
            props[aid] = class_id  # map property to its owning class
            class_attribs[aid] = {
                'id': aid,
                'name': get_attrib(a, 'name'),
                'aggregation': get_attrib(a, 'aggregation'),
                'type': get_attrib(a, 'type'),
                'association': get_attrib(a, 'association'),
                'stereotypes': []
            }

        classes[class_id] = {'id': class_id, 'name': get_attrib(elem, 'name'),
                             'parent': parent_id_of(elem), 'usages': {},
                             'stereotypes': [], 'properties': class_attribs}

    # datatypes
    for elem in tree.findall("uml:Model//packagedElement[@xmi:type='uml:DataType']", NS):
        datatype_id = get_attrib(elem, XMI_ID)
        datatypes[datatype_id] = {'id': datatype_id, 'name': get_attrib(elem, 'name')}

    # enums
    for elem in tree.findall("uml:Model//packagedElement[@xmi:type='uml:Enumeration']", NS):
        enum_id = get_attrib(elem, XMI_ID)
        lits = [get_attrib(lit_node, 'name') for lit_node in elem.findall("./ownedLiteral", NS)]
        enums[enum_id] = {'id': enum_id, 'name': get_attrib(elem, 'name'), 'literals': lits}

    # usages
    for elem in tree.findall("uml:Model//packagedElement[@xmi:type='uml:Usage']", NS):
        usage_id = get_attrib(elem, XMI_ID)
        supplier = elem.find('supplier')
        target = attr_of(supplier, XMI_IDREF)
        target_href = attr_of(supplier, 'href')
        source = attr_of(elem.find('client'), XMI_IDREF)
        usages[usage_id] = source
        if source not in classes:
            print(f"Warn: usage broken ref {source}")
        else:
            classes[source]['usages'][usage_id] = {
                'id': usage_id, 'target': target, 'targetHref': target_href, 'stereotypes': []}

    # data products and impls - flag the classes they are applied to
    for profile_tag, flag in (('DataProduct', 'isProduct'), ('DataProductImpl', 'isImpl')):
        for elem in tree.findall(f"./MMProfile:{profile_tag}", NS):
            clazz_id = get_attrib(elem, 'base_Class')
            if clazz_id in classes:
                classes[clazz_id][flag] = True
            else:
                print(f"Warn: stereotype ({get_attrib(elem, XMI_ID)}) broken ref {clazz_id}")

    # usage rels: stereotype name -> tag names to collect
    urels = {'joins': ['joinAttrib', 'myAttrib'],
             'refersTo': ['refersAttrib', 'myAttrib'],
             'hasImpl': [], 'caches': [], 'copies': [], 'locatedIn': [],
             'similarTo': ['simReason', 'simAlgorithm'],
             'config': ['configKV'],
             'hasSource': ['integrationType', 'sourceDesc', 'sourceDataSet', 'sourceEventType'],
             'federates': ['fedURI']
             }
    for rel_name, rel_tags in urels.items():
        for elem in tree.findall(f"./MMProfile:{rel_name}", NS):
            stereotype = {'name': rel_name, 'tags': {}}
            get_tags(elem, rel_tags, stereotype)
            # The usage is referenced through base_Usage, or else through base_Element.
            for usage_id in (get_attrib(elem, 'base_Usage'), get_attrib(elem, 'base_Element')):
                if usage_id in usages:
                    owner = usages[usage_id]
                    # A usage with a broken client was warned about above and has no class entry.
                    if owner in classes:
                        classes[owner]['usages'][usage_id]['stereotypes'].append(stereotype)
                    break

    # stereotypes at class/package level
    srels = {'awsService': ['service'],
             'awsResource': ['resource'],
             'usagePattern': ['pattern'],
             'config': ['configKV'],
             'hasSource': ['integrationType', 'sourceDesc', 'sourceDataSet', 'sourceEventType'],
             'federates': ['fedURI']
             }
    for rel_name, rel_tags in srels.items():
        for elem in tree.findall(f"./MMProfile:{rel_name}", NS):
            clazz = get_attrib(elem, 'base_Class')
            pkg = get_attrib(elem, 'base_Package')
            pkgi = get_attrib(elem, 'base_PackageImport')
            elem_id = get_attrib(elem, 'base_Element')
            stereotype = {'name': rel_name, 'tags': {}}
            get_tags(elem, rel_tags, stereotype)
            if clazz in classes:
                classes[clazz]['stereotypes'].append(stereotype)
            elif pkg in packages:
                packages[pkg]['stereotypes'].append(stereotype)
            elif pkgi in imports:
                imports[pkgi]['stereotypes'].append(stereotype)
            elif elem_id in classes:
                classes[elem_id]['stereotypes'].append(stereotype)
            elif elem_id in packages:
                packages[elem_id]['stereotypes'].append(stereotype)

    # property-level
    prels = {'productKey': [], 'config': ['configKV']}
    for rel_name, rel_tags in prels.items():
        for elem in tree.findall(f"./MMProfile:{rel_name}", NS):
            stereotype = {'name': rel_name, 'tags': {}}
            get_tags(elem, rel_tags, stereotype)
            # The property is referenced through base_Property, or else through base_Element;
            # whichever id matched is also the key into the owning class's properties.
            for prop_id in (get_attrib(elem, 'base_Property'), get_attrib(elem, 'base_Element')):
                if prop_id in props:
                    owner = props[prop_id]
                    classes[owner]['properties'][prop_id]['stereotypes'].append(stereotype)
                    break

    print("done")
    umlextract[filename] = {'packages': packages, 'classes': classes,
                            'usages': usages, 'props': props, 'imports': imports,
                            'datatypes': datatypes, 'enums': enums}


# UML model files to process; extract() reads each one from the local uml/ folder.
UML_FILES = [
    'DataLake.xml',
    'VideoAnalysis.xml',
    'StoryAnalysis.xml',
    'MovieDoc.xml',
    'KnowledgeGraph.xml',
    'Bookstore.xml',
]
for uf in UML_FILES:
    extract(uf)

# Show everything that was extracted.
umlextract

## Combine UML output
We collected lots of different details from UML. Let's bring them together into a clean list of products and impls.

Run the next cell.

In [None]:
# UML file name -> {'products': [...], 'impls': [...]}; filled in by combine().
file_prod_impl_map = {}

# Build properties for the given class at the given level
def build_properties(filename, clazz, level, visited):
    """Append one entry per property of class `clazz` to `level` and return `level`.

    Each entry is {'name', 'type', 'subtype', 'literals', 'stereotypes'}:
      - 'type' is the name of the property's type when that type is a class that
        is neither a product nor an impl, a datatype, or an enumeration; "" otherwise.
      - 'subtype' is the nested property list of such a class type, or None.
      - 'literals' lists the enumeration literals when the type is an enumeration.

    `visited` holds the class ids already on the current expansion path. A type
    that is already on the path is not expanded again, which stops recursion on
    cyclic models while still expanding every class-typed property of a class.
    The caller's `visited` dict is not modified.
    """
    file_data = umlextract[filename]
    classes = file_data['classes']
    # Classes on the path from the starting class down to (and including) this one.
    path = dict(visited)
    path[clazz] = clazz

    props = classes[clazz]['properties']
    for p in props:
        type_id = props[p]['type']
        type_name = ""
        subtype = None
        literals = None

        if type_id in classes and \
                'isProduct' not in classes[type_id] and \
                'isImpl' not in classes[type_id]:
            type_name = classes[type_id]['name']
            if type_id not in path:
                subtype = build_properties(filename, type_id, [], path)

        if type_id in file_data['datatypes']:
            type_name = file_data['datatypes'][type_id]['name']

        if type_id in file_data['enums']:
            type_name = file_data['enums'][type_id]['name']
            literals = file_data['enums'][type_id]['literals']

        level.append({
            'name': props[p]['name'],
            'type': type_name,
            'subtype': subtype,
            'literals': literals,
            'stereotypes': props[p]['stereotypes']
        })

    return level
 
def find_targets(filename, target_id, target_href):
    """Resolve the target of a usage or package import to class/product names.

    `target_id` is the id of a class in the same UML file; when it is empty,
    `target_href` ("<file>#<element id>") refers to a class or a package in
    another extracted file. A package resolves to every product of that file
    whose 'packageAncestry' contains the package, so that file must already
    have been combined.

    Returns {'target', 'targets', 'target_file'}: 'targets' lists the resolved
    names and 'target' is the name when exactly one was resolved, else None.
    Unresolvable references print a warning and leave 'targets' empty.
    """
    target_spec = {'target': None, 'targets': [], 'target_file': None}
    if len(target_id) > 0:
        local_classes = umlextract[filename]['classes']
        if target_id in local_classes:
            target_spec['targets'].append(local_classes[target_id]['name'])
            target_spec['target_file'] = filename
        else:
            print(f"Warn: broken target in usage {target_id}")
    elif len(target_href) > 0:
        toks = target_href.split("#")
        if len(toks) == 2:
            target_file, target_elem = toks
            target_spec['target_file'] = target_file
            if target_file in umlextract:
                other = umlextract[target_file]
                if target_elem in other['classes']:
                    target_spec['targets'].append(other['classes'][target_elem]['name'])
                elif target_elem in other['packages']:
                    if target_file not in file_prod_impl_map:
                        print(f"Warn: {target_file} not combined yet; cannot resolve package in usage {target_href}")
                    else:
                        for c in file_prod_impl_map[target_file]['products']:
                            if target_elem in c['packageAncestry']:
                                target_spec['targets'].append(c['name'])
            else:
                print(f"Warn: unknown target file in usage {target_href}")
        else:
            print(f"Warn: unexpected target ref in usage {target_href}")

    if len(target_spec['targets']) == 1:
        target_spec['target'] = target_spec['targets'][0]
    return target_spec

# Bring together all properties, usages, and stereotypes of products and impls for the given UML filename
def combine(filename):
    """Build the product and impl summaries of one UML file into file_prod_impl_map[filename].

    Every class of the file gets a summary {'name', 'stereotypes', 'properties',
    'packageAncestry'}; the summary is listed under 'products' and/or 'impls'
    when the class is flagged 'isProduct' / 'isImpl'. Its stereotypes are
    collected, in this order, from the class itself, from the file's package
    imports, from the class's usages, and from its ancestor packages (nearest
    package first). 'packageAncestry' lists the ids of those ancestor packages.
    """
    print("Combining " + filename)
    products = []
    impls = []
    file_data = umlextract[filename]
    packages = file_data['packages']

    # consider all the classes in the UML file
    for c in file_data['classes']:
        uobj = file_data['classes'][c]
        obj = {
            'name': uobj['name'],
            'stereotypes': [],
            'properties': build_properties(filename, c, [], {}),
            'packageAncestry': []
        }

        # remember if it's a product or impl
        if 'isProduct' in uobj:
            products.append(obj)
        if 'isImpl' in uobj:
            impls.append(obj)

        # class stereotypes - incorporate
        for st in uobj['stereotypes']:
            obj['stereotypes'].append({
                'name': st['name'],
                'tags': st['tags']
            })

        # stereotype package imports
        # NOTE(review): every package import of the file is applied to every class
        # of the file, whatever package the import sits in - confirm this is intended.
        for imp in file_data['imports']:
            target_spec = find_targets(filename, "", file_data['imports'][imp]['href'])
            for s in file_data['imports'][imp]['stereotypes']:
                print("Add import " + s['name'])
                obj['stereotypes'].append({
                    'name': s['name'],
                    'tags': s['tags'],
                    'targetClass': target_spec['target'],
                    'targetClasses': target_spec['targets'],
                    'targetFile': target_spec['target_file']
                })

        # usage stereotypes, each annotated with the target the usage points at
        for u in uobj['usages']:
            usage = uobj['usages'][u]
            target_spec = find_targets(filename, usage['target'], usage['targetHref'])
            for s in usage['stereotypes']:
                print("Add usage " + s['name'])
                obj['stereotypes'].append({
                    'name': s['name'],
                    'tags': s['tags'],
                    'targetClass': target_spec['target'],
                    'targetClasses': target_spec['targets'],
                    'targetFile': target_spec['target_file']
                })

        # inherit stereotypes from ancestor packages, walking up one package at a time
        parent_id = uobj['parent']
        seen_packages = set()  # guards against a cyclic parent chain
        while len(parent_id) > 0 and parent_id in packages and parent_id not in seen_packages:
            seen_packages.add(parent_id)
            pkg = packages[parent_id]
            obj['packageAncestry'].append(pkg['id'])
            for s in pkg['stereotypes']:
                print("Add ancestry " + s['name'])
                obj['stereotypes'].append(s)
            parent_id = pkg['parent']

    file_prod_impl_map[filename] = {'products': products, 'impls': impls}

# Combine the files in UML_FILES order. find_targets() resolves a package in another
# file through that file's already-combined products, so the order matters.
for uf in UML_FILES:
    combine(uf)

# Show the combined products and impls of every file.
file_prod_impl_map

## Look at Movie DocStore products
Before we continue, let's take a closer look at what we have so far. We'll look at data products in the movie docstore. 

Run the next cell and review the products. You'll be reading JSON. Notice that each product has stereotypes (source, join, impl) and properties (a few levels deep in some cases).

We'll be converting these to RDF shortly. We won't transform all properties, just those with stereotypes or involved in reference relationships.

What you're looking at here is product summaries with enough detail to publish to a mesh marketplace.

Run the cell.

In [None]:
import json
# Names of the MovieDoc products to print in full.
finder = ['MovieDocument', 'ContributorDocument', 'RoleDocument']
for p in file_prod_impl_map['MovieDoc.xml']['products']:
    if p['name'] not in finder:
        continue
    # Serialize before printing the banner, so a failure prints nothing for this product.
    j = json.dumps(p, indent=2)
    print("\n\n**** Looking at ***** " + p['name'])
    print(j)


## Create RDF
Now let's convert the details we collected from the UML models into RDF. We'll be generating an ontology in the file mmgen.ttl, which we'll save to the folder mmgen in the notebook instance. This ontology includes all data products and implementations from the UML models, including the stereotypes! 

The generated ontology builds on the existing core multimodel ontology mm.ttl. 

Generated products are implemented on several AWS services, including Amazon Neptune. Neptune plays two roles. First, it is the knowledge graph in which we load the generated ontology to keep track of all data products and ask questions about how they are related. Second, it PROVIDES its own data products as RDF resources. And we use an ontology to represent those resources. That ontology is movkg.ttl. 

That makes three ontologies: mmgen.ttl (which we are about to generate), mm.ttl (which is already written and which mmgen.ttl builds on top of), and movkg.ttl (which is already written and expands on the KG products described in mmgen.ttl).

Run the next cell to generate mmgen.ttl.

In [None]:
!pip install rdflib

In [None]:
import os
import json
from rdflib import Graph, Literal, RDF, RDFS, URIRef, XSD, OWL, BNode

# Base namespace of the multimodel ontology. It is named MM_NS rather than NS so
# that running this cell does not overwrite the XML-namespace dict `NS` that
# extract() passes to ElementTree; with the old name, re-running the UML
# extraction cell after this one failed.
MM_NS = "http://amazon.com/aws/wwso/neptune/demo/multimodel"
DPURI = URIRef(f"{MM_NS}/DataProduct")
IMPLURI = URIRef(f"{MM_NS}/DataProductImpl")
MM_ONTOLOGY = URIRef(f"{MM_NS}/ontology")
NIL = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#nil")


def make_top_uri(name):
    """Return the URI of `name` directly under the multimodel namespace."""
    return URIRef(f"{MM_NS}/{name}")


def make_uri(ns, name):
    """Return the URI of `name` under sub-namespace `ns` of the multimodel namespace."""
    return URIRef(f"{MM_NS}/{ns}/{name}")


# UML file name -> sub-namespace used for the URIs generated from that file
PKG = {
    "DataLake.xml": "lake",
    "KnowledgeGraph.xml": "kg",
    "MovieDoc.xml": "moviedoc",
    "StoryAnalysis.xml": "story",
    "VideoAnalysis.xml": "video",
    "Bookstore.xml": "bookstoredemo"
}

# Stereotypes turned into one triple per value of a single tag.
# 'tag' names that tag; 'obj' builds the RDF object from (val, umlfile, clazz, stereo).
LITERAL_STEREOS = {
    "federates": {"tag": "fedURI", "obj": lambda val, umlfile, clazz, stereo: URIRef(val)},
    "awsService": {"tag": "service", "obj": lambda val, umlfile, clazz, stereo: make_uri("aws", val)},
    "awsResource": {"tag": "resource", "obj": lambda val, umlfile, clazz, stereo: Literal(val)},  # keep it loose for now
    "usagePattern": {"tag": "pattern", "obj": lambda val, umlfile, clazz, stereo: Literal(val)}
}

# Usage stereotypes turned into a direct triple from the class to each target class.
USAGE_STEREOS = ["copies", "caches", "locatedIn", "hasImpl"]


# Handlers used in OREL_STEREOS. All share the signature
# (val, umlfile, clazz, stereo, target_file) that convert_class() calls them with.
def _top_predicate(name):
    """Build a handler that ignores its arguments and returns make_top_uri(name)."""
    return lambda val, umlfile, clazz, stereo, target_file: make_top_uri(name)


def _literal_object(val, umlfile, clazz, stereo, target_file):
    """Handler: the tag value as an RDF literal."""
    return Literal(val)


def _attribute_in_target_file(val, umlfile, clazz, stereo, target_file):
    """Handler: the tag value as an attribute URI in the target file's sub-namespace."""
    return make_uri(PKG[target_file], val)


def _attribute_in_own_file(val, umlfile, clazz, stereo, target_file):
    """Handler: the tag value as an attribute URI in the current file's sub-namespace."""
    return make_uri(PKG[umlfile], val)


# Stereotypes turned into an intermediate "object relationship" node.
# 'orel' is the type of that node; each tag contributes one predicate ('p') and
# object ('o') per tag value, both computed by the handlers above.
OREL_STEREOS = {
    "hasSource": {
        "orel": "Source",
        "tags": {
            "integrationType": {
                "p": _top_predicate("integrationType"),
                "o": lambda val, umlfile, clazz, stereo, target_file: make_top_uri(val)
            },
            "sourceDesc": {
                "p": lambda val, umlfile, clazz, stereo, target_file: RDFS.comment,
                "o": _literal_object
            },
            "sourceDataSet": {
                "p": _top_predicate("sourceDataSet"),
                "o": _literal_object
            },
            "sourceEventType": {
                "p": _top_predicate("sourceEventType"),
                "o": lambda val, umlfile, clazz, stereo, target_file: make_uri("aws", val)
            }
        }
    },
    "similarTo": {
        "orel": "Similarity",
        "tags": {
            "simReason": {"p": _top_predicate("simReason"), "o": _literal_object},
            "simAlgorithm": {"p": _top_predicate("simAlgorithm"), "o": _literal_object}
        }
    },
    "joins": {
        "orel": "Ref",
        "tags": {
            "joinAttrib": {
                "p": _top_predicate("hasNeighborAttribute"),
                "o": _attribute_in_target_file
            },
            # NOTE(review): "refersTo" below places myAttrib in the current file's
            # namespace, while "joins" uses the target file's - confirm which is intended.
            "myAttrib": {
                "p": _top_predicate("hasMyAttribute"),
                "o": _attribute_in_target_file
            }
        }
    },
    "refersTo": {
        "orel": "Ref",
        "tags": {
            "refersAttrib": {
                "p": _top_predicate("hasNeighborAttribute"),
                "o": _attribute_in_target_file
            },
            "myAttrib": {
                "p": _top_predicate("hasMyAttribute"),
                "o": _attribute_in_own_file
            }
        }
    }
}

def create_prop_from_rel(g, puri, name):
    """Declare the existing URI `puri` in graph `g` as an OWL datatype property labelled `name`.

    Returns `puri`.
    """
    declarations = (
        (RDF.type, OWL.DatatypeProperty),
        (RDFS.label, Literal(name)),
        (RDFS.isDefinedBy, MM_ONTOLOGY),
    )
    for predicate, value in declarations:
        g.add((puri, predicate, value))
    return puri

def create_data_prop(g, ns, name, domain_clazz):
    """Declare a datatype property `name` in sub-namespace `ns` and tie it to `domain_clazz`.

    The tie is expressed with the multimodel `domainIncludes` predicate.
    Returns the URI of the property.
    """
    prop_uri = make_uri(ns, name)
    declarations = (
        (RDF.type, OWL.DatatypeProperty),
        (RDFS.label, Literal(name)),
        (RDFS.isDefinedBy, MM_ONTOLOGY),
        (make_top_uri("domainIncludes"), domain_clazz),
    )
    for predicate, value in declarations:
        g.add((prop_uri, predicate, value))
    return prop_uri

def create_data_type_prop(g, ns, name, propuri):
    """Declare a datatype property named after type `name` and make `propuri` a sub-property of it.

    Returns the URI of the type-level property.
    """
    type_uri = make_uri(ns, name)
    declarations = (
        (RDF.type, OWL.DatatypeProperty),
        (RDFS.label, Literal(name)),
        (RDFS.isDefinedBy, MM_ONTOLOGY),
    )
    for predicate, value in declarations:
        g.add((type_uri, predicate, value))
    g.add((propuri, RDFS.subPropertyOf, type_uri))
    return type_uri


def create_data_product(g, ns, name):
    """Declare an OWL class `name` in sub-namespace `ns` as a subclass of DataProduct.

    Returns the URI of the class.
    """
    product_uri = make_uri(ns, name)
    declarations = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, DPURI),
        (RDFS.label, Literal(name)),
        (RDFS.isDefinedBy, MM_ONTOLOGY),
    )
    for predicate, value in declarations:
        g.add((product_uri, predicate, value))
    return product_uri

def create_impl(g, ns, name):
    """Declare an OWL class `name` in sub-namespace `ns` as a subclass of DataProductImpl.

    Returns the URI of the class.
    """
    impl_uri = make_uri(ns, name)
    declarations = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, IMPLURI),
        (RDFS.label, Literal(name)),
        (RDFS.isDefinedBy, MM_ONTOLOGY),
    )
    for predicate, value in declarations:
        g.add((impl_uri, predicate, value))
    return impl_uri
 
def add_config(g, cfg):
    """Record in graph `g` that `cfg` is a subClassOf the multimodel `config` resource."""
    # NOTE(review): callers pass a predicate URI here; confirm subClassOf
    # (rather than subPropertyOf) is what the core ontology expects.
    config_uri = make_top_uri("config")
    g.add((cfg, RDFS.subClassOf, config_uri))
 
def add_po(g, s, p, o):
    """Add the single triple (s, p, o) to graph `g`."""
    triple = (s, p, o)
    g.add(triple)

def create_orel(g, oreltype, s, p, po):
    """Link `s` through predicate `p` to a new blank node describing a relationship.

    The blank node is typed as an OWL named individual and as the multimodel
    class `oreltype`, and receives one triple per {'p': ..., 'o': ...} entry of `po`.
    """
    rel_node = BNode()
    g.add((s, p, rel_node))
    node_facts = (
        (RDF.type, OWL.NamedIndividual),
        (RDF.type, make_top_uri(oreltype)),
        (RDFS.isDefinedBy, MM_ONTOLOGY),
    )
    for predicate, value in node_facts:
        g.add((rel_node, predicate, value))
    for entry in po:
        g.add((rel_node, entry['p'], entry['o']))

def convert_class(umlfile, clazz, s):
    """Add to the module-level graph `g` the triples describing one product or impl.

    umlfile: UML file name the class came from (a key of PKG).
    clazz:   combined class summary with 'stereotypes' and 'properties'.
    s:       URI of the class, the subject of the generated triples.

    Class-level stereotypes are converted according to LITERAL_STEREOS,
    USAGE_STEREOS and OREL_STEREOS, plus the 'config' stereotype. Of the
    properties, only those carrying a stereotype are converted; the ones
    stereotyped 'productKey' are collected into an owl:hasKey list.
    """
    # Predicates whose objects are attribute URIs that must be declared as properties.
    attribute_predicates = (make_top_uri("hasNeighborAttribute"), make_top_uri("hasMyAttribute"))

    # class-level stereotypes
    for stereo in clazz['stereotypes']:
        # the predicate
        sname = stereo['name']
        p = make_top_uri(sname)
        if sname in LITERAL_STEREOS:
            # TODO - some of the objects should be URIs, not literals
            tag = LITERAL_STEREOS[sname]['tag']
            obj = LITERAL_STEREOS[sname]['obj']
            if tag in stereo['tags']:
                for val in stereo['tags'][tag]:
                    add_po(g, s, p, obj(val, umlfile, clazz, stereo))
            else:
                print("Warn: incorrect tag structure in " + str(stereo))

        elif sname in USAGE_STEREOS:
            # 'targetClass' is None when the usage resolved to zero or to several classes.
            if stereo.get('targetClass'):
                add_po(g, s, p, make_uri(PKG[stereo['targetFile']], stereo['targetClass']))
            else:
                for cl in stereo.get('targetClasses') or []:
                    add_po(g, s, p, make_uri(PKG[stereo['targetFile']], cl))

        elif sname in OREL_STEREOS:
            po = []
            target_file = stereo.get('targetFile')
            for t, handlers in OREL_STEREOS[sname]['tags'].items():
                for val in stereo['tags'].get(t, []):
                    po_p = handlers["p"](val, umlfile, clazz, stereo, target_file)
                    po_o = handlers["o"](val, umlfile, clazz, stereo, target_file)
                    po.append({"p": po_p, "o": po_o})
                    # Attributes named in a join/reference are declared as properties,
                    # on both the neighbor side and this class's own side.
                    if po_p in attribute_predicates:
                        create_prop_from_rel(g, po_o, val)
            if stereo.get('targetClass') is not None:
                po.append({"p": make_top_uri("hasNeighbor"),
                           "o": make_uri(PKG[stereo['targetFile']], stereo['targetClass'])})
            elif stereo.get('targetClasses'):
                for neighbor in stereo['targetClasses']:
                    po.append({"p": make_top_uri("hasNeighbor"),
                               "o": make_uri(PKG[stereo['targetFile']], neighbor)})
            create_orel(g, OREL_STEREOS[sname]['orel'], s, p, po)

        elif sname == 'config':
            # each configKV value is expected in the form "key,value"
            for kv in stereo['tags'].get('configKV', []):
                toks = (kv or "").split(",")
                if len(toks) == 2:
                    cp = make_uri("aws", "config-" + toks[0])
                    co = Literal(toks[1])
                    add_po(g, s, cp, co)
                    add_config(g, cp)
                else:
                    print("Warn: illegal config in " + str(stereo))
        else:
            print("Warn: unknown stereotype " + str(stereo))

    # Now onto properties
    #
    # We won't take all the properties. That's great input for data mesh, but we'll consider only a few properties
    #   Product keys
    #   Props with config (*** Not today, maybe later ***)
    #   Properties used in join/refers
    #
    # Will not consider
    #   Enums
    #   Properties not dealt with above
    #   Properties further down the tree.
    #
    # How to handle types
    #   Model as subPropertyOf the type-level property instead of range
    #   The IMDBID type is a great illustration
    #   It is better to say "MovieID is a sub-property of IMDBID" than
    #   "MovieID has range IMDBID".
    keys = []
    for prop in clazz['properties']:
        prop_name = prop['name']
        for prop_stereo in prop['stereotypes']:
            propuri = create_data_prop(g, PKG[umlfile], prop_name, s)
            if prop_stereo['name'] == 'productKey':
                keys.append(propuri)
                if len(prop['type']) > 0:
                    create_data_type_prop(g, PKG[umlfile], prop['type'], propuri)

    # owl:hasKey points at an RDF list (rdf:first / rdf:rest chain ending in rdf:nil).
    if len(keys) > 0:
        list_uri = BNode()
        add_po(g, s, OWL.hasKey, list_uri)
        for idx, k in enumerate(keys, start=1):
            add_po(g, list_uri, RDF.first, k)
            if idx == len(keys):
                add_po(g, list_uri, RDF.rest, NIL)
            else:
                next_list_uri = BNode()
                add_po(g, list_uri, RDF.rest, next_list_uri)
                list_uri = next_list_uri

# Build the ontology graph: one class per data product and per implementation.
# convert_class() adds its triples to this module-level graph `g`.
g = Graph()
for umlfile in file_prod_impl_map:
    for product in file_prod_impl_map[umlfile]['products']:
        print(product['name'])
        s = create_data_product(g, PKG[umlfile], product['name'])
        convert_class(umlfile, product, s)
    for impl in file_prod_impl_map[umlfile]['impls']:
        print(impl['name'])
        s = create_impl(g, PKG[umlfile], impl['name'])
        convert_class(umlfile, impl, s)

# Write the graph as Turtle into the mmgen/ folder.
g.serialize(destination = 'mmgen/mmgen.ttl', format='turtle')

## Copy generated RDF to S3
mmgen.ttl is on the notebook instance, but we need it in S3 to load it into Neptune. 

Run the next cell to move it to S3.

In [None]:
%%bash -s "$STAGING_BUCKET"

# $1 is the single argument passed to this script on the magic line above.
cd mmgen
# Copy the generated ontology to the data/ prefix of the bucket.
aws s3 cp mmgen.ttl s3://$1/data/mmgen.ttl

## Upload three ontologies
Load from S3 all three aforementioned ontologies to Neptune.

Run each of the next six cells in sequential order. There are three loads, and three load statuses. The loads might take a few seconds to complete. Wait for the spinner to stop with a status of LOAD_COMPLETED or LOAD_FAILED. The load statuses reveal any issues in each load. We don't expect any.

In [None]:
%load -s s3://{STAGING_BUCKET}/data/mm.ttl -f turtle --store-to loadres1 --run

In [None]:
%load_status {loadres1['payload']['loadId']} --errors --details

In [None]:
%load -s s3://{STAGING_BUCKET}/data/mmgen.ttl -f turtle --store-to loadres2 --run

In [None]:
%load_status {loadres2['payload']['loadId']} --errors --details

In [None]:
%load -s s3://{STAGING_BUCKET}/data/movkg.ttl -f turtle --store-to loadres3 --run

In [None]:
%load_status {loadres3['payload']['loadId']} --errors --details

## Query the products 
Now some queries

### Get list of data products
Run the cell and compare to the UML models. The only product not in the UML models is the LonelyProduct. More on this in a moment.

In [None]:
%%sparql

PREFIX : 

select ?product where {
 ?product rdfs:subClassOf :DataProduct .
} 
ORDER BY ?product 

### Get list of products and their impls
Let's now see for each product its implementation.

First, let's fill in some gaps. An impl can have an impl. We know MovieDocument hasImpl MovieDocumentImpl, and that MovieDocumentImpl copies MovieSearchDocument. Let's complete the chain. The following insert takes impls of impls and ties them to the original product.

In [None]:
%%sparql

PREFIX : 

INSERT {
 ?product :hasImpl ?impl 
}
WHERE {
 ?product rdfs:subClassOf :DataProduct .
 ?impl rdfs:subClassOf :DataProductImpl . 
 ?product (:hasImpl|:copies|:caches|:locatedIn)+ ?impl .
}

And now query to bring back products and their impls (including those that are impls of impls).

In [None]:
%%sparql

PREFIX : 

select ?product (GROUP_CONCAT(?impl;SEPARATOR=",") AS ?impls) where {
 ?product rdfs:subClassOf :DataProduct .
 OPTIONAL { 
 ?product :hasImpl ?impl .
 } .
} 
GROUP BY ?product
ORDER BY ?product 

### Describe a product
Try both the Table and Graph tabs!

In [None]:
%%sparql

# describe mode https://docs.aws.amazon.com/neptune/latest/userguide/sparql-query-hints-for-describe.html#sparql-query-hints-describeMode
PREFIX : 
prefix movkg: 
PREFIX hint: 

describe movkg:MovieResource
{
 hint:Query hint:describeMode "CBD"
}

### Which products use OpenSearch and Elasticache?

In [None]:
%%sparql

PREFIX : 
prefix aws: 

select * where {
 ?product rdfs:subClassOf :DataProduct .
 ?product :hasImpl/:awsService aws:OpenSearch .
 ?product :hasImpl/:awsService aws:Elasticache .
}

### Story and Movie Related?
Use SPARQL ASK to check if StoryAnalysis product is somehow connected to MovieResource or MovieDocument products. It IS!!!!! The blog post discusses why.

In [None]:
%%sparql
PREFIX : 
prefix movkg: 
prefix movvideo: 
prefix movstory: 
prefix movlake: 
prefix movdoc: 
prefix aws: 

ask where {
 BIND(movstory:StoryAnalysis as ?product) .
 
 ?product ((:hasNeighbor|:hasNeighborAttribute|:joins|:refersTo|:similarTo|
 :hasSource|:sourceDataSet|rdfs:subPropertyOf|owl:hasKey/rdf:first|rdfs:domain|rdfs:range|rdfs:subPropertyOf) |^ 
 (:hasNeighbor|:hasNeighborAttribute|:joins|:refersTo|:similarTo|
 :hasSource|:sourceDataSet|rdfs:subPropertyOf|owl:hasKey/rdf:first|rdfs:domain|rdfs:range|rdfs:subPropertyOf))* ?rel .

 FILTER(?rel = movkg:MovieResource || ?rel = movdoc:MovieDocument) . 
} 


### Story and IMDB?
Are we able to connect StoryAnalysis to the IMDB? We are!

In [None]:
%%sparql
PREFIX : 
prefix movkg: 
prefix movvideo: 
prefix movstory: 
prefix movlake: 
prefix movdoc: 
prefix aws: 

ask where {
 BIND(movstory:StoryAnalysis as ?product) .
 
 ?product ((:hasNeighbor|:hasNeighborAttribute|:joins|:refersTo|:similarTo|
 :hasSource|:sourceDataSet|rdfs:subPropertyOf|owl:hasKey/rdf:first|rdfs:domain|rdfs:range|rdfs:subPropertyOf) |^ 
 (:hasNeighbor|:hasNeighborAttribute|:joins|:refersTo|:similarTo|
 :hasSource|:sourceDataSet|rdfs:subPropertyOf|owl:hasKey/rdf:first|rdfs:domain|rdfs:range|rdfs:subPropertyOf))* ?rel .

 FILTER(?rel = movdoc:IMDBID) . 
} 

### Story and the Lonely product
Can we connect the story product to the lonely product? Of course not. Lonely product is "lonely" in the graph sense - it has no neighbors.

In [None]:
%%sparql
PREFIX : 
prefix movkg: 
prefix movvideo: 
prefix movstory: 
prefix movlake: 
prefix movdoc: 
prefix aws: 

ask where {
 BIND(movstory:StoryAnalysis as ?product) .
 
 ?product ((:hasNeighbor|:hasNeighborAttribute|:joins|:refersTo|:hasSimilarity|
 :hasSource|:sourceDataSet|rdfs:subPropertyOf|owl:hasKey/rdf:first|rdfs:domain|rdfs:range|rdfs:subPropertyOf) |^ 
 (:hasNeighbor|:hasNeighborAttribute|:joins|:refersTo|:hasSimilarity|
 :hasSource|:sourceDataSet|rdfs:subPropertyOf|owl:hasKey/rdf:first|rdfs:domain|rdfs:range|rdfs:subPropertyOf))* ?rel .

 FILTER(?rel = :LonelyProduct) . 
} 

## Movie Example

### Populate sample data
Insert a movie, a couple of its roles, stories that mention, video analysis, links to IMDB, DBPedia, Wikidata

In [None]:
%%sparql
PREFIX : 
prefix movkg: 
prefix movvideo: 
prefix movstory: 
prefix movlake: 
prefix movdoc: 

INSERT DATA {
 movkg:Shining a movkg:MovieResource .
 movkg:Shining movdoc:MovieID "tt0081505" .
 movkg:Shining movkg:hasDBPediaRef .
 movkg:Shining movkg:hasWikidataRef .
 
 # cast - a couple contributors to give the idea
 movkg:RoleShining_Jack a movkg:RoleResource .
 movkg:RoleShining_Jack movkg:hasMovie movkg:Shining .
 movkg:RoleShining_Jack movkg:hasContribClass movkg:Actor .
 movkg:RoleShining_Jack movkg:hasContrib movkg:JackNicholson .

 # director role (single colon after the prefix, like every other resource here)
 movkg:RoleShining_Kubrick_Dir a movkg:RoleResource .
 movkg:RoleShining_Kubrick_Dir movkg:hasMovie movkg:Shining .
 movkg:RoleShining_Kubrick_Dir movkg:hasContribClass movkg:Director .
 movkg:RoleShining_Kubrick_Dir movkg:hasContrib movkg:StanleyKubrick .

 # producer role - all four triples share the same subject
 movkg:RoleShining_Kubrick_Prod a movkg:RoleResource .
 movkg:RoleShining_Kubrick_Prod movkg:hasMovie movkg:Shining .
 movkg:RoleShining_Kubrick_Prod movkg:hasContribClass movkg:Producer .
 movkg:RoleShining_Kubrick_Prod movkg:hasContrib movkg:StanleyKubrick .

 movkg:JackNicholson a movkg:ContributorResource . 
 movkg:JackNicholson movkg:ContribID "nm0000197" .
 movkg:JackNicholson movkg:hasDBPediaRef .
 movkg:JackNicholson movkg:hasWikidataRef .

 movkg:StanleyKubrick a movkg:ContributorResource . 
 movkg:StanleyKubrick movkg:ContribID "nm0000040" .
 movkg:StanleyKubrick movkg:hasDBPediaRef .
 movkg:StanleyKubrick movkg:hasWikidataRef .

 # stories that mention
 movkg:Story_Staycation_in_Hollywood a movkg:StorytResource .
 movkg:Story_Staycation_in_Hollywood movstory:StoryTitle "Staycation in Hollywood" .
 movkg:Story_Staycation_in_Hollywood movkg:mentions movkg:Shining .
 movkg:Story_Starve_Cabin_Fever_Until_Spring a movkg:StorytResource .
 movkg:Story_Starve_Cabin_Fever_Until_Spring movstory:StoryTitle "Starve Cabin Fever Until Spring" .
 movkg:Story_Starve_Cabin_Fever_Until_Spring movkg:mentions movkg:Shining .
 
 # video analysis
 movkg:Analysis_123456789 a movkg:VideoAnalysisResource .
 movkg:Shining movkg:hasVideoAnalysis movkg:Analysis_123456789 .
 movkg:Analysis_123456789 movvideo:VideoID "123456789" .
 movkg:Analysis_123456789 movvideo:S3IngestLocation "s3://va_abcderfg_123456789/ingest" .
 movkg:Analysis_123456789 movvideo:S3AnalysisLocation "s3://va_abcderfg_123456789/analysis" .
 movkg:Analysis_123456789 movvideo:MP4FileName "0081505_shining.mp4" .
 movkg:Analysis_123456789 movkg:hasRekognitionCeleb movkg:Analysis_123456789_celeb0 .
 movkg:Analysis_123456789_celeb0 movkg:celebName "Jeff Bezos" .
 movkg:Analysis_123456789_celeb0 movkg:hasWikidataRef .
 movkg:Analysis_123456789_celeb0 movdoc:ContribID "nm1757263" . # this is an IMDB ID
 # more detail on occurences of cebeb in video in S3AnalysisLocation given above
 
}

### With MovieID (IMDBID) as input, get basic details of the movie

In [None]:
%%sparql

PREFIX : 
prefix movkg: 
prefix movvideo: 
prefix movstory: 
prefix movlake: 
prefix movdoc: 
prefix aws: 

SELECT ?movie ?dbp ?wiki ?storyMention ?video ?mp4
WHERE 
{
 ?movie movdoc:MovieID "tt0081505" .
 ?movie a movkg:MovieResource .
 OPTIONAL {?movie movkg:hasDBPediaRef ?dbp . } .
 OPTIONAL {?movie movkg:hasWikidataRef ?wiki . } .
 
 # bring in story mentions
 OPTIONAL {?storyMention movkg:mentions ?movie . } .
 
 # bring in video analyis
 OPTIONAL {?movie movkg:hasVideoAnalysis ?video . ?video movvideo:MP4FileName ?mp4 . } .
} 


### Knowing the movie URI, DESCRIBE it

See https://docs.aws.amazon.com/neptune/latest/userguide/sparql-query-hints-for-describe.html for more on DESCRIBE in Neptune. Try the Graph view too.

In [None]:
%%sparql

PREFIX : 
prefix movkg: 
PREFIX hint: 

describe movkg:Shining
{
 hint:Query hint:describeMode "CBD"
}

### Get video analysis - celebs

In [None]:
%%sparql

PREFIX : 
prefix movkg: 
prefix movvideo: 
prefix movstory: 
prefix movlake: 
prefix movdoc: 
prefix aws: 

SELECT ?movie ?mp4 ?celebName ?celebWikdata ?celebIMDB ?roleX
WHERE 
{
 ?movie movdoc:MovieID "tt0081505" .
 ?movie a movkg:MovieResource .
 
 ?movie movkg:hasVideoAnalysis ?video . 
 ?video movvideo:MP4FileName ?mp4 .
 OPTIONAL {
 # bring in celebs in video analysis
 ?video movkg:hasRekognitionCeleb ?celeb .
 ?celeb movkg:celebName ?celebName .
 ?celeb movkg:hasWikidataRef ?celebWikdata .
 ?celeb movdoc:ContribID ?celebIMDB .
 OPTIONAL {
 # Is the celeb a contributor in the movie
 ?roleX movkg:hasContributor ?contribX .
 ?contribX a movkg:ContributorResource .
 ?roleX movkg:hasMovie ?movie .
 ?contribX movdoc:ContribID ?celebIMDB . 
 }
 } 
} 


### Pull in DBPedia

In [None]:
%%sparql

PREFIX : 
prefix movkg: 
prefix movdoc: 
prefix aws: 

SELECT ?p ?o 
WHERE 
{
 ?movie movdoc:MovieID "tt0081505" .
 ?movie a movkg:MovieResource .
 ?movie movkg:hasDBPediaRef ?dbp .
 SERVICE {
 ?dbp ?p ?o . 
 }
}


### Pull in Wikidata

In [None]:
%%sparql

PREFIX : 
prefix movkg: 
prefix movdoc: 

SELECT ?p ?o 
WHERE 
{
 ?movie movdoc:MovieID "tt0081505" .
 ?movie a movkg:MovieResource .
 ?movie movkg:hasWikidataRef ?wiki .
 SERVICE {
 ?wiki ?p ?o . 
 } 
}


## Cleanup
If you messed up... 
Either of the two approaches works.

In [None]:
%%sparql

delete {?s ?p ?o} where {?s ?p ?o}

In [None]:
%db_reset