"""
"""
import io
from rdflib.term import Identifier
from rdflib import ConjunctiveGraph, Namespace, URIRef
from rdflib.namespace import NamespaceManager
from rdflib.tools import rdf2dot
import pyshacl
import graphviz
import json
import requests
from extruct.jsonld import JsonLdExtractor
import logging
import re
# Canonical schema.org namespace and the prefix it is bound to in graphs
SCHEMA_ORG = "https://schema.org/"
SO_PREFIX = "SO"
# Prefix declarations prepended to the SPARQL queries in this module
SPARQL_PREFIXES = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX SO: <https://schema.org/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX datacite: <http://purl.org/spar/datacite/>
"""
# Mapping to undo case confusion
# For example, "propertyId" should be "propertyID"
# The LHS is the lowercase match to the correct RHS value
SO_TERMS = {"propertyid": "propertyID", "dataset": "Dataset"}
# Match variants of "https://schema.org/": http or https, with or without the
# trailing slash. The slash stays optional so that terms expanded against a
# slash-less namespace (e.g. "http://schema.orgDataset") are still recognized.
# "s?" is used rather than ".{0,1}" so that only the http and https schemes
# match, not an arbitrary character after "http".
RE_SO = re.compile(r"^https?://schema\.org/?")
logger = logging.getLogger(__name__)
def _desloppifyTerm(g, t):
    """
    Deal with sloppy case consistency in SO term use

    for example:

      SO:propertyId should be SO:propertyID

    Only ``URIRef`` terms whose computed prefix is ``SO_PREFIX`` are adjusted,
    using the lowercase lookup table ``SO_TERMS``. Any other term is returned
    unchanged. This is best-effort: a failure while computing the qname is
    logged at debug level and the original term is returned.

    Args:
        g: graph containing t
        t: term to de-slop

    Returns:
        term, de-slopped
    """
    if not isinstance(t, URIRef):
        return t
    try:
        # compute_qname yields (prefix, namespace, term)
        prefix, namespace, name = g.namespace_manager.compute_qname(t)
        if prefix == SO_PREFIX:
            # Check the term value for case errors
            t_val = SO_TERMS.get(name.lower(), name)
            if t_val != name:
                logger.info(f"replacing SO:{name} with {t_val}")
            # return the normalized term
            return URIRef(t_val, namespace)
    except Exception as e:
        # not a qname. Odd, but continue; record why instead of hiding it
        logger.debug("could not de-slop term %s: %s", t, e)
    return t
def _normalizeTerm(t):
    """
    Hack the URIRefs to normalize schema.org to use "https://schema.org/"

    This is an ugly solution to the problem of variable representations of
    the schema.org namespace in the wild.

    A ``URIRef`` matching ``RE_SO`` has the matched namespace prefix removed,
    one trailing "/" stripped from the remainder, and is rebuilt relative to
    ``SCHEMA_ORG``. Terms that are not ``URIRef`` instances, or that do not
    match ``RE_SO``, are returned unchanged.

    Args:
        t: Graph term to process

    Returns:
        Graph term normalized to namespace <https://schema.org/>
    """
    if not isinstance(t, URIRef):
        return t
    v = str(t)
    so_match = RE_SO.match(v)
    if so_match is None:
        return t
    v = v[so_match.end() :]
    # The remainder is empty when the term is the bare namespace itself;
    # endswith() handles that case where indexing v[-1] would raise IndexError.
    if v.endswith("/"):
        v = v[:-1]
    return URIRef(v, SCHEMA_ORG)
def loadSOGraph(
    filename=None,
    data=None,
    publicID=None,
    normalize=True,
    deslop=True,
    format="json-ld",
):
    """
    Load RDF string or file to an RDFLib ConjunctiveGraph

    Builds a ConjunctiveGraph from ``data`` or, when ``data`` is None, from
    ``filename``. If both are provided then ``data`` is used. If neither is
    provided the graph is empty.

    NOTE: Namespace use of ``<http://schema.org>``, ``<https://schema.org>``, or
    ``<http://schema.org/>`` is normalized to ``<https://schema.org/>`` if
    ``normalize`` is True.

    NOTE: Case of ``SO:`` properties in `SO_TERMS` is adjusted for consistency
    if ``deslop`` is True

    When both ``normalize`` and ``deslop`` are False the parsed graph is
    returned as-is; otherwise a new graph is built triple by triple with
    ``SO_PREFIX`` bound to ``SCHEMA_ORG``.

    Args:
        filename (string):  path to RDF file on disk
        data (string): RDF text
        publicID (string): (from rdflib) The logical URI to use as the document base. If None specified the document location is used.
        normalize (boolean): Normalize the use of schema.org namespace
        deslop (boolean): Adjust schema.org terms for case consistency
        format (string): The serialization format of the RDF to load

    Returns:
        ConjunctiveGraph: The loaded graph

    Example:

    .. jupyter-execute:: examples/code/eg_loadsograph_01.py

    """
    source_graph = ConjunctiveGraph()
    if data is not None:
        source_graph.parse(data=data, format=format, publicID=publicID)
    elif filename is not None:
        source_graph.parse(filename, format=format, publicID=publicID)
    if not normalize and not deslop:
        # Nothing to clean up; hand back the parsed graph directly
        return source_graph
    # Bind the canonical schema.org namespace for the cleaned graph
    namespaces = NamespaceManager(source_graph)
    namespaces.bind(SO_PREFIX, SCHEMA_ORG, override=True, replace=True)
    cleaned_graph = ConjunctiveGraph()
    cleaned_graph.namespace_manager = namespaces
    for triple in source_graph:
        terms = list(triple)
        if normalize:
            terms = [_normalizeTerm(term) for term in terms]
        if deslop:
            # qnames are resolved against the source graph's namespaces
            terms = [_desloppifyTerm(source_graph, term) for term in terms]
        cleaned_graph.add(terms)
    return cleaned_graph
def loadSOGraphFromHtml(html, url):
    """
    Extract jsonld entries from provided HTML text

    Each JSON-LD entry found by ``JsonLdExtractor`` is re-serialized to JSON,
    loaded with ``loadSOGraph`` (default normalization options) and merged
    into a single graph.

    Args:
        html(string): HTML text to be parsed
        url(string): passed to ``loadSOGraph`` as ``publicID`` for each entry

    Returns:
        ConjunctiveGraph: Graph loaded from html
    """
    merged = ConjunctiveGraph()
    for entry in JsonLdExtractor().extract(html):
        merged += loadSOGraph(data=json.dumps(entry), publicID=url)
    return merged
def loadSOGraphFromUrl(url, timeout=30):
    """
    Loads graph from json-ld contained in a landing page.

    The page is retrieved with an HTTP GET and its text is handed to
    ``loadSOGraphFromHtml`` together with the final response URL.

    Args:
        url (string): Url to process
        timeout (float or tuple): Passed to ``requests.get`` as ``timeout`` so
            that an unresponsive server cannot block the caller indefinitely.
            Use ``None`` to wait without limit.

    Returns:
        ConjunctiveGraph: Graph of instance

    Raises:
        ValueError: if the response status is not ``requests.codes.ok``.
        Exceptions raised by ``requests.get`` (including timeouts) are not
        caught here.

    Example:

    .. jupyter-execute:: examples/code/eg_loadfromurl_01.py

    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != requests.codes.ok:
        raise ValueError(
            f"GET request to {url} returned a status of {response.status_code}"
        )
    # response.url is used as the document base rather than the requested url
    return loadSOGraphFromHtml(response.text, response.url)
def inflateSubgraph(g, sg, ts, depth=0, max_depth=100):
    """
    Inflate the subgraph sg to contain all children of sg appearing in g.

    Starting from the triples in ``ts``, every triple of ``g`` whose subject
    is the object of a frontier triple is copied into ``sg``. Newly copied
    triples form the next frontier. Expansion stops when a pass adds nothing
    new or when ``depth`` exceeds ``max_depth``.

    Args:
        g (Graph): The master graph from which the subgraph is extracted
        sg (Graph): The subgraph, modified in place
        ts (iterable of triples): list of triples, the objects of which identify subjects to copy from g
        depth (integer): tracks depth of expansion
        max_depth (integer): maximum expansion depth for retrieving terms

    Returns:
        None
    """
    # Snapshot the starting triples: callers may pass sg itself as ts, and sg
    # is modified below while the frontier is being walked.
    frontier = list(ts)
    while frontier:
        new_trips = []
        for t in frontier:
            if isinstance(t[2], Identifier):
                for trip in g.triples((t[2], None, None)):
                    if trip not in sg:
                        sg.add(trip)
                        new_trips.append(trip)
        if not new_trips:
            return
        depth += 1
        # max_depth is honored on every level, not only the first
        if depth > max_depth:
            return
        frontier = new_trips
    return
def getSubgraph(g, subject, max_depth=100):
    """
    Retrieve the subgraph of g with subject.

    Given the graph ``g``, copy every triple whose subject is ``subject``
    into a new graph, then expand that graph with ``inflateSubgraph`` so it
    also holds the triples reachable through the objects of those triples.

    Args:
        g (Graph): Source graph
        subject (URIRef): Subject of the root of the subgraph to retrieve
        max_depth (integer): Maximum recursion depth

    Returns:
        (Graph) The subgraph of g with subject.

    Example:

    .. jupyter-execute:: examples/code/eg_getsubgraph_01.py

    """
    subgraph = ConjunctiveGraph()
    subgraph.namespace_manager = NamespaceManager(g)
    # Seed with the triples directly attached to the requested subject
    root_triples = g.triples((subject, None, None))
    subgraph += root_triples
    # The seeded subgraph doubles as the initial list of triples to expand
    inflateSubgraph(g, subgraph, subgraph, max_depth=max_depth)
    return subgraph
def validateSHACL(shape_graph, data_graph):
    """
    Validate data against a SHACL shape using common options.

    Calls ``pyshacl.validate`` with RDFS inference, meta-SHACL checking and
    advanced features enabled, without aborting on the first error.

    Args:
        shape_graph (ConjunctiveGraph): A SHACL shape graph
        data_graph (ConjunctiveGraph): Data graph to be validated with shape_graph

    Returns (tuple): Conformance (boolean), result graph (Graph) and result text

    Example:

    .. jupyter-execute:: examples/code/eg_validate_01.py

    """
    options = {
        "shacl_graph": shape_graph,
        "inference": "rdfs",
        "meta_shacl": True,
        "abort_on_error": False,
        "debug": False,
        "advanced": True,
    }
    conforms, report_graph, report_text = pyshacl.validate(data_graph, **options)
    return conforms, report_graph, report_text
def renderGraph(g):
    """
    For rendering an rdflib graph in Jupyter notebooks

    The graph is written as dot text with ``rdf2dot`` into an in-memory
    buffer, and that text is wrapped in a ``graphviz.Source``.

    Args:
        g (Graph): The graph to render

    Returns:
        Jupyter cell: Output for rendering directly in the notebook

    Example:

    .. jupyter-execute:: examples/code/eg_rendergraph_01.py

    """
    with io.StringIO() as dot_buffer:
        rdf2dot.rdf2dot(g, dot_buffer)
        dot_text = dot_buffer.getvalue()
    return graphviz.Source(dot_text)
def hasDataset(g):
    """
    Number of SO:Dataset graphs in g

    Counts the rows returned by a SPARQL query selecting every subject typed
    ``SO:Dataset``. Despite the name, the result is a count rather than a
    boolean.

    Args:
        g (Graph): The graph to evaluate

    Returns:
        integer: Number of SO:Dataset graphs in g

    Example:

    .. jupyter-execute:: examples/code/eg_hasdataset_01.py

    """
    select_datasets = """
SELECT ?x
{
?x rdf:type SO:Dataset .
}
"""
    query = SPARQL_PREFIXES + select_datasets
    return len(g.query(query))
def getLiteralDatasetIdentifiers(g):
    """
    Retrieve literal SO:Dataset.identifier entries

    Selects ``SO:identifier`` values of ``SO:Dataset`` subjects that are
    literals and reports each one as a dictionary.

    Args:
        g (Graph): Graph containing ``SO:Dataset``

    Returns:
        list: A list of ``{value:, url:, propertyId:}`` with url=None and propertyId="Literal"
    """
    select_literals = """
SELECT ?y
WHERE {
?x rdf:type SO:Dataset .
?x SO:identifier ?y .
FILTER (isLiteral(?y)) .
}
"""
    query = SPARQL_PREFIXES + select_literals
    return [
        {"value": str(row[0]), "propertyId": "Literal", "url": None}
        for row in g.query(query)
    ]
def getStructuredDatasetIdentifiers(g):
    """
    Extract structured SO:Dataset.identifier entries

    Selects ``SO:identifier`` nodes of ``SO:Dataset`` subjects that are typed
    ``SO:PropertyValue`` or ``datacite:ResourceIdentifier`` and that carry
    both ``SO:value`` and ``SO:propertyID``. ``SO:url`` is optional.

    Args:
        g (Graph): Graph containing ``SO:Dataset``

    Returns:
        list: A list of ``{value:, url:, propertyId:}``. ``url`` is ``None``
        when the identifier has no ``SO:url``.
    """
    q = (
        SPARQL_PREFIXES
        + """
SELECT DISTINCT ?value ?url ?propid
WHERE {
?x rdf:type SO:Dataset .
?x SO:identifier ?y .
?y rdf:type ?tt .
?y SO:value ?value .
?y SO:propertyID ?propid .
OPTIONAL { ?y SO:url ?url } .
FILTER (?tt = SO:PropertyValue || ?tt = datacite:ResourceIdentifier)
}
"""
    )
    res = []
    for value, url, propid in g.query(q):
        res.append(
            {
                "value": str(value),
                # ?url is OPTIONAL and may be unbound; keep None rather than
                # the string "None" that str(None) would produce
                "url": None if url is None else str(url),
                "propertyId": str(propid),
            }
        )
    return res
def getDatasetIdentifiers(g):
    """
    Return a list of ``SO:Dataset.identifier`` entries from the provided Graph

    Literal identifiers come first, followed by structured identifiers.

    Args:
        g (Graph): Graph containing ``SO:Dataset``

    Returns:
        list: A list of ``{value:, url:, propertyId:}``

    Example:

    .. jupyter-execute:: examples/code/eg_datasetidentifiers_01.py

    """
    # Identifiers that are plain literals with no additional context
    literal_ids = getLiteralDatasetIdentifiers(g)
    # Identifiers expressed as typed nodes with value / propertyID / url
    structured_ids = getStructuredDatasetIdentifiers(g)
    return literal_ids + structured_ids