In this guide we will use neosemantics and APOC NLP to create a sports knowledge graph.
neosemantics (n10s) is a plugin that enables the use of RDF and its associated vocabularies (such as OWL, RDFS, SKOS and others) in Neo4j.
You can use n10s to easily build integrations with RDF-generating / RDF-consuming components. You can also use it to validate your graph against constraints expressed in SHACL or to run basic inferencing.
Wikidata is a free and open knowledge base that can be read and edited by both humans and machines. It acts as central storage for the structured data of its Wikimedia sister projects including Wikipedia.
// Uniqueness constraint on the uri property of Resource nodes.
CREATE CONSTRAINT n10s_unique_uri
ON (resource:Resource)
ASSERT resource.uri IS UNIQUE;

// Initialise the n10s graph configuration with handleVocabUris set to MAP.
CALL n10s.graphconfig.init({ handleVocabUris: 'MAP' });

// Register the "neo" prefix for the custom vocabulary namespace.
CALL n10s.nsprefixes.add("neo", "neo4j://voc#");

// Map the vocabulary's about / part_of terms onto ABOUT / PART_OF graph elements.
CALL n10s.mapping.add('neo4j://voc#about', 'ABOUT');
CALL n10s.mapping.add('neo4j://voc#part_of', 'PART_OF');
// Import people, clubs, leagues and their English Wikipedia pages from the
// Wikidata SPARQL endpoint, issuing one request per league id.
//
// The URL-encoded CONSTRUCT query emits neo:Human, neo:Club, neo:League and
// neo:WikipediaPage resources connected by neo:name / neo:part_of / neo:about.
// It keeps only items with a date of birth on or after 1970-01-01 and labels
// whose language is "en". leagueId is concatenated into the query as a wd: id.
//
// The URL must stay on a single line: a line break inside the quoted literal
// becomes part of the URL and corrupts the surrounding percent-escape.
UNWIND ["Q324867","Q9448","Q15804"] AS leagueId
WITH leagueId
CALL n10s.rdf.import.fetch(
'https://query.wikidata.org/sparql?query=prefix%20neo%3A%20%3Cneo4j%3A%2F%2Fvoc%23%3E%20%0ACONSTRUCT%20%7B%0A%20%20%3Fitem%20neo%3Aname%20%3Flabel%3B%0A%20%20%20%20%20%20%20%20neo%3Apart_of%20%3Fclub%3B%0A%20%20%20%20%20%20%20%20a%20neo%3AHuman.%0A%20%20%3Farticle%20a%20neo%3AWikipediaPage%3B%20%0A%20%20%20%20%20%20%20%20%20%20%20%20%20neo%3Aabout%20%3Fitem%20%3B%20%20%20.%0A%20%20%3Fclub%20a%20neo%3AClub%20%3B%20%0A%20%20%20%20%20%20%20%20%20%20neo%3Aname%20%3FclubName%20%3B%0A%20%20%20%20%20%20%20%20%20%20neo%3Apart_of%20%3Fleague%20.%0A%20%20%3Fleague%20a%20neo%3ALeague%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20neo%3Aname%20%3FleagueName%20.%0A%20%20%0A%20%20%3FclubArticle%20a%20neo%3AWikipediaPage%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20neo%3Aabout%20%3Fclub%20.%20%20%20%20%20%20%0A%7D%0AWHERE%20%0A%7B%0A%20%20%3Fclub%20wdt%3AP118%20wd%3A' + leagueId + '%2C%20%3Fleague%20.%0A%20%20%0A%20%20%3Fitem%20wdt%3AP106%20wd%3AQ937857%20%3B%20wdt%3AP569%20%3Fdob%20.%0A%20%20FILTER(xsd%3Adate(%3Fdob)%20%3E%3D%20%221970-01-01%22%5E%5Exsd%3Adate%20)%0A%20%20%3Fitem%20wdt%3AP54%20%3Fclub%20.%0A%20%20%0A%20%20%0A%20%20%3Fitem%20rdfs%3Alabel%20%3Flabel%20.%0A%20%20filter(lang(%3Flabel)%20%3D%20%22en%22)%0A%20%20%0A%20%20%3Fclub%20rdfs%3Alabel%20%3FclubName%20.%0A%20%20filter(lang(%3FclubName)%20%3D%20%22en%22)%0A%20%20%0A%20%20%3Fleague%20rdfs%3Alabel%20%3FleagueName%20.%0A%20%20filter(lang(%3FleagueName)%20%3D%20%22en%22)%0A%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%20%3Farticle%20schema%3Aabout%20%3Fitem%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AinLanguage%20%22en%22%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AisPartOf%20%3Chttps%3A%2F%2Fen.wikipedia.org%2F%3E%20.%0A%20%20%7D%0A%20%20%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%20%3FclubArticle%20schema%3Aabout%20%3Fclub%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AinLanguage%20%22en%22%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AisPartOf%20%3Chttps%3A%2F%2Fen.wikipedia.org%2F%3E%20.%20%20%20%0A%20%20%20%20%7D%0A%20%20%7D%20',
'Turtle' ,
{ headerParams: { Accept: "application/x-turtle" } }
)
YIELD terminationStatus, triplesLoaded, triplesParsed, namespaces, extraInfo, callParams
RETURN leagueId, terminationStatus, triplesLoaded, triplesParsed, namespaces, extraInfo, callParams;
// Import the three leagues (wd:Q324867, wd:Q9448, wd:Q15804), their English
// labels and, where present, their en.wikipedia.org pages as neo:League /
// neo:WikipediaPage resources. Terminated with ';' so it is a separate
// statement from the import that follows.
CALL n10s.rdf.import.fetch("https://query.wikidata.org/sparql?query=prefix%20neo%3A%20%3Cneo4j%3A%2F%2Fvoc%23%3E%20%0ACONSTRUCT%20%7B%0A%20%20%3Fleague%20a%20neo%3ALeague%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20neo%3Aname%20%3FleagueName%20.%0A%20%20%0A%20%20%3FleagueArticle%20a%20neo%3AWikipediaPage%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20neo%3Aabout%20%3Fleague%20.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%0A%7D%0AWHERE%20%0A%7B%0A%20%20%3Fclub%20wdt%3AP118%20%3Fleague%20.%0A%20%0A%20%20FILTER%20(%3Fleague%20IN%20(wd%3AQ324867%2C%20wd%3AQ9448%2C%20wd%3AQ15804%20)%20)%20%20%0A%20%20%0A%20%20%3Fleague%20rdfs%3Alabel%20%3FleagueName%20.%0A%20%20filter(lang(%3FleagueName)%20%3D%20%22en%22)%0A%0A%20%20OPTIONAL%20%7B%20%20%20%20%0A%20%20%20%20%20%20%3FleagueArticle%20schema%3Aabout%20%3Fleague%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AinLanguage%20%22en%22%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AisPartOf%20%3Chttps%3A%2F%2Fen.wikipedia.org%2F%3E%20.%20%20%20%20%0A%20%20%20%20%7D%0A%20%20%7D", 'Turtle' , { headerParams: { Accept: "application/x-turtle" } });
// Import items with wdt:P106 wd:Q628099 that are linked to a club through
// wdt:P6087, where the club belongs to one of the three leagues. Clubs,
// leagues and English Wikipedia pages are emitted with the same neo: shape as
// the other imports.
//
// The URL must stay on a single line: a line break inside the quoted literal
// becomes part of the URL and corrupts the surrounding percent-escape.
CALL n10s.rdf.import.fetch("https://query.wikidata.org/sparql?query=prefix%20neo%3A%20%3Cneo4j%3A%2F%2Fvoc%23%3E%20%0ACONSTRUCT%20%7B%0A%20%20%3Fitem%20neo%3Aname%20%3Flabel%3B%0A%20%20%20%20%20%20%20%20neo%3Apart_of%20%3Fclub%3B%0A%20%20%20%20%20%20%20%20a%20neo%3AHuman.%0A%20%20%3Farticle%20a%20neo%3AWikipediaPage%3B%20%0A%20%20%20%20%20%20%20%20%20%20%20%20%20neo%3Aabout%20%3Fitem%20%3B%20%20%20.%0A%20%20%3Fclub%20a%20neo%3AClub%20%3B%20%0A%20%20%20%20%20%20%20%20%20%20neo%3Aname%20%3FclubName%20%3B%0A%20%20%20%20%20%20%20%20%20%20neo%3Apart_of%20%3Fleague%20.%0A%20%20%3Fleague%20a%20neo%3ALeague%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20neo%3Aname%20%3FleagueName%20.%0A%20%20%0A%20%20%3FclubArticle%20a%20neo%3AWikipediaPage%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20neo%3Aabout%20%3Fclub%20.%20%20%20%20%20%20%20%20%20%0A%7D%0AWHERE%20%0A%7B%0A%20%20%3Fitem%20wdt%3AP106%20wd%3AQ628099%20.%0A%20%20%3Fitem%20wdt%3AP6087%20%3Fclub%20.%0A%20%20%3Fclub%20wdt%3AP118%20%3Fleague%20.%0A%20%20%0A%20%20FILTER%20(%3Fleague%20IN%20(wd%3AQ324867%2C%20wd%3AQ9448%2C%20wd%3AQ15804%20)%20)%20%20%0A%0A%20%20%0A%20%20%3Fitem%20rdfs%3Alabel%20%3Flabel%20.%0A%20%20filter(lang(%3Flabel)%20%3D%20%22en%22)%0A%20%20%0A%20%20%3Fclub%20rdfs%3Alabel%20%3FclubName%20.%0A%20%20filter(lang(%3FclubName)%20%3D%20%22en%22)%0A%20%20%0A%20%20%3Fleague%20rdfs%3Alabel%20%3FleagueName%20.%0A%20%20filter(lang(%3FleagueName)%20%3D%20%22en%22)%0A%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%20%3Farticle%20schema%3Aabout%20%3Fitem%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AinLanguage%20%22en%22%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AisPartOf%20%3Chttps%3A%2F%2Fen.wikipedia.org%2F%3E%20.%0A%20%20%20%20%0A%20%20%20%20%20%20%3FclubArticle%20schema%3Aabout%20%3Fclub%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AinLanguage%20%22en%22%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20schema%3AisPartOf%20%3Chttps%3A%2F%2Fen.wikipedia.org%2F%3E%20.%0A%20%20%20%20%7D%0A%20%20%7D", 'Turtle' , { headerParams: { Accept: "application/x-turtle" } });
// Up to 25 PART_OF chains, of any depth, that end at the Resource named "Premier League".
MATCH path = (league:Resource {name: "Premier League"})<-[:PART_OF*]-()
RETURN path
LIMIT 25;
// Climb PART_OF from "Manchester United F.C." to a node that has no outgoing
// PART_OF relationship, then step down one level to that node's direct members.
MATCH path = (:Resource {name: "Manchester United F.C."})-[:PART_OF*]->(top)<-[:PART_OF]-()
WHERE NOT (top)-[:PART_OF]->()
RETURN path
LIMIT 50;
We’re going to import some articles from the Guardian football section into Neo4j. We can access an RSS feed of the articles at https://www.theguardian.com/football/rss
// Load the Guardian football RSS feed and return each <item> element as column c.
CALL apoc.load.xml(
  'https://www.theguardian.com/football/rss',
  'rss/channel/item'
)
YIELD value AS element
WITH element
WHERE element._type = 'item'
RETURN element AS c;
We’re going to use the apoc.load.xml
procedure to extract the body and title from each of these articles.
We’ll wrap the call to apoc.load.xml
with a call to apoc.periodic.iterate
so that we can process the articles concurrently:
// Category nodes are unique by name.
CREATE CONSTRAINT ON (category:Category) ASSERT category.name IS UNIQUE;

// Article nodes are unique by uri.
CREATE CONSTRAINT ON (article:Article) ASSERT article.uri IS UNIQUE;
Import this static feed for predictable results: https://raw.githubusercontent.com/neo4j-examples/nlp-knowledge-graph/master/import/guardian.football.rss.xml
// Import the Guardian football RSS feed as Article and Category nodes.
//
// First statement: loads the feed and emits every <item> element as c.
// Second statement, run per item: reads the title, link, description, date
// and category children; MERGEs an Article keyed by the link; stores the
// description with anything matching '<[^>]*>' replaced by a space as body,
// plus title and datetime; then MERGEs one Category per category text and
// links it with IN_CATEGORY.
// Config: batches of 5 items, parallel execution disabled.
CALL apoc.periodic.iterate(
"CALL apoc.load.xml('https://www.theguardian.com/football/rss','rss/channel/item')
YIELD value
WITH value WHERE value._type = 'item'
RETURN value AS c",
"WITH c, [i in c._children where i._type = 'title' | i][0]._text AS title,
[i in c._children where i._type = 'link' | i][0]._text AS link,
[i in c._children where i._type = 'description' | i][0]._text AS description,
[i in c._children where i._type = 'date' | i][0]._text AS date,
[cat in [i in c._children where i._type = 'category' | i] | cat._text] AS categories
MERGE (a:Article {uri: link})
SET a.body = apoc.text.regreplace(description, '<[^>]*>', ' ') , a.title = title, a.datetime = datetime(date)
FOREACH(c in categories | MERGE (category:Category {name: c}) MERGE (a)-[:IN_CATEGORY]->(category) )",
{batchSize: 5, parallel: false}
)
YIELD batches, total, timeTaken, committedOperations
RETURN batches, total, timeTaken, committedOperations;
// All articles, newest first.
MATCH (a:Article)
RETURN
  a.uri,
  a.title,
  a.body,
  a.datetime
ORDER BY a.datetime DESC;
// Each article together with the names of the categories it is filed under.
MATCH (n:Article)
RETURN
  n.uri,
  n.title,
  [(n)-[:IN_CATEGORY]->(cat) | cat.name] AS categories;
APOC is Neo4j’s standard utility library. It includes over 450 standard procedures, providing functionality for utilities, conversions, graph updates, and more.
It has procedures that wrap the Natural Language Processing APIs for the major cloud providers, AWS, GCP, and Azure.
// Extract entities from article bodies with the GCP NLP procedure and link
// each article to the Wikipedia pages of the entities found.
//
// First statement: selects Article nodes that have no `processed` property.
// Second statement, run once per batch: streams entities for the batch's
// `body` property and marks each node as processed. For every entity that
// carries a wikipedia_url, it MERGEs a Resource whose uri is
// 'https://en.wikipedia.org/wiki/' plus the URL-encoded local name of that
// url, labels it WikipediaPage and MERGEs a HAS_ENTITY relationship to it.
// Config: BATCH_SINGLE mode, 10 articles per batch.
// Requires the $key query parameter, which is forwarded to the inner
// statement as the `key` option of the NLP call.
CALL apoc.periodic.iterate(
"MATCH (a:Article)
WHERE not(exists(a.processed))
RETURN a",
"CALL apoc.nlp.gcp.entities.stream([item in $_batch | item.a], {
nodeProperty: 'body',
key: $key
})
YIELD node, value
SET node.processed = true
WITH node, value
UNWIND value.entities AS entity
WITH entity, node
WHERE not(entity.metadata.wikipedia_url is null)
WITH node, entity.metadata.wikipedia_url AS uri
MERGE (page:Resource {uri: 'https://en.wikipedia.org/wiki/' + apoc.text.urlencode(n10s.rdf.getIRILocalName(uri))})
SET page:WikipediaPage
MERGE (node)-[:HAS_ENTITY]->(page)",
{batchMode: "BATCH_SINGLE", batchSize: 10, params: {key: $key}});
// When an article is linked both to an entity and to one of that entity's
// PART_OF ancestors, delete the HAS_ENTITY link that leads to the ancestor.
MATCH
  (article:Article)-[:HAS_ENTITY]->()-[:ABOUT]->()-[:PART_OF*]->(ancestor),
  (article)-[ancestorLink:HAS_ENTITY]->()-[:ABOUT]->(ancestor)
DELETE ancestorLink;
// Each article with its category names and the decoded local names of the
// pages it is linked to through HAS_ENTITY.
MATCH (n:Article)
RETURN
  n.uri,
  n.title,
  [(n)-[:IN_CATEGORY]->(cat) | cat.name] AS categories,
  [(n)-[:HAS_ENTITY]->(page) | apoc.text.urldecode(n10s.rdf.getIRILocalName(page.uri))] AS entities;
We can write a query that starts from a top level category and finds all the articles attached to the underlying taxonomy.
The n10s.inference.nodesInCategory
procedure automates this for us, as shown below:
// Find articles attached anywhere beneath the "Real Madrid CF" resource.
// n10s.inference.nodesInCategory is configured with ABOUT as the in-category
// relationship and PART_OF as the sub-category relationship. Each yielded
// node is then joined to the articles that reference it via HAS_ENTITY.
//
// Returns, per article: uri, title, date, the topics reached from the matched
// nodes (connectingTopics) and every topic the article is about (allTopics),
// newest first. Terminated with ';' so it is a separate statement from the
// query that follows.
MATCH (c:Resource { name : "Real Madrid CF" })
CALL n10s.inference.nodesInCategory(c, {
  inCatRel: "ABOUT",
  subCatRel: "PART_OF"
})
YIELD node
MATCH (node)<-[:HAS_ENTITY]-(article)
RETURN article.uri AS uri, article.title AS title, article.datetime AS date,
  collect([ p in (node)-[:ABOUT]->() | nodes(p)[-1].name][0]) AS connectingTopics,
  [ p in (article)-[:HAS_ENTITY]->()-[:ABOUT]->() | nodes(p)[-1].name ] AS allTopics
ORDER BY date DESC;
// For one article: the paths to the topics its entity pages are ABOUT, plus
// the paths from those entity pages to whatever else references them.
MATCH
  (article:Article {uri: "https://www.theguardian.com/football/2020/oct/14/weston-mckinnie-juventus-covid-19-cristiano-ronaldo-coronavirus"}),
  path = (article)-[:HAS_ENTITY]->(page)-[:ABOUT]->(),
  otherPath = (page)<-[:HAS_ENTITY]-()
RETURN path, otherPath;
We are exploring the :PART_OF
hierarchy up to two levels deep -[:PART_OF*..2]-
but we are excluding top level nodes none(x in nodes(path) where x:League)
as they are too "noisy"
// Articles related to the given one through the PART_OF hierarchy: walk 1-2
// PART_OF hops up from each of its topics to a shared parent and 1-2 hops
// back down to another topic, skipping any path that touches a League node.
// weight counts the connecting paths; explanation renders each path as
// "(name)--(name)--...".
MATCH
  (source:Article {uri: "https://www.theguardian.com/football/2020/oct/14/weston-mckinnie-juventus-covid-19-cristiano-ronaldo-coronavirus"}),
  (source)-[:HAS_ENTITY]->()-[:ABOUT]->(cat),
  path = (cat)-[:PART_OF*..2]->()<-[:PART_OF*..2]-(otherCat),
  (otherCat)<-[:ABOUT]-()<-[:HAS_ENTITY]-(other)
WHERE other <> source
  AND NONE(hop IN nodes(path) WHERE hop:League)
RETURN
  [(source)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS articlesEntities,
  other.title AS similarArticle,
  other.uri,
  [(other)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS similarArticleEntities,
  count(path) AS weight,
  collect(substring(reduce(text = "", hop IN nodes(path) | text + "--(" + hop.name + ")"), 2)) AS explanation
ORDER BY weight DESC;
For other articles we get no similarities at all. This one in particular (https://www.theguardian.com/football/ng-interactive/2020/oct/06/david-squires-on-anarchy-in-the-premier-league-and-extinct-dinosaurs) is about Manchester United and Liverpool FC
// Same similarity search (1-2 PART_OF hops up and down, League nodes
// excluded), run for the David Squires article.
MATCH
  (source:Article {uri: "https://www.theguardian.com/football/ng-interactive/2020/oct/06/david-squires-on-anarchy-in-the-premier-league-and-extinct-dinosaurs"}),
  (source)-[:HAS_ENTITY]->()-[:ABOUT]->(cat),
  path = (cat)-[:PART_OF*..2]->()<-[:PART_OF*..2]-(otherCat),
  (otherCat)<-[:ABOUT]-()<-[:HAS_ENTITY]-(other)
WHERE other <> source
  AND NONE(hop IN nodes(path) WHERE hop:League)
RETURN
  [(source)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS articlesEntities,
  other.title AS similarArticle,
  other.uri,
  [(other)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS similarArticleEntities,
  count(path) AS weight,
  collect(substring(reduce(text = "", hop IN nodes(path) | text + "--(" + hop.name + ")"), 2)) AS explanation
ORDER BY weight DESC;
We’re now going to add a custom ontology of football rivalries.
// Register the SKOS and RDFS namespace prefixes.
CALL n10s.nsprefixes.add("skos", "http://www.w3.org/2004/02/skos/core#");
CALL n10s.nsprefixes.add("rdfs", "http://www.w3.org/2000/01/rdf-schema#");

// Map skos:broader to PART_OF, rdfs:label to name and skos:Concept to Concept.
CALL n10s.mapping.add('http://www.w3.org/2004/02/skos/core#broader', 'PART_OF');
CALL n10s.mapping.add('http://www.w3.org/2000/01/rdf-schema#label', 'name');
CALL n10s.mapping.add('http://www.w3.org/2004/02/skos/core#Concept', 'Concept');
// Preview the football-rivalry ontology (Turtle) fetched from nsmntx.org.
// Terminated with ';' so the preview and the import run as separate statements.
CALL n10s.rdf.preview.fetch("http://www.nsmntx.org/2020/10/fo-riv", 'Turtle');

// Import the same ontology and report the outcome of the load.
CALL n10s.rdf.import.fetch("http://www.nsmntx.org/2020/10/fo-riv", 'Turtle')
YIELD terminationStatus, triplesLoaded, triplesParsed, namespaces, callParams
RETURN terminationStatus, triplesLoaded, triplesParsed, namespaces, callParams;
And now if we try our similarity query again with the article that returned no similar ones, we’ll see articles popping up based on the relationships defined in the football rivalry ontology.
// Similarity search for the David Squires article with 0-2 PART_OF hops on
// each side of the shared parent, so a topic can match itself or its direct
// parent. Paths that touch a League node are still excluded.
MATCH
  (source:Article {uri: "https://www.theguardian.com/football/ng-interactive/2020/oct/06/david-squires-on-anarchy-in-the-premier-league-and-extinct-dinosaurs"}),
  (source)-[:HAS_ENTITY]->()-[:ABOUT]->(cat),
  path = (cat)-[:PART_OF*0..2]->()<-[:PART_OF*0..2]-(otherCat),
  (otherCat)<-[:ABOUT]-()<-[:HAS_ENTITY]-(other)
WHERE other <> source
  AND NONE(hop IN nodes(path) WHERE hop:League)
RETURN
  [(source)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS articlesEntities,
  other.title AS similarArticle,
  other.uri,
  [(other)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS similarArticleEntities,
  count(path) AS weight,
  collect(substring(reduce(text = "", hop IN nodes(path) | text + "--(" + hop.name + ")"), 2)) AS explanation
ORDER BY weight DESC;
Some of the articles ranking at the top are connected to the initial article via the Team Rivalry
, which isn’t a good measure of similarity.
Let’s fix that.
Let’s remove any paths that contain the Concept
label, which will sort out that problem.
// Similarity search (0-2 PART_OF hops each side) that discards any connecting
// path containing a League or a Concept node.
MATCH
  (source:Article {uri: "https://www.theguardian.com/football/ng-interactive/2020/oct/06/david-squires-on-anarchy-in-the-premier-league-and-extinct-dinosaurs"}),
  (source)-[:HAS_ENTITY]->()-[:ABOUT]->(cat),
  path = (cat)-[:PART_OF*0..2]->()<-[:PART_OF*0..2]-(otherCat),
  (otherCat)<-[:ABOUT]-()<-[:HAS_ENTITY]-(other)
WHERE other <> source
  AND NONE(hop IN nodes(path) WHERE hop:League OR hop:Concept)
RETURN
  [(source)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS articlesEntities,
  other.title AS similarArticle,
  other.uri,
  [(other)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS similarArticleEntities,
  count(path) AS weight,
  collect(substring(reduce(text = "", hop IN nodes(path) | text + "--(" + hop.name + ")"), 2)) AS explanation
ORDER BY weight DESC;
That looks better. At the moment we’re ordering the results by the number of paths that exist between the articles. Let’s update that so that we favour articles with shorter paths.
We’ll build a score that sums the inverse path lengths. This means that:
-
A path of length 5 adds (1/5) to the score
-
A path of length 2 adds (1/2) to the score
// Similarity search scored by the sum of inverse path lengths, so shorter
// connecting paths contribute more. League and Concept nodes are excluded.
// NOTE(review): the *0..2 bounds admit a zero-length path (both articles
// share a topic), for which 1.0/length(x) divides by zero — confirm the
// intended handling.
MATCH
  (source:Article {uri: "https://www.theguardian.com/football/ng-interactive/2020/oct/06/david-squires-on-anarchy-in-the-premier-league-and-extinct-dinosaurs"}),
  (source)-[:HAS_ENTITY]->()-[:ABOUT]->(cat),
  path = (cat)-[:PART_OF*0..2]->()<-[:PART_OF*0..2]-(otherCat),
  (otherCat)<-[:ABOUT]-()<-[:HAS_ENTITY]-(other)
WHERE other <> source
  AND NONE(hop IN nodes(path) WHERE hop:League OR hop:Concept)
WITH
  [(source)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS articlesEntities,
  other.title AS similarArticle,
  other.uri AS uri,
  [(other)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS similarArticleEntities,
  count(path) AS weight,
  collect(DISTINCT substring(reduce(text = "", hop IN nodes(path) | text + "--(" + hop.name + ")"), 2)) AS explanation,
  collect(path) AS paths
RETURN
  articlesEntities,
  similarArticle,
  uri,
  similarArticleEntities,
  weight,
  explanation,
  reduce(score = 0.0, x IN paths | score + 1.0/length(x)) AS weightedScore
ORDER BY weightedScore DESC;
Hmmm, our top ranking article is still a quiz that contains the names of lots of ex Liverpool and Manchester United players. That probably isn’t the most relevant article, so let’s dampen the score by the number of entities that an article has.
We can do this by dividing the weightedScore
by the size of similarArticleEntities
:
// Similarity search whose score is dampened by how many entities the
// candidate article has. For every connecting path, nodeTypes holds the
// flattened list of its nodes' labels other than "Resource". The score sums
// 1.0/size(list) over those lists and divides the total by the number of
// entities of the candidate article.
MATCH
  (source:Article {uri: "https://www.theguardian.com/football/ng-interactive/2020/oct/06/david-squires-on-anarchy-in-the-premier-league-and-extinct-dinosaurs"}),
  (source)-[:HAS_ENTITY]->()-[:ABOUT]->(cat),
  path = (cat)-[:PART_OF*0..2]->()<-[:PART_OF*0..2]-(otherCat),
  (otherCat)<-[:ABOUT]-()<-[:HAS_ENTITY]-(other)
WHERE other <> source
  AND NONE(hop IN nodes(path) WHERE hop:League OR hop:Concept)
WITH
  [(source)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS articlesEntities,
  other.title AS similarArticle,
  other.uri AS uri,
  [(other)-[:HAS_ENTITY]->()-[:ABOUT]->(topic) | topic.name] AS similarArticleEntities,
  count(path) AS weight,
  collect(DISTINCT substring(reduce(text = "", hop IN nodes(path) | text + "--(" + hop.name + ")"), 2)) AS explanation,
  collect(
    reduce(
      flat = [],
      labelList IN [hop IN nodes(path) | [lbl IN labels(hop) WHERE lbl <> "Resource"]] | flat + labelList
    )
  ) AS nodeTypes
RETURN
  articlesEntities,
  similarArticle,
  uri,
  similarArticleEntities,
  weight,
  explanation,
  reduce(score = 0.0, x IN nodeTypes | score + 1.0/size(x)) / size(similarArticleEntities) AS weightedScore
ORDER BY weightedScore DESC;