Commit
Updated country linking, added constraints and indexes
willu47 committed Oct 24, 2024
1 parent c3ceebf commit 5d21415
Showing 7 changed files with 91 additions and 24 deletions.
7 changes: 4 additions & 3 deletions .pre-commit-config.yaml
@@ -2,7 +2,7 @@ exclude: '^docs/conf.py'

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-added-large-files
@@ -21,9 +21,10 @@ repos:
rev: 5.13.2
hooks:
- id: isort
args: ["--profile", "black", "--filter-files"]

- repo: https://github.com/psf/black
rev: 24.8.0
rev: 24.10.0
hooks:
- id: black
language_version: python3
@@ -36,7 +37,7 @@ repos:
# additional_dependencies: [flake8-bugbear]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.11.1 # Use the sha / tag you want to point at
rev: v1.13.0 # Use the sha / tag you want to point at
hooks:
- id: mypy
additional_dependencies: ['types-requests']
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -84,3 +84,6 @@ exclude_lines = [

[tool.black]
line-length=79

[tool.isort]
profile = "black"
66 changes: 50 additions & 16 deletions src/research_index_backend/create_graph.py
@@ -5,6 +5,7 @@
- Units - organisational units, such as work streams, work packages, partners
"""

from abc import ABC
from os.path import join
from typing import Dict
@@ -94,7 +95,9 @@ def __init__(self) -> None:
self.g.bind("dbp", DBP)
PROJECT = URIRef(CCG)
self.g.add((PROJECT, RDF.type, ORG.OrganizationalCollaboration))
self.g.add((PROJECT, SKOS.prefLabel, Literal("Climate Compatible Growth")))
self.g.add(
(PROJECT, SKOS.prefLabel, Literal("Climate Compatible Growth"))
)
for oa in ["oa1", "oa2", "oa3"]:
self.g.add((PROJECT, ORG.hasUnit, CCG[f"unit/{oa}"]))
self.g.add((CCG[f"unit/{oa}"], ORG.unitOf, PROJECT))
@@ -138,8 +141,12 @@ def add_author(row):

def add_author_details(author_id: URIRef, row: pd.DataFrame):
self.g.add((author_id, RDF.type, SDO.Person))
self.g.add((author_id, SDO.givenName, Literal(row["First Name"])))
self.g.add((author_id, SDO.familyName, Literal(row["Last Name"])))
self.g.add(
(author_id, SDO.givenName, Literal(row["First Name"]))
)
self.g.add(
(author_id, SDO.familyName, Literal(row["Last Name"]))
)
self.g.add(
(
author_id,
@@ -306,7 +313,10 @@ def add_papers(self, df):
def add_paper(row):
uuid = row["paper_uuid"]
self.outputs[uuid] = Article(
uuid=uuid, doi=row["DOI"], title=row["title"], abstract=row["Abstract"]
uuid=uuid,
doi=row["DOI"],
title=row["title"],
abstract=row["Abstract"],
).save(self.g)

df.apply(add_paper, axis=1)
@@ -320,7 +330,8 @@ def add_authorship(row):
loaded_output = Article(uuid=paper_uuid).load(db=self.g)

author_of(
_start_node_id=loaded_author._id, _end_node_id=loaded_output._id
_start_node_id=loaded_author._id,
_end_node_id=loaded_output._id,
).save(self.g)

df.apply(add_authorship, axis=1)
@@ -343,7 +354,9 @@ def add_ws_structure(row):
parent = Workstream(id=row["parent"]).load(self.g)
child = Workstream(id=row["child"]).load(self.g)

unit_of(_start_node_id=child._id, _end_node_id=parent._id).save(self.g)
unit_of(_start_node_id=child._id, _end_node_id=parent._id).save(
self.g
)

df.apply(add_ws_structure, axis=1)

@@ -371,15 +384,19 @@ def add_work_package_member(row):
match()
.node(labels="Author", variable="a")
.where(
item="a.orcid", operator=Operator.EQUAL, literal=row["orcid"]
item="a.orcid",
operator=Operator.EQUAL,
literal=row["orcid"],
)
.return_([("a.uuid", "uuid")])
.execute()
)

if results:
author = Author(uuid=results[0]["uuid"]).load(self.g)
member_of(_start_node_id=author._id, _end_node_id=ws._id).save(self.g)
member_of(_start_node_id=author._id, _end_node_id=ws._id).save(
self.g
)
else:
print(f"Could not find {row['name']} in the database")

@@ -424,21 +441,22 @@ def add_affiliation(row):
match()
.node(labels="Author", variable="a")
.where(
item="a.orcid", operator=Operator.EQUAL, literal=row["orcid"]
item="a.orcid",
operator=Operator.EQUAL,
literal=row["orcid"],
)
.return_(results=[("a.uuid", "uuid")])
.execute()
)
if results:
author = Author(uuid=results[0]["uuid"]).load(self.g)
member_of(_start_node_id=author._id, _end_node_id=partner._id).save(
self.g
)
member_of(
_start_node_id=author._id, _end_node_id=partner._id
).save(self.g)

df.apply(add_affiliation, axis=1)

@classmethod
def add_country_relations(graph):
def add_country_relations(self):
query = """
MATCH (c:Country)
CALL {
@@ -451,12 +469,24 @@ def add_country_relations(graph):
}
RETURN r
"""
graph.execute(query)
self.g.execute(query)

def create_constraints(self):
query = [
"CREATE CONSTRAINT ON (n:Output) ASSERT n.doi IS UNIQUE;",
"CREATE CONSTRAINT ON (n:Output) ASSERT n.uuid IS UNIQUE;",
"CREATE CONSTRAINT ON (a:Author) ASSERT a.uuid IS UNIQUE;",
"CREATE CONSTRAINT ON (a:Author) ASSERT a.orcid IS UNIQUE;",
]
for q in query:
self.g.execute(q)


def main(graph: GraphMemGraph):
"""Create the graph of authors and papers"""
work_streams = pd.read_excel("project_partners.xlsx", sheet_name="workstream")
work_streams = pd.read_excel(
"project_partners.xlsx", sheet_name="workstream"
)
graph.add_work_streams(work_streams)

structure = pd.read_excel("project_partners.xlsx", sheet_name="subws")
@@ -486,6 +516,7 @@ def main(graph: GraphMemGraph):
graph.add_countries(df)

graph.add_country_relations()
graph.create_constraints()

return graph.g

@@ -525,3 +556,6 @@ def load_initial_data(graph: Memgraph, file_path: str):

df = pd.read_csv(join(file_path, "countries.csv"), quotechar='"')
memgraph.add_countries(df)

memgraph.add_country_relations()
memgraph.create_constraints()
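
The new `create_constraints` method adds Memgraph uniqueness constraints on `Output.doi`, `Output.uuid`, `Author.uuid` and `Author.orcid`, and `load_initial_data` now calls it (along with `add_country_relations`) once the CSV files are loaded. A minimal sketch, assuming a local Memgraph instance on the default port, of checking that the constraints are in place:

```python
from gqlalchemy import Memgraph

# Connection details are assumptions; adjust to your deployment.
db = Memgraph(host="127.0.0.1", port=7687)

# SHOW CONSTRAINT INFO lists every constraint in the database as
# (constraint type, label, properties) rows, so the four uniqueness
# constraints created by create_constraints() should appear here.
for row in db.execute_and_fetch("SHOW CONSTRAINT INFO;"):
    print(row)

# With the constraints in place, re-importing an Output whose DOI or
# uuid already exists is rejected by Memgraph instead of silently
# creating a duplicate node.
```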
20 changes: 17 additions & 3 deletions src/research_index_backend/create_graph_from_doi.py
@@ -104,8 +104,7 @@ def get_personal_token():


def get_output_metadata(
session: requests_cache.CachedSession, doi: str,
source: str = "OpenAire"
session: requests_cache.CachedSession, doi: str, source: str = "OpenAire"
) -> Dict:
"""Request metadata from OpenAire Graph
@@ -386,14 +385,29 @@ def add_country_relations(graph: Memgraph):
"""
graph.execute(query)

query = """
MATCH (c:Country)
CALL {
WITH c
MATCH (o:Output)
WHERE o.title CONTAINS c.name
AND NOT exists((o:Output)-[:REFERS_TO]->(c:Country))
CREATE (o)-[r:REFERS_TO]->(c)
RETURN r
LIMIT 1
}
RETURN r
"""
graph.execute(query)


def add_indexes(graph: Memgraph):
queries = [
"CREATE INDEX ON :Country(id);",
"CREATE INDEX ON :Author(uuid);",
"CREATE INDEX ON :Article(uuid);",
"CREATE INDEX ON :Article(result_type);",
"CREATE EDGE INDEX ON :author_of(rank);",
# "CREATE EDGE INDEX ON :author_of(rank);",
"ANALYZE GRAPH;",
]
for query in queries:
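
Alongside the existing country query, `add_country_relations` in this module gains a second pass that links any `Output` whose title contains a country's name via `REFERS_TO`, skipping pairs that are already connected, and `add_indexes` now ends with `ANALYZE GRAPH` while the edge index on `author_of(rank)` is commented out. A short sketch, assuming the same local Memgraph connection, of inspecting the result:

```python
from gqlalchemy import Memgraph

db = Memgraph(host="127.0.0.1", port=7687)  # assumed local instance

# Count REFERS_TO edges per country to see what the linking pass produced.
refers_to = """
MATCH (:Output)-[r:REFERS_TO]->(c:Country)
RETURN c.name AS country, count(r) AS n_outputs
ORDER BY n_outputs DESC
LIMIT 10;
"""
for row in db.execute_and_fetch(refers_to):
    print(row["country"], row["n_outputs"])

# SHOW INDEX INFO lists the label/property indexes created by
# add_indexes(); ANALYZE GRAPH only refreshes planner statistics,
# so it produces no entry of its own here.
for row in db.execute_and_fetch("SHOW INDEX INFO;"):
    print(row)
```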
2 changes: 1 addition & 1 deletion src/research_index_backend/get_metadata.py
@@ -86,7 +86,7 @@ def get_metadata_from_openalex(session, doi):
except JSONDecodeError as ex:
logger.error(str(ex))
except requests.exceptions.HTTPError as err:
print(str(err))
logger.error(str(err))

if response.json():
return response.json()
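
Replacing `print` with `logger.error` means failed OpenAlex requests are reported through the module's logger, which is only visible once logging is configured. A minimal sketch (the format and level are assumptions, not part of this commit) of enabling that at the application entry point:

```python
import logging

# Send records from module loggers such as
# research_index_backend.get_metadata to stderr, so logger.error(...)
# calls raised by HTTP errors actually appear in the output.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s %(message)s",
)
```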
7 changes: 7 additions & 0 deletions src/research_index_backend/models.py
@@ -7,6 +7,7 @@
"""

from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

from gqlalchemy import Node, Relationship
@@ -36,6 +37,9 @@ class ArticleMetadata:
result_type: Optional[str]
resource_type: Optional[str]
openalex: Optional[str]
cited_by_count: Optional[int]
cited_by_count_date: Optional[datetime]
counts_by_year: Optional[dict]


class Author(Node):
@@ -70,6 +74,9 @@ class Article(Output):
result_type: Optional[str]
resource_type: Optional[str]
openalex: Optional[str]
cited_by_count: Optional[int]
cited_by_count_date: Optional[datetime]
counts_by_year: Optional[dict]


class author_of(Relationship):
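
Both `ArticleMetadata` and the `Article` node gain three optional citation fields: `cited_by_count`, `cited_by_count_date` and `counts_by_year`. A minimal sketch of saving an `Article` that carries the new fields (the uuid, DOI and counts are illustrative, and `save` needs a running Memgraph instance):

```python
from datetime import datetime

from gqlalchemy import Memgraph

from research_index_backend.models import Article

db = Memgraph(host="127.0.0.1", port=7687)  # assumed local instance

article = Article(
    uuid="00000000-0000-0000-0000-000000000000",  # illustrative uuid
    doi="10.1234/example",  # illustrative DOI
    title="Example output",
    cited_by_count=12,  # citations reported by OpenAlex
    cited_by_count_date=datetime.today(),  # when the count was recorded
    counts_by_year={"2023": 5, "2024": 7},  # illustrative per-year counts
)
article.save(db)
```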
10 changes: 9 additions & 1 deletion src/research_index_backend/parser.py
@@ -1,3 +1,4 @@
from datetime import datetime
from logging import getLogger
from typing import Dict, List

@@ -160,6 +161,10 @@ def parse_metadata(
resource_type["@schemeid"] == "dnet:result_typologies"
):
resource_type = resource_type.get("@classname")
elif resource_type and (
resource_type["@schemeid"] == "dnet:publication_resource"
):
resource_type = resource_type.get("@classname")
else:
logger.debug(
f"Could not identify instance type from {resource_type}"
@@ -194,7 +199,10 @@ def parse_metadata(
publisher,
result_type,
resource_type,
openalex_metadata["id"],
openalex_metadata.get("id"),
openalex_metadata.get("cited_by_count"),
datetime.today(),
None,
)
articles_metadata.append(article_object)

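
The parser now reads the OpenAlex identifier with `.get()` rather than indexing, so a record without that key yields `None` instead of raising `KeyError`, and it stores `cited_by_count` together with today's date while leaving `counts_by_year` unset. A small sketch of the same pattern on a hypothetical OpenAlex response:

```python
from datetime import datetime

# Hypothetical OpenAlex record that lacks a citation count.
openalex_metadata = {"id": "https://openalex.org/W0000000000"}

openalex_id = openalex_metadata.get("id")  # the OpenAlex URL, or None
cited_by_count = openalex_metadata.get("cited_by_count")  # None, not a KeyError
cited_by_count_date = datetime.today()  # date the count was retrieved
counts_by_year = None  # not populated by this commit

print(openalex_id, cited_by_count, cited_by_count_date, counts_by_year)
```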
