From 5d21415dae146efa68e85ff2466e4412e065a127 Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Thu, 24 Oct 2024 11:50:39 +0200
Subject: [PATCH] Updated country linking, added constraints and indexes

---
 .pre-commit-config.yaml                     |  7 +-
 pyproject.toml                              |  3 +
 src/research_index_backend/create_graph.py  | 66 ++++++++++++++-----
 .../create_graph_from_doi.py                | 20 +++++-
 src/research_index_backend/get_metadata.py  |  2 +-
 src/research_index_backend/models.py        |  7 ++
 src/research_index_backend/parser.py        | 10 ++-
 7 files changed, 91 insertions(+), 24 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8f25e18..330e857 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 exclude: '^docs/conf.py'
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v5.0.0
   hooks:
   - id: trailing-whitespace
   - id: check-added-large-files
@@ -21,9 +21,10 @@
   rev: 5.13.2
   hooks:
   - id: isort
+    args: ["--profile", "black", "--filter-files"]

 - repo: https://github.com/psf/black
-  rev: 24.8.0
+  rev: 24.10.0
   hooks:
   - id: black
     language_version: python3
@@ -36,7 +37,7 @@
 #   additional_dependencies: [flake8-bugbear]

 - repo: https://github.com/pre-commit/mirrors-mypy
-  rev: v1.11.1  # Use the sha / tag you want to point at
+  rev: v1.13.0  # Use the sha / tag you want to point at
   hooks:
   - id: mypy
     additional_dependencies: ['types-requests']
diff --git a/pyproject.toml b/pyproject.toml
index 0e63cae..e992e86 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,3 +84,6 @@ exclude_lines = [

 [tool.black]
 line-length=79
+
+[tool.isort]
+profile = "black"
diff --git a/src/research_index_backend/create_graph.py b/src/research_index_backend/create_graph.py
index cd74182..3b2376c 100644
--- a/src/research_index_backend/create_graph.py
+++ b/src/research_index_backend/create_graph.py
@@ -5,6 +5,7 @@
 - Units - organisational units, such as work streams, work packages, partners

 """
+
 from abc import ABC
 from os.path import join
 from typing import Dict
@@ -94,7 +95,9 @@ def __init__(self) -> None:
         self.g.bind("dbp", DBP)
         PROJECT = URIRef(CCG)
         self.g.add((PROJECT, RDF.type, ORG.OrganizationalCollaboration))
-        self.g.add((PROJECT, SKOS.prefLabel, Literal("Climate Compatible Growth")))
+        self.g.add(
+            (PROJECT, SKOS.prefLabel, Literal("Climate Compatible Growth"))
+        )
         for oa in ["oa1", "oa2", "oa3"]:
             self.g.add((PROJECT, ORG.hasUnit, CCG[f"unit/{oa}"]))
             self.g.add((CCG[f"unit/{oa}"], ORG.unitOf, PROJECT))
@@ -138,8 +141,12 @@ def add_author(row):

         def add_author_details(author_id: URIRef, row: pd.DataFrame):
             self.g.add((author_id, RDF.type, SDO.Person))
-            self.g.add((author_id, SDO.givenName, Literal(row["First Name"])))
-            self.g.add((author_id, SDO.familyName, Literal(row["Last Name"])))
+            self.g.add(
+                (author_id, SDO.givenName, Literal(row["First Name"]))
+            )
+            self.g.add(
+                (author_id, SDO.familyName, Literal(row["Last Name"]))
+            )
             self.g.add(
                 (
                     author_id,
@@ -306,7 +313,10 @@ def add_papers(self, df):
         def add_paper(row):
             uuid = row["paper_uuid"]
             self.outputs[uuid] = Article(
-                uuid=uuid, doi=row["DOI"], title=row["title"], abstract=row["Abstract"]
+                uuid=uuid,
+                doi=row["DOI"],
+                title=row["title"],
+                abstract=row["Abstract"],
             ).save(self.g)

         df.apply(add_paper, axis=1)
@@ -320,7 +330,8 @@ def add_authorship(row):
             loaded_output = Article(uuid=paper_uuid).load(db=self.g)

             author_of(
-                _start_node_id=loaded_author._id, _end_node_id=loaded_output._id
+                _start_node_id=loaded_author._id,
+                _end_node_id=loaded_output._id,
             ).save(self.g)

         df.apply(add_authorship, axis=1)
@@ -343,7 +354,9 @@ def add_ws_structure(row):
             parent = Workstream(id=row["parent"]).load(self.g)
             child = Workstream(id=row["child"]).load(self.g)

-            unit_of(_start_node_id=child._id, _end_node_id=parent._id).save(self.g)
+            unit_of(_start_node_id=child._id, _end_node_id=parent._id).save(
+                self.g
+            )

         df.apply(add_ws_structure, axis=1)
@@ -371,7 +384,9 @@ def add_work_package_member(row):
                 match()
                 .node(labels="Author", variable="a")
                 .where(
-                    item="a.orcid", operator=Operator.EQUAL, literal=row["orcid"]
+                    item="a.orcid",
+                    operator=Operator.EQUAL,
+                    literal=row["orcid"],
                 )
                 .return_([("a.uuid", "uuid")])
                 .execute()
@@ -379,7 +394,9 @@
             )

             if results:
                 author = Author(uuid=results[0]["uuid"]).load(self.g)
-                member_of(_start_node_id=author._id, _end_node_id=ws._id).save(self.g)
+                member_of(_start_node_id=author._id, _end_node_id=ws._id).save(
+                    self.g
+                )
             else:
                 print(f"Could not find {row['name']} in the database")
@@ -424,21 +441,22 @@ def add_affiliation(row):
                 match()
                 .node(labels="Author", variable="a")
                 .where(
-                    item="a.orcid", operator=Operator.EQUAL, literal=row["orcid"]
+                    item="a.orcid",
+                    operator=Operator.EQUAL,
+                    literal=row["orcid"],
                 )
                 .return_(results=[("a.uuid", "uuid")])
                 .execute()
             )
             if results:
                 author = Author(uuid=results[0]["uuid"]).load(self.g)
-                member_of(_start_node_id=author._id, _end_node_id=partner._id).save(
-                    self.g
-                )
+                member_of(
+                    _start_node_id=author._id, _end_node_id=partner._id
+                ).save(self.g)

         df.apply(add_affiliation, axis=1)

-    @classmethod
-    def add_country_relations(graph):
+    def add_country_relations(self):
         query = """
         MATCH (c:Country)
         CALL {
@@ -451,12 +469,24 @@
         }
         RETURN r
         """
-        graph.execute(query)
+        self.g.execute(query)
+
+    def create_constraints(self):
+        query = [
+            "CREATE CONSTRAINT ON (n:Output) ASSERT n.doi IS UNIQUE;",
+            "CREATE CONSTRAINT ON (n:Output) ASSERT n.uuid IS UNIQUE;",
+            "CREATE CONSTRAINT ON (a:Author) ASSERT a.uuid IS UNIQUE;",
+            "CREATE CONSTRAINT ON (a:Author) ASSERT a.orcid IS UNIQUE;",
+        ]
+        for q in query:
+            self.g.execute(q)


 def main(graph: GraphMemGraph):
     """Create the graph of authors and papers"""
-    work_streams = pd.read_excel("project_partners.xlsx", sheet_name="workstream")
+    work_streams = pd.read_excel(
+        "project_partners.xlsx", sheet_name="workstream"
+    )
     graph.add_work_streams(work_streams)

     structure = pd.read_excel("project_partners.xlsx", sheet_name="subws")
@@ -486,6 +516,7 @@ def main(graph: GraphMemGraph):
     graph.add_countries(df)

     graph.add_country_relations()
+    graph.create_constraints()

     return graph.g
@@ -525,3 +556,6 @@ def load_initial_data(graph: Memgraph, file_path: str):

     df = pd.read_csv(join(file_path, "countries.csv"), quotechar='"')
     memgraph.add_countries(df)
+
+    memgraph.add_country_relations()
+    memgraph.create_constraints()
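Note on the new create_constraints method: Memgraph's CREATE CONSTRAINT ... ASSERT ... IS UNIQUE statements make the database reject duplicate nodes at write time, and creating a constraint fails if duplicates already exist, which is presumably why both call sites run it after the initial data load. A minimal sketch of the behaviour (illustrative, not part of the patch; it assumes a local Memgraph instance on the default port):

    from gqlalchemy import Memgraph

    db = Memgraph(host="127.0.0.1", port=7687)  # assumed local instance

    # Mirrors one of the statements issued by create_constraints()
    db.execute("CREATE CONSTRAINT ON (a:Author) ASSERT a.orcid IS UNIQUE;")
    db.execute("CREATE (:Author {orcid: '0000-0002-1825-0097'});")
    try:
        # A second Author with the same orcid now violates the constraint
        db.execute("CREATE (:Author {orcid: '0000-0002-1825-0097'});")
    except Exception as err:
        print(f"Duplicate rejected: {err}")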
diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py
index 0b100db..9757bb3 100644
--- a/src/research_index_backend/create_graph_from_doi.py
+++ b/src/research_index_backend/create_graph_from_doi.py
@@ -104,8 +104,7 @@ def get_personal_token():


 def get_output_metadata(
-    session: requests_cache.CachedSession, doi: str,
-    source: str = "OpenAire"
+    session: requests_cache.CachedSession, doi: str, source: str = "OpenAire"
 ) -> Dict:
     """Request metadata from OpenAire Graph

@@ -386,6 +385,21 @@ def add_country_relations(graph: Memgraph):
     """
     graph.execute(query)

+    query = """
+    MATCH (c:Country)
+    CALL {
+        WITH c
+        MATCH (o:Output)
+        WHERE o.title CONTAINS c.name
+        AND NOT exists((o:Output)-[:REFERS_TO]->(c:Country))
+        CREATE (o)-[r:REFERS_TO]->(c)
+        RETURN r
+        LIMIT 1
+    }
+    RETURN r
+    """
+    graph.execute(query)
+

 def add_indexes(graph: Memgraph):
     queries = [
@@ -393,7 +407,7 @@ def add_indexes(graph: Memgraph):
         "CREATE INDEX ON :Output(uuid);",
         "CREATE INDEX ON :Author(uuid);",
         "CREATE INDEX ON :Article(uuid);",
         "CREATE INDEX ON :Article(result_type);",
-        "CREATE EDGE INDEX ON :author_of(rank);",
+        # "CREATE EDGE INDEX ON :author_of(rank);",
         "ANALYZE GRAPH;",
     ]
     for query in queries:
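The added REFERS_TO query mirrors the existing one above it (whose body is elided by the hunk): a CALL subquery runs per Country node, matches Output nodes whose title contains the country name, and creates the relation only where none exists yet, so repeated runs do not duplicate edges. A rough smoke test (illustrative, not part of the patch; the import path and connection details are assumptions):

    from gqlalchemy import Memgraph

    from research_index_backend.create_graph_from_doi import (
        add_country_relations,
    )

    db = Memgraph(host="127.0.0.1", port=7687)  # assumed local instance

    # Illustrative fixture: one country and one output whose title matches
    db.execute("CREATE (:Country {name: 'Kenya'});")
    db.execute("CREATE (:Output {title: 'Energy pathways for Kenya'});")

    add_country_relations(db)  # runs both linking queries

    rows = db.execute_and_fetch(
        "MATCH (o:Output)-[:REFERS_TO]->(c:Country) "
        "RETURN o.title AS title, c.name AS name;"
    )
    for row in rows:
        print(row["title"], "->", row["name"])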
diff --git a/src/research_index_backend/get_metadata.py b/src/research_index_backend/get_metadata.py
index 141d044..f68a97c 100644
--- a/src/research_index_backend/get_metadata.py
+++ b/src/research_index_backend/get_metadata.py
@@ -86,7 +86,7 @@ def get_metadata_from_openalex(session, doi):
     except JSONDecodeError as ex:
         logger.error(str(ex))
     except requests.exceptions.HTTPError as err:
-        print(str(err))
+        logger.error(str(err))

     if response.json():
         return response.json()
diff --git a/src/research_index_backend/models.py b/src/research_index_backend/models.py
index e7cfb58..baf5ba2 100644
--- a/src/research_index_backend/models.py
+++ b/src/research_index_backend/models.py
@@ -7,6 +7,7 @@
 """

 from dataclasses import dataclass
+from datetime import datetime
 from typing import List, Optional

 from gqlalchemy import Node, Relationship
@@ -36,6 +37,9 @@ class ArticleMetadata:
     result_type: Optional[str]
     resource_type: Optional[str]
     openalex: Optional[str]
+    cited_by_count: Optional[int]
+    cited_by_count_date: Optional[datetime]
+    counts_by_year: Optional[dict]


 class Author(Node):
@@ -70,6 +74,9 @@ class Article(Output):
     result_type: Optional[str]
     resource_type: Optional[str]
     openalex: Optional[str]
+    cited_by_count: Optional[int]
+    cited_by_count_date: Optional[datetime]
+    counts_by_year: Optional[dict]


 class author_of(Relationship):
diff --git a/src/research_index_backend/parser.py b/src/research_index_backend/parser.py
index c330483..55e259f 100644
--- a/src/research_index_backend/parser.py
+++ b/src/research_index_backend/parser.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 from logging import getLogger
 from typing import Dict, List

@@ -160,6 +161,10 @@ def parse_metadata(
                 resource_type["@schemeid"] == "dnet:result_typologies"
             ):
                 resource_type = resource_type.get("@classname")
+            elif resource_type and (
+                resource_type["@schemeid"] == "dnet:publication_resource"
+            ):
+                resource_type = resource_type.get("@classname")
             else:
                 logger.debug(
                     f"Could not identify instance type from {resource_type}"
@@ -194,7 +199,10 @@ def parse_metadata(
             publisher,
             result_type,
             resource_type,
-            openalex_metadata["id"],
+            openalex_metadata.get("id"),
+            openalex_metadata.get("cited_by_count"),
+            datetime.today(),
+            None,
         )

         articles_metadata.append(article_object)
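The three new fields store a citation snapshot: cited_by_count is read from the OpenAlex work record, cited_by_count_date records when the count was taken (datetime.today() at parse time), and counts_by_year is added to the models but left as None for now. Switching from openalex_metadata["id"] to .get() also means a record missing those keys yields None instead of raising KeyError. A small sketch of how the tail of parse_metadata maps an OpenAlex record onto the new fields (the response fragment is illustrative):

    from datetime import datetime

    # Illustrative fragment of an OpenAlex work record
    openalex_metadata = {
        "id": "https://openalex.org/W2741809807",
        "cited_by_count": 42,
        "counts_by_year": [{"year": 2024, "cited_by_count": 7}],
    }

    # .get() returns None for missing keys instead of raising KeyError
    openalex = openalex_metadata.get("id")
    cited_by_count = openalex_metadata.get("cited_by_count")
    cited_by_count_date = datetime.today()  # snapshot date for the count
    counts_by_year = None  # reserved; not yet populated by the parser

    print(openalex, cited_by_count, cited_by_count_date, counts_by_year)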