From 5d21415dae146efa68e85ff2466e4412e065a127 Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Thu, 24 Oct 2024 11:50:39 +0200
Subject: [PATCH] Updated country linking, added constraints and indexes

---
 .pre-commit-config.yaml                     |  7 +-
 pyproject.toml                              |  3 +
 src/research_index_backend/create_graph.py  | 66 ++++++++++++++-----
 .../create_graph_from_doi.py                | 20 +++++-
 src/research_index_backend/get_metadata.py  |  2 +-
 src/research_index_backend/models.py        |  7 ++
 src/research_index_backend/parser.py        | 10 ++-
 7 files changed, 91 insertions(+), 24 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8f25e18..330e857 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 exclude: '^docs/conf.py'
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v5.0.0
   hooks:
   - id: trailing-whitespace
   - id: check-added-large-files
@@ -21,9 +21,10 @@
   rev: 5.13.2
   hooks:
   - id: isort
+    args: ["--profile", "black", "--filter-files"]

 - repo: https://github.com/psf/black
-  rev: 24.8.0
+  rev: 24.10.0
   hooks:
   - id: black
     language_version: python3
@@ -36,7 +37,7 @@
 #   additional_dependencies: [flake8-bugbear]

 - repo: https://github.com/pre-commit/mirrors-mypy
-  rev: v1.11.1  # Use the sha / tag you want to point at
+  rev: v1.13.0  # Use the sha / tag you want to point at
   hooks:
   - id: mypy
     additional_dependencies: ['types-requests']
diff --git a/pyproject.toml b/pyproject.toml
index 0e63cae..e992e86 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,3 +84,6 @@ exclude_lines = [

 [tool.black]
 line-length=79
+
+[tool.isort]
+profile = "black"
diff --git a/src/research_index_backend/create_graph.py b/src/research_index_backend/create_graph.py
index cd74182..3b2376c 100644
--- a/src/research_index_backend/create_graph.py
+++ b/src/research_index_backend/create_graph.py
@@ -5,6 +5,7 @@
 - Units - organisational units, such as work streams, work packages, partners

 """
+
 from abc import ABC
 from os.path import join
 from typing import Dict
@@ -94,7 +95,9 @@ def __init__(self) -> None:
         self.g.bind("dbp", DBP)
         PROJECT = URIRef(CCG)
         self.g.add((PROJECT, RDF.type, ORG.OrganizationalCollaboration))
-        self.g.add((PROJECT, SKOS.prefLabel, Literal("Climate Compatible Growth")))
+        self.g.add(
+            (PROJECT, SKOS.prefLabel, Literal("Climate Compatible Growth"))
+        )
         for oa in ["oa1", "oa2", "oa3"]:
             self.g.add((PROJECT, ORG.hasUnit, CCG[f"unit/{oa}"]))
             self.g.add((CCG[f"unit/{oa}"], ORG.unitOf, PROJECT))
@@ -138,8 +141,12 @@ def add_author(row):

         def add_author_details(author_id: URIRef, row: pd.DataFrame):
             self.g.add((author_id, RDF.type, SDO.Person))
-            self.g.add((author_id, SDO.givenName, Literal(row["First Name"])))
-            self.g.add((author_id, SDO.familyName, Literal(row["Last Name"])))
+            self.g.add(
+                (author_id, SDO.givenName, Literal(row["First Name"]))
+            )
+            self.g.add(
+                (author_id, SDO.familyName, Literal(row["Last Name"]))
+            )
             self.g.add(
                 (
                     author_id,
@@ -306,7 +313,10 @@ def add_papers(self, df):
         def add_paper(row):
             uuid = row["paper_uuid"]
             self.outputs[uuid] = Article(
-                uuid=uuid, doi=row["DOI"], title=row["title"], abstract=row["Abstract"]
+                uuid=uuid,
+                doi=row["DOI"],
+                title=row["title"],
+                abstract=row["Abstract"],
             ).save(self.g)

         df.apply(add_paper, axis=1)
@@ -320,7 +330,8 @@ def add_authorship(row):
             loaded_output = Article(uuid=paper_uuid).load(db=self.g)

             author_of(
-                _start_node_id=loaded_author._id, _end_node_id=loaded_output._id
+                _start_node_id=loaded_author._id,
+                _end_node_id=loaded_output._id,
             ).save(self.g)

         df.apply(add_authorship, axis=1)
@@ -343,7 +354,9 @@ def add_ws_structure(row):
             parent = Workstream(id=row["parent"]).load(self.g)
             child = Workstream(id=row["child"]).load(self.g)

-            unit_of(_start_node_id=child._id, _end_node_id=parent._id).save(self.g)
+            unit_of(_start_node_id=child._id, _end_node_id=parent._id).save(
+                self.g
+            )

         df.apply(add_ws_structure, axis=1)
@@ -371,7 +384,9 @@ def add_work_package_member(row):
                 match()
                 .node(labels="Author", variable="a")
                 .where(
-                    item="a.orcid", operator=Operator.EQUAL, literal=row["orcid"]
+                    item="a.orcid",
+                    operator=Operator.EQUAL,
+                    literal=row["orcid"],
                 )
                 .return_([("a.uuid", "uuid")])
                 .execute()
@@ -379,7 +394,9 @@
             )

             if results:
                 author = Author(uuid=results[0]["uuid"]).load(self.g)
-                member_of(_start_node_id=author._id, _end_node_id=ws._id).save(self.g)
+                member_of(_start_node_id=author._id, _end_node_id=ws._id).save(
+                    self.g
+                )
             else:
                 print(f"Could not find {row['name']} in the database")
@@ -424,21 +441,22 @@ def add_affiliation(row):
                 match()
                 .node(labels="Author", variable="a")
                 .where(
-                    item="a.orcid", operator=Operator.EQUAL, literal=row["orcid"]
+                    item="a.orcid",
+                    operator=Operator.EQUAL,
+                    literal=row["orcid"],
                 )
                 .return_(results=[("a.uuid", "uuid")])
                 .execute()
             )
             if results:
                 author = Author(uuid=results[0]["uuid"]).load(self.g)
-                member_of(_start_node_id=author._id, _end_node_id=partner._id).save(
-                    self.g
-                )
+                member_of(
+                    _start_node_id=author._id, _end_node_id=partner._id
+                ).save(self.g)

         df.apply(add_affiliation, axis=1)

-    @classmethod
-    def add_country_relations(graph):
+    def add_country_relations(self):
         query = """
         MATCH (c:Country)
         CALL {
@@ -451,12 +469,24 @@
         }
         RETURN r
         """
-        graph.execute(query)
+        self.g.execute(query)
+
+    def create_constraints(self):
+        query = [
+            "CREATE CONSTRAINT ON (n:Output) ASSERT n.doi IS UNIQUE;",
+            "CREATE CONSTRAINT ON (n:Output) ASSERT n.uuid IS UNIQUE;",
+            "CREATE CONSTRAINT ON (a:Author) ASSERT a.uuid IS UNIQUE;",
+            "CREATE CONSTRAINT ON (a:Author) ASSERT a.orcid IS UNIQUE;",
+        ]
+        for q in query:
+            self.g.execute(q)


 def main(graph: GraphMemGraph):
     """Create the graph of authors and papers"""
-    work_streams = pd.read_excel("project_partners.xlsx", sheet_name="workstream")
+    work_streams = pd.read_excel(
+        "project_partners.xlsx", sheet_name="workstream"
+    )
     graph.add_work_streams(work_streams)

     structure = pd.read_excel("project_partners.xlsx", sheet_name="subws")
@@ -486,6 +516,7 @@ def main(graph: GraphMemGraph):
     graph.add_countries(df)

     graph.add_country_relations()
+    graph.create_constraints()

     return graph.g
@@ -525,3 +556,6 @@ def load_initial_data(graph: Memgraph, file_path: str):

     df = pd.read_csv(join(file_path, "countries.csv"), quotechar='"')
     memgraph.add_countries(df)
+
+    memgraph.add_country_relations()
+    memgraph.create_constraints()
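Note on the new create_constraints method: Memgraph's CREATE CONSTRAINT ... ASSERT ... IS UNIQUE statements make the database reject duplicate nodes at write time, and creating a constraint fails if duplicates already exist, which is presumably why both call sites run it after the initial data load. A minimal sketch of the behaviour (illustrative, not part of the patch; it assumes a local Memgraph instance on the default port):

    from gqlalchemy import Memgraph

    db = Memgraph(host="127.0.0.1", port=7687)  # assumed local instance

    # Mirrors one of the statements issued by create_constraints()
    db.execute("CREATE CONSTRAINT ON (a:Author) ASSERT a.orcid IS UNIQUE;")
    db.execute("CREATE (:Author {orcid: '0000-0002-1825-0097'});")
    try:
        # A second Author with the same orcid now violates the constraint
        db.execute("CREATE (:Author {orcid: '0000-0002-1825-0097'});")
    except Exception as err:
        print(f"Duplicate rejected: {err}")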
diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py
index 0b100db..9757bb3 100644
--- a/src/research_index_backend/create_graph_from_doi.py
+++ b/src/research_index_backend/create_graph_from_doi.py
@@ -104,8 +104,7 @@ def get_personal_token():


 def get_output_metadata(
-    session: requests_cache.CachedSession, doi: str,
-    source: str = "OpenAire"
+    session: requests_cache.CachedSession, doi: str, source: str = "OpenAire"
 ) -> Dict:
     """Request metadata from OpenAire Graph

@@ -386,6 +385,21 @@ def add_country_relations(graph: Memgraph):
     """
     graph.execute(query)

+    query = """
+    MATCH (c:Country)
+    CALL {
+        WITH c
+        MATCH (o:Output)
+        WHERE o.title CONTAINS c.name
+        AND NOT exists((o:Output)-[:REFERS_TO]->(c:Country))
+        CREATE (o)-[r:REFERS_TO]->(c)
+        RETURN r
+        LIMIT 1
+    }
+    RETURN r
+    """
+    graph.execute(query)
+

 def add_indexes(graph: Memgraph):
     queries = [
@@ -393,7 +407,7 @@ def add_indexes(graph: Memgraph):
         "CREATE INDEX ON :Output(uuid);",
         "CREATE INDEX ON :Author(uuid);",
         "CREATE INDEX ON :Article(uuid);",
         "CREATE INDEX ON :Article(result_type);",
-        "CREATE EDGE INDEX ON :author_of(rank);",
+        # "CREATE EDGE INDEX ON :author_of(rank);",
         "ANALYZE GRAPH;",
     ]
     for query in queries:
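The added REFERS_TO query mirrors the existing one above it (whose body is elided by the hunk): a CALL subquery runs per Country node, matches Output nodes whose title contains the country name, and creates the relation only where none exists yet, so repeated runs do not duplicate edges. A rough smoke test (illustrative, not part of the patch; the import path and connection details are assumptions):

    from gqlalchemy import Memgraph

    from research_index_backend.create_graph_from_doi import (
        add_country_relations,
    )

    db = Memgraph(host="127.0.0.1", port=7687)  # assumed local instance

    # Illustrative fixture: one country and one output whose title matches
    db.execute("CREATE (:Country {name: 'Kenya'});")
    db.execute("CREATE (:Output {title: 'Energy pathways for Kenya'});")

    add_country_relations(db)  # runs both linking queries

    rows = db.execute_and_fetch(
        "MATCH (o:Output)-[:REFERS_TO]->(c:Country) "
        "RETURN o.title AS title, c.name AS name;"
    )
    for row in rows:
        print(row["title"], "->", row["name"])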
diff --git a/src/research_index_backend/get_metadata.py b/src/research_index_backend/get_metadata.py
index 141d044..f68a97c 100644
--- a/src/research_index_backend/get_metadata.py
+++ b/src/research_index_backend/get_metadata.py
@@ -86,7 +86,7 @@ def get_metadata_from_openalex(session, doi):
     except JSONDecodeError as ex:
         logger.error(str(ex))
     except requests.exceptions.HTTPError as err:
-        print(str(err))
+        logger.error(str(err))

     if response.json():
         return response.json()
diff --git a/src/research_index_backend/models.py b/src/research_index_backend/models.py
index e7cfb58..baf5ba2 100644
--- a/src/research_index_backend/models.py
+++ b/src/research_index_backend/models.py
@@ -7,6 +7,7 @@
 """

 from dataclasses import dataclass
+from datetime import datetime
 from typing import List, Optional

 from gqlalchemy import Node, Relationship
@@ -36,6 +37,9 @@ class ArticleMetadata:
     result_type: Optional[str]
     resource_type: Optional[str]
     openalex: Optional[str]
+    cited_by_count: Optional[int]
+    cited_by_count_date: Optional[datetime]
+    counts_by_year: Optional[dict]


 class Author(Node):
@@ -70,6 +74,9 @@ class Article(Output):
     result_type: Optional[str]
     resource_type: Optional[str]
     openalex: Optional[str]
+    cited_by_count: Optional[int]
+    cited_by_count_date: Optional[datetime]
+    counts_by_year: Optional[dict]


 class author_of(Relationship):
diff --git a/src/research_index_backend/parser.py b/src/research_index_backend/parser.py
index c330483..55e259f 100644
--- a/src/research_index_backend/parser.py
+++ b/src/research_index_backend/parser.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 from logging import getLogger
 from typing import Dict, List

@@ -160,6 +161,10 @@ def parse_metadata(
                 resource_type["@schemeid"] == "dnet:result_typologies"
             ):
                 resource_type = resource_type.get("@classname")
+            elif resource_type and (
+                resource_type["@schemeid"] == "dnet:publication_resource"
+            ):
+                resource_type = resource_type.get("@classname")
             else:
                 logger.debug(
                     f"Could not identify instance type from {resource_type}"
@@ -194,7 +199,10 @@ def parse_metadata(
             publisher,
             result_type,
             resource_type,
-            openalex_metadata["id"],
+            openalex_metadata.get("id"),
+            openalex_metadata.get("cited_by_count"),
+            datetime.today(),
+            None,
         )

         articles_metadata.append(article_object)
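The three new fields store a citation snapshot: cited_by_count is read from the OpenAlex work record, cited_by_count_date records when the count was taken (datetime.today() at parse time), and counts_by_year is added to the models but left as None for now. Switching from openalex_metadata["id"] to .get() also means a record missing those keys yields None instead of raising KeyError. A small sketch of how the tail of parse_metadata maps an OpenAlex record onto the new fields (the response fragment is illustrative):

    from datetime import datetime

    # Illustrative fragment of an OpenAlex work record
    openalex_metadata = {
        "id": "https://openalex.org/W2741809807",
        "cited_by_count": 42,
        "counts_by_year": [{"year": 2024, "cited_by_count": 7}],
    }

    # .get() returns None for missing keys instead of raising KeyError
    openalex = openalex_metadata.get("id")
    cited_by_count = openalex_metadata.get("cited_by_count")
    cited_by_count_date = datetime.today()  # snapshot date for the count
    counts_by_year = None  # reserved; not yet populated by the parser

    print(openalex, cited_by_count, cited_by_count_date, counts_by_year)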