Download SciSciNet_NSF and link to mag authors #44

Draft · wants to merge 14 commits into base: main (changes from 3 commits)
93 changes: 93 additions & 0 deletions src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -0,0 +1,93 @@
# Link SciSciNet_Links_NSF table with Paper_Author_Affiliations, Authors, and NSF_Investigator
Owner:

It's not clear to me what the purpose of this function is. It does not seem to change anything in the database? Is it a function to load relevant links that we will use in the future? If so, then maybe we can think to only keep the relevant links directly when writing to the database? Or will we use the links with low similarity later as well?

Collaborator (author):

I haven't written anything to the db here, as I noticed that the command to compare the names does not yet do what I want, so I need to adjust it first. What I want to do is just compare the names and only keep the names that match; it just needs to be discussed how strict this should be.
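To make the "how strict" question concrete, here is a minimal, hypothetical sketch of similarity-based name matching using Python's stdlib `difflib` (standing in for the `stringdist` package used in the R script). The names, the scoring function, and the 0.8 cutoff are illustrative only, not the project's actual matching rule:

```python
from difflib import SequenceMatcher

def name_similarity(a: str, b: str) -> float:
    """Return a similarity score in [0, 1] for two names, case-insensitively."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

THRESHOLD = 0.8  # illustrative cutoff, to be discussed

pairs = [
    ("john smith", "john smith"),      # exact match
    ("john smith", "jon smith"),       # minor spelling variant
    ("john smith", "maria gonzalez"),  # unrelated names
]

# Only the first two pairs survive the threshold; the unrelated pair is dropped.
kept = [(a, b) for a, b in pairs if name_similarity(a, b) >= THRESHOLD]
```

A stricter threshold (say 0.95) would drop spelling variants as well; a looser one risks linking unrelated investigators, so the choice is a precision/recall trade-off.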

Collaborator (author):

Sorry for the confusion!

# Keeps only those with link between NSF grant and author ID.
# Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder


# Note: Not sure if calculating string distance now works correctly



packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist")
lapply(packages, library, character.only = TRUE)

datapath <- "/mnt/ssd/"
db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
#sciscinet_path <- paste0(datapath,"sciscinet_data/")


#filepath_nsf=paste0(sciscinet_path,"SciSciNet_Link_NSF.tsv")

con <- DBI::dbConnect(RSQLite::SQLite(), db_file)
cat("The database connection is: \n")
src_dbi(con)

# Create table with all links between NSF-grant and authors via papers

NSF_to_Authors <- tbl(con, sql("
select a.PaperID, a.Type, a.GrantID, b.AuthorId, b.OriginalAuthor
, c.NormalizedName, d.Position, d.FirstName, d.LastName
from scinet_links_nsf as a
inner join (
select PaperId AS PaperID, AuthorId, OriginalAuthor
from PaperAuthorAffiliations
)b
using (PaperID)
inner join (
select AuthorId, NormalizedName
from Authors
) c
using (AuthorId)
inner join (
select GrantID, Position, FirstName, LastName
from NSF_Investigator
) d
using (GrantID)
"))

nsf_to_authors <- collect(NSF_to_Authors)

# Create a variable with the full name from mag
nsf_to_authors$mag_name <- paste(nsf_to_authors$FirstName, nsf_to_authors$LastName, sep = " ")

## Still running, not sure if running correctly from here

### Compare name similarity
# Set a threshold for similarity
threshold <- 0.8

# Calculate string similarity for each row and add a new column.
# stringsim() is vectorized, so no explicit loop over rows is needed.
nsf_to_authors$name_similarity <- stringsim(
  nsf_to_authors$mag_name,
  nsf_to_authors$NormalizedName
)

# Filter observations where the names are above the threshold
similar_names <- nsf_to_authors %>%
filter(name_similarity >= threshold)

# drop unnecessary variables
df <- similar_names %>%
select(GrantID, AuthorId, Position) %>%
distinct()

# Write table to db:
dbWriteTable(con, name = "links_nsf_mag", value = df, overwrite = TRUE)

# close connection to db
DBI::dbDisconnect(con)
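The uncertainty flagged in the script ("Not sure if calculating string distance now works correctly") may partly come from casing: MAG's `NormalizedName` is lower-cased, while `FirstName`/`LastName` from `NSF_Investigator` can keep their original capitalization. A small Python sketch (stdlib `difflib` stands in for `stringdist::stringsim`; the names are hypothetical) shows how unnormalized casing depresses the score:

```python
from difflib import SequenceMatcher

def sim(a: str, b: str) -> float:
    return SequenceMatcher(None, a, b).ratio()

nsf_name = "John Smith"   # hypothetical investigator name with original casing
mag_name = "john smith"   # MAG NormalizedName is lower-cased

raw = sim(nsf_name, mag_name)                          # penalized by case mismatch
normalized = sim(nsf_name.lower(), mag_name.lower())   # identical after lowering
```

Here the same person scores 0.8 raw but 1.0 after lower-casing both sides, so it seems worth normalizing case before computing `name_similarity` in the R script, whatever threshold is chosen.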
97 changes: 97 additions & 0 deletions src/dataprep/main/prep_nsf/scinet_data_to_db.py
@@ -0,0 +1,97 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# %%
"""
Download SciSciNet table SciSciNet_Link_NSF with links between NSF-grants and papers
Upload into db
link to Paper_Author_Affiliations, Authors, NSF_Investigator in R file: link_scinetnsf_to_mag.R in same folder


Create table in database:
- scinet_links_nsf

SciSciNet_Link_NSF schema is:

GrantID TEXT, PaperID INTEGER, Type TEXT


unique index on GrantID and PaperID (multiple PaperIDs per GrantID)
"""

import subprocess
import sqlite3 as sqlite
import argparse
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import re
import sys
import requests

sys.path.append('/home/mona/mag_sample/src/dataprep/')
Owner:

This should not be necessary (and not be in the script). If you have problem when not using this, we can solve it together.

Collaborator (author):

Unfortunately, I cannot make it run when leaving it out; I'd need your help here, please.

Owner:

Here are some instructions: https://github.com/f-hafner/mag_sample/blob/main/README.dev.md
Let me know if it works.

Collaborator (author):

It works, thank you!


from helpers.variables import db_file, datapath, databasepath
from helpers.functions import analyze_db

scinet_path = os.path.join(datapath, "sciscinet_data/")
filepath_nsf = os.path.join(scinet_path, "SciSciNet_Link_NSF.tsv")



# Download file

url_nsf = "https://ndownloader.figstatic.com/files/36139242"
response = requests.get(url_nsf)
response.raise_for_status()  # fail early if the download did not succeed
with open(filepath_nsf, "wb") as file:
    file.write(response.content)
print("Downloaded data")


# ## Read files in loop and dump to db

def load_scinet(filepath):
    df = pd.read_csv(filepath,
                     sep="\t",
                     names=["NSF_Award_Number", "PaperID", "Type", "Diff_ZScore"],
                     skiprows=1)
    df.drop_duplicates(inplace=True)

    # Create the GrantID column by keeping only the digits after the hyphen
    df['GrantID'] = df['NSF_Award_Number'].str.extract(r'-(\d+)')
Collaborator:

Why is a new variable created here? Why not just load the table as it is?

Also why drop the diff_zscore and type? These are important to figure out confidence of the link

Collaborator:

New variable: I mean GrantID; why not just keep the original NSF_Award_Number?

    df = df.drop(columns=['NSF_Award_Number', 'Diff_ZScore'])

    return df
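The `GrantID` extraction above depends on the award-number format. Assuming (hypothetically) that SciSciNet stores award numbers like `NSF-1234567`, the regex keeps only the digits after the hyphen, while rows without that pattern become missing; a quick pandas sketch of this assumed behavior:

```python
import pandas as pd

# Hypothetical award numbers; the real SciSciNet format should be verified.
s = pd.Series(["NSF-1234567", "NSF-0001234", "1234567"])
grant_ids = s.str.extract(r'-(\d+)')[0]
```

Note that because the result is a string column (and `GrantID` is declared `TEXT` in the db), leading zeros such as in `0001234` are preserved, which matters when joining against `NSF_Investigator.GrantID`.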

files = [f for f in listdir(scinet_path) if isfile(join(scinet_path, f))]


con = sqlite.connect(database=db_file, isolation_level=None)
with con:
    for i, f in enumerate(files):
        df = load_scinet(os.path.join(scinet_path, f))
        if i == 0:
            if_exists_opt = "replace"
        else:
            if_exists_opt = "append"

        df.to_sql("scinet_links_nsf",
                  con=con,
                  if_exists=if_exists_opt,
                  index=False,
                  # declare column types via `dtype`; pandas' `schema`
                  # argument takes a schema name, not a column definition
                  dtype={"PaperID": "INTEGER",
                         "Type": "TEXT",
                         "GrantID": "TEXT"})

# Make index and clean up
con.execute("CREATE UNIQUE INDEX idx_scinet_grantpaper ON scinet_links_nsf (GrantID ASC, PaperID ASC)")

analyze_db(con)

con.close()
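The unique index on `(GrantID, PaperID)` turns duplicate grant-paper links into a hard error rather than silent double-counting, even when other columns differ. A self-contained sketch with an in-memory SQLite database (the inserted values are illustrative):

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE scinet_links_nsf (PaperID INTEGER, Type TEXT, GrantID TEXT)")
con.execute("CREATE UNIQUE INDEX idx_scinet_grantpaper "
            "ON scinet_links_nsf (GrantID ASC, PaperID ASC)")

con.execute("INSERT INTO scinet_links_nsf VALUES (1, 'Cited', '1234567')")

# A second row for the same (GrantID, PaperID) pair violates the index,
# even though Type differs, because Type is not part of the key.
duplicate_rejected = False
try:
    con.execute("INSERT INTO scinet_links_nsf VALUES (1, 'Produced', '1234567')")
except sqlite3.IntegrityError:
    duplicate_rejected = True

con.close()
```

This is why dropping duplicates inside `load_scinet` before writing matters: the index creation itself would fail if duplicate pairs had already been appended across files.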