From 3c91d73f3e1bbc622ac2361ee92a5d8d7f4042c5 Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Fri, 22 Sep 2023 08:12:19 +0000
Subject: [PATCH 01/14] Download SciSciNet_NSF and link to mag authors

---
 .../main/prep_nsf/link_scinetnsf_to_mag.R     | 58 +++++++++++
 .../main/prep_nsf/scinet_data_to_db.py        | 97 +++++++++++++++++++
 2 files changed, 155 insertions(+)
 create mode 100644 src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
 create mode 100644 src/dataprep/main/prep_nsf/scinet_data_to_db.py

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
new file mode 100644
index 0000000..7170733
--- /dev/null
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -0,0 +1,58 @@
+# Link SciSciNet_Links_NSF table with Paper_Author_Affiliations, Authors, and NSF_Investigator 
+# Keeps only those with link between NSF grant and author ID.
+# Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder
+
+packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "ggplot2", "stringdist", "DBI")
+lapply(packages, library, character.only = TRUE)
+
+datapath <- "/mnt/ssd/"
+db_file  <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
+sciscinet_path <- paste0(datapath,"sciscinet_data/")
+
+
+#filepath_nsf=paste0(sciscinet_path,"SciSciNet_Link_NSF.tsv")
+
+con <- DBI::dbConnect(RSQLite::SQLite(), db_file)
+cat("The database connection is: \n")
+src_dbi(con)
+
+# Create table with all links between NSF-grant and authors via papers 
+
+NSF_to_Authors <- tbl(con, sql("
+                  select a. PaperID, a.Type, a.GrantID, b.AuthorId, b.OriginalAuthor 
+                        ,c.NormalizedName, Position, FirstName, LastName
+                                      from scinet as a
+                                      inner join (
+                                        select PaperId AS PaperID, AuthorId, OriginalAuthor
+                                        from PaperAuthorAffiliations 
+                                      )b 
+                                      using (PaperID)
+                                      inner join (
+                                        select AuthorId, NormalizedName
+                                        from Authors
+                                      ) c
+                                      using (AuthorId)
+                                      inner join (
+                                        select GrantID, Position, FirstName, LastName
+                                        from NSF_Investigator
+                               ) d 
+                               using (GrantID)
+                               "))
+
+nsf_to_authors <- collect(NSF_to_Authors)
+
+# Split the "NormalizedName" column into "nsf_firstname" and "nsf_lastname" columns
+nsf_to_authors <- nsf_to_authors %>%
+  separate(NormalizedName, into = c("nsf_firstname", "nsf_lastname"), sep = " ", extra = "merge")
+
+nsf_author_links <- subset(nsf_to_authors, select = -c(OriginalAuthor, NormalizedName, Type, PaperID)) %>%
+  mutate(name_similarity = stringdist::stringdistmatrix(paste(nsf_firstname, nsf_lastname, sep = " "), paste(FirstName, LastName, sep = " ")))
+
+# Set a threshold for similarity (e.g., 0.8 means 80% similarity)
+threshold <- 0.8
+
+# Filter observations where the names are similar or above the threshold
+similar_names <- nsf_author_links %>%
+  filter(name_similarity >= threshold)
+
+DBI::dbDisconnect(con)
\ No newline at end of file
diff --git a/src/dataprep/main/prep_nsf/scinet_data_to_db.py b/src/dataprep/main/prep_nsf/scinet_data_to_db.py
new file mode 100644
index 0000000..a5990b9
--- /dev/null
+++ b/src/dataprep/main/prep_nsf/scinet_data_to_db.py
@@ -0,0 +1,97 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# %%
+"""
+Download SciSciNet table SciSciNet_Link_NSF with links between NSF-grants and papers
+Upload into db
+Linked to PaperAuthorAffiliations, Authors and NSF_Investigator in the R file link_scinetnsf_to_mag.R in the same folder
+
+
+Create table in database:
+- SciSciNet_Link_NSF
+
+SciSciNet_Link_NSF schema is:
+
+GrantID INTEGER, PaperID INTEGER
+ 
+
+unique index on Grantid and PaperID (multiple paperIDs per GrantID)
+"""
+
+import subprocess
+import sqlite3 as sqlite
+import argparse
+import os
+from os import listdir
+from os.path import isfile, join
+import pandas as pd
+import numpy as np 
+import re 
+import sys
+import requests
+
+sys.path.append('/home/mona/mag_sample/src/dataprep/')  
+
+from helpers.variables import db_file, datapath, databasepath
+from helpers.functions import analyze_db 
+
+scinet_path = os.path.join(datapath, "sciscinet_data/")
+filepath_nsf = os.path.join(scinet_path, "SciSciNet_Link_NSF.tsv")
+
+
+
+# Download file 
+
+url_nsf = "https://ndownloader.figstatic.com/files/36139242"
+response = requests.get(url_nsf)
+with open(filepath_nsf, "wb") as file:
+    file.write(response.content)
+print("Downloaded data")   
+
+
+# ## Read files in loop and dump to db
+
+def load_scinet(filepath):
+    df = pd.read_csv(filepath, 
+                        sep="\t",
+                        names=["NSF_Award_Number", "PaperID", "Type", "Diff_ZScore"])
+    df.drop_duplicates(inplace=True)
+
+    # Create the GrantID column by removing non-numeric characters and formatting
+    
+    df['GrantID'] = df['NSF_Award_Number'].str.extract(r'-(\d+)') 
+    
+    return df
+
+files = [f for f in listdir(scinet_path) if isfile(join(scinet_path, f))]
+
+
+con = sqlite.connect(database = db_file, isolation_level= None)
+with con: 
+    for (i,f) in enumerate(files):
+        df = load_scinet(scinet_path+f)
+        #print(df.head())
+        if i==0:
+            if_exists_opt="replace"
+        else:
+            if_exists_opt="append"
+
+        df.to_sql("scinet", 
+                        con=con, 
+                        if_exists=if_exists_opt, 
+                        index=False, 
+                        schema= """NSF_Award_Number TEXT
+                                    , PaperID INTEGER
+                                    , Type TEXT
+                                    , Diff_ZScore NUMERIC
+                                    , GrantID TEXT
+                                """ 
+                    )
+
+    # Make index and clean up
+    con.execute("CREATE UNIQUE INDEX idx_scinet ON scinet (GrantID ASC, PaperID ASC)")
+
+    analyze_db(con)
+
+con.close()
\ No newline at end of file

From b2c08e00727dd73650c4f77feb6a1b2557fc4367 Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Mon, 25 Sep 2023 10:11:12 +0000
Subject: [PATCH 02/14] Some improvements in loading to db, linking

---
 .../main/prep_nsf/link_scinetnsf_to_mag.R     | 50 +++++++++++++++----
 .../main/prep_nsf/scinet_data_to_db.py        | 20 ++++----
 2 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index 7170733..e2285c7 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -2,12 +2,17 @@
 # Keeps only those with link between NSF grant and author ID.
 # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder
 
-packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "ggplot2", "stringdist", "DBI")
+
+# Note: Not sure if calculating string distance now works correctly
+ 
+
+
+packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist")
 lapply(packages, library, character.only = TRUE)
 
 datapath <- "/mnt/ssd/"
 db_file  <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
-sciscinet_path <- paste0(datapath,"sciscinet_data/")
+#sciscinet_path <- paste0(datapath,"sciscinet_data/")
 
 
 #filepath_nsf=paste0(sciscinet_path,"SciSciNet_Link_NSF.tsv")
@@ -21,7 +26,7 @@ src_dbi(con)
 NSF_to_Authors <- tbl(con, sql("
                   select a. PaperID, a.Type, a.GrantID, b.AuthorId, b.OriginalAuthor 
                         ,c.NormalizedName, Position, FirstName, LastName
-                                      from scinet as a
+                                      from scinet_links_nsf as a
                                       inner join (
                                         select PaperId AS PaperID, AuthorId, OriginalAuthor
                                         from PaperAuthorAffiliations 
@@ -41,18 +46,41 @@ NSF_to_Authors <- tbl(con, sql("
 
 nsf_to_authors <- collect(NSF_to_Authors)
 
-# Split the "NormalizedName" column into "nsf_firstname" and "nsf_lastname" columns
-nsf_to_authors <- nsf_to_authors %>%
-  separate(NormalizedName, into = c("nsf_firstname", "nsf_lastname"), sep = " ", extra = "merge")
+# Create a variable with the full name from mag 
+nsf_to_authors$mag_name <- paste(nsf_to_authors$FirstName, nsf_to_authors$LastName, sep = " ")
 
-nsf_author_links <- subset(nsf_to_authors, select = -c(OriginalAuthor, NormalizedName, Type, PaperID)) %>%
-  mutate(name_similarity = stringdist::stringdistmatrix(paste(nsf_firstname, nsf_lastname, sep = " "), paste(FirstName, LastName, sep = " ")))
+## Still running, not sure if running correctly from here
 
-# Set a threshold for similarity (e.g., 0.8 means 80% similarity)
+### Compare name similarity
+# Set a threshold for similarity
 threshold <- 0.8
 
-# Filter observations where the names are similar or above the threshold
-similar_names <- nsf_author_links %>%
+# Calculate string similarity for each row and add a new column
+name_similarity <- numeric(0)
+
+# Iterate through rows and calculate string distances
+for (i in 1:nrow(nsf_to_authors)) {
+  mag_name <- nsf_to_authors$mag_name[i]
+  NormalizedName <- nsf_to_authors$NormalizedName[i]
+  
+  # Calculate string distance for this row
+  row_similarity <- stringdistmatrix(
+    mag_name,
+    NormalizedName
+  )
+  
+  # Append the calculated distance to the results vector
+  name_similarity <- c(name_similarity, row_similarity)
+}
+
+# Assign the calculated distances to a new column in data frame
+nsf_to_authors$name_similarity <- name_similarity
+
+# Filter observations where the names are above the threshold
+similar_names <- nsf_to_authors %>%
   filter(name_similarity >= threshold)
 
+# To do: write to db (keep only necessary variables: GrantID, AuthorID, Position, Paper ID(?))
+
+# close connection to db
 DBI::dbDisconnect(con)
\ No newline at end of file
diff --git a/src/dataprep/main/prep_nsf/scinet_data_to_db.py b/src/dataprep/main/prep_nsf/scinet_data_to_db.py
index a5990b9..61986f0 100644
--- a/src/dataprep/main/prep_nsf/scinet_data_to_db.py
+++ b/src/dataprep/main/prep_nsf/scinet_data_to_db.py
@@ -9,14 +9,14 @@
 
 
 Create table in database:
-- SciSciNet_Link_NSF
+- scinet_links_nsf
 
 SciSciNet_Link_NSF schema is:
 
-GrantID INTEGER, PaperID INTEGER
+GrantID TEXT, PaperID INTEGER, Type TEXT
  
 
-unique index on Grantid and PaperID (multiple paperIDs per GrantID)
+unique index on Grantid and PaperID (multiple PaperIDs per GrantID)
 """
 
 import subprocess
@@ -55,12 +55,14 @@
 def load_scinet(filepath):
     df = pd.read_csv(filepath, 
                         sep="\t",
-                        names=["NSF_Award_Number", "PaperID", "Type", "Diff_ZScore"])
+                        names=["NSF_Award_Number", "PaperID", "Type", "Diff_ZScore"], 
+                        skiprows=1)
     df.drop_duplicates(inplace=True)
 
     # Create the GrantID column by removing non-numeric characters and formatting
     
     df['GrantID'] = df['NSF_Award_Number'].str.extract(r'-(\d+)') 
+    df = df.drop(columns=['NSF_Award_Number', 'Diff_ZScore'])
     
     return df
 
@@ -77,21 +79,19 @@ def load_scinet(filepath):
         else:
             if_exists_opt="append"
 
-        df.to_sql("scinet", 
+        df.to_sql("scinet_links_nsf", 
                         con=con, 
                         if_exists=if_exists_opt, 
                         index=False, 
-                        schema= """NSF_Award_Number TEXT
-                                    , PaperID INTEGER
+                        schema= """ PaperID INTEGER
                                     , Type TEXT
-                                    , Diff_ZScore NUMERIC
                                     , GrantID TEXT
                                 """ 
                     )
 
     # Make index and clean up
-    con.execute("CREATE UNIQUE INDEX idx_scinet ON scinet (GrantID ASC, PaperID ASC)")
+    con.execute("CREATE UNIQUE INDEX idx_scinet_grantpaper ON scinet_links_nsf (GrantID ASC, PaperID ASC)")
 
     analyze_db(con)
 
-con.close()
\ No newline at end of file
+con.close()

From 5a17e7fad574142dd4d9372226f747a13662f21a Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Mon, 25 Sep 2023 11:24:37 +0000
Subject: [PATCH 03/14] Updated code for name similarity btw nsf and mag

---
 src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index e2285c7..c768a9d 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -58,13 +58,14 @@ threshold <- 0.8
 # Calculate string similarity for each row and add a new column
 name_similarity <- numeric(0)
 
+
 # Iterate through rows and calculate string distances
 for (i in 1:nrow(nsf_to_authors)) {
   mag_name <- nsf_to_authors$mag_name[i]
   NormalizedName <- nsf_to_authors$NormalizedName[i]
   
   # Calculate string distance for this row
-  row_similarity <- stringdistmatrix(
+  row_similarity <- stringsim(
     mag_name,
     NormalizedName
   )
@@ -80,7 +81,13 @@ nsf_to_authors$name_similarity <- name_similarity
 similar_names <- nsf_to_authors %>%
   filter(name_similarity >= threshold)
 
-# To do: write to db (keep only necessary variables: GrantID, AuthorID, Position, Paper ID(?))
+# drop unnecessary variables
+df <- similar_names %>%
+  select(GrantID, AuthorId, Position) %>% 
+  distinct()
+
+# Write table to db: 
+dbWriteTable(con, name = "links_nsf_mag", value = df, overwrite = TRUE)
 
 # close connection to db
 DBI::dbDisconnect(con)
\ No newline at end of file

From 64e10d92600d20118e73aa4e74c6229a1d60ba86 Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Tue, 26 Sep 2023 07:19:10 +0000
Subject: [PATCH 04/14] Final code to download nsf links and upload to db

---
 .../main/prep_nsf/scinet_data_to_db.py        | 27 ++++++++++++-------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/dataprep/main/prep_nsf/scinet_data_to_db.py b/src/dataprep/main/prep_nsf/scinet_data_to_db.py
index 61986f0..b42fc31 100644
--- a/src/dataprep/main/prep_nsf/scinet_data_to_db.py
+++ b/src/dataprep/main/prep_nsf/scinet_data_to_db.py
@@ -13,7 +13,7 @@
 
 SciSciNet_Link_NSF schema is:
 
-GrantID TEXT, PaperID INTEGER, Type TEXT
+GrantID TEXT, PaperID INTEGER, Type TEXT, Diff_ZScore NUMERIC
  
 
 unique index on Grantid and PaperID (multiple PaperIDs per GrantID)
@@ -28,10 +28,10 @@
 import pandas as pd
 import numpy as np 
 import re 
-import sys
-import requests
+#import sys
+import requests  # still required: requests.get(url_nsf) below downloads the TSV
 
-sys.path.append('/home/mona/mag_sample/src/dataprep/')  
+#sys.path.append('/home/mona/mag_sample/src/dataprep/')  
 
 from helpers.variables import db_file, datapath, databasepath
 from helpers.functions import analyze_db 
@@ -50,8 +50,8 @@
 print("Downloaded data")   
 
 
-# ## Read files in loop and dump to db
-
+# ## Read file and dump to db
+# skip the file's header row (skiprows=1): the column names are set explicitly via names= instead
 def load_scinet(filepath):
     df = pd.read_csv(filepath, 
                         sep="\t",
@@ -59,15 +59,22 @@ def load_scinet(filepath):
                         skiprows=1)
     df.drop_duplicates(inplace=True)
 
-    # Create the GrantID column by removing non-numeric characters and formatting
+    # Create GrantID in the same format as previously used: keep only the digits that follow the hyphen in NSF_Award_Number (regex -(\d+))
+    # drop NSF_Award_Number as we only need GrantID
     
     df['GrantID'] = df['NSF_Award_Number'].str.extract(r'-(\d+)') 
-    df = df.drop(columns=['NSF_Award_Number', 'Diff_ZScore'])
+    df = df.drop(columns=['NSF_Award_Number'])
+
+    #Check that all rows will be uploaded into db: in raw file 1309518 rows
+    num_observations = df.shape[0]
+    print(num_observations, "rows of 1309518 rows in the raw file will be loaded into the db")
     
     return df
 
 files = [f for f in listdir(scinet_path) if isfile(join(scinet_path, f))]
 
+    
+
 
 con = sqlite.connect(database = db_file, isolation_level= None)
 with con: 
@@ -86,10 +93,12 @@ def load_scinet(filepath):
                         schema= """ PaperID INTEGER
                                     , Type TEXT
                                     , GrantID TEXT
+                                    , Diff_ZScore NUMERIC
                                 """ 
                     )
 
-    # Make index and clean up
+    # Make index and clean up: 
+    # The UNIQUE index doubles as a check: creating it fails if any (GrantID, PaperID) pair occurs more than once in the table
     con.execute("CREATE UNIQUE INDEX idx_scinet_grantpaper ON scinet_links_nsf (GrantID ASC, PaperID ASC)")
 
     analyze_db(con)

From 5ac96321a99e5b48013bd0f1faa7c682f91f5f11 Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Tue, 26 Sep 2023 15:10:59 +0000
Subject: [PATCH 05/14] Final code to link mag and nsf + upload to db

---
 .../main/prep_nsf/link_scinetnsf_to_mag.R     | 99 ++++++++++++-------
 1 file changed, 63 insertions(+), 36 deletions(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index c768a9d..2e5cfff 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -1,36 +1,29 @@
 # Link SciSciNet_Links_NSF table with Paper_Author_Affiliations, Authors, and NSF_Investigator 
 # Keeps only those with link between NSF grant and author ID.
+# only those links with a similar name (similarity >=0.8) are loaded into db
 # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder
 
-
-# Note: Not sure if calculating string distance now works correctly
- 
-
-
 packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist")
 lapply(packages, library, character.only = TRUE)
 
 datapath <- "/mnt/ssd/"
 db_file  <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
-#sciscinet_path <- paste0(datapath,"sciscinet_data/")
-
 
-#filepath_nsf=paste0(sciscinet_path,"SciSciNet_Link_NSF.tsv")
 
 con <- DBI::dbConnect(RSQLite::SQLite(), db_file)
 cat("The database connection is: \n")
 src_dbi(con)
 
-# Create table with all links between NSF-grant and authors via papers 
+# Create table with all links between NSF-grant and authors via papers
 
 NSF_to_Authors <- tbl(con, sql("
-                  select a. PaperID, a.Type, a.GrantID, b.AuthorId, b.OriginalAuthor 
-                        ,c.NormalizedName, Position, FirstName, LastName
+                  select a. PaperID, a.Type, a.GrantID, b.AuthorId
+                        ,c.NormalizedName, d.Position, d.PIFullName
                                       from scinet_links_nsf as a
                                       inner join (
-                                        select PaperId AS PaperID, AuthorId, OriginalAuthor
-                                        from PaperAuthorAffiliations 
-                                      )b 
+                                        select PaperId AS PaperID, AuthorId
+                                        from PaperAuthorAffiliations
+                                      )b
                                       using (PaperID)
                                       inner join (
                                         select AuthorId, NormalizedName
@@ -38,55 +31,89 @@ NSF_to_Authors <- tbl(con, sql("
                                       ) c
                                       using (AuthorId)
                                       inner join (
-                                        select GrantID, Position, FirstName, LastName
+                                        select GrantID, Position, PIFullName
                                         from NSF_Investigator
-                               ) d 
+                               ) d
                                using (GrantID)
                                "))
 
 nsf_to_authors <- collect(NSF_to_Authors)
 
-# Create a variable with the full name from mag 
-nsf_to_authors$mag_name <- paste(nsf_to_authors$FirstName, nsf_to_authors$LastName, sep = " ")
+# Create separate variables for first and last name for both nsf and mag names
+nsf_to_authors <- nsf_to_authors %>%
+  mutate(
+    mag_firstname = word(NormalizedName, 1),
+    mag_lastname = word(NormalizedName, -1),
+    mag_middlename = ifelse(str_count(NormalizedName, "\\s+") >= 2 &
+                              word(NormalizedName, 2) != word(NormalizedName, -1),
+                            word(NormalizedName, 2), NA_character_)
+  )
+
+
+nsf_to_authors <- nsf_to_authors %>%
+  mutate(
+    nsf_firstname = word(PIFullName, 1),
+    nsf_lastname = word(PIFullName, -1),
+    nsf_middlename = ifelse(str_count(PIFullName, "\\s+") >= 2 &
+                     word(PIFullName, 2) != word(PIFullName, -1),
+                     word(PIFullName, 2), NA_character_)
+  )
+
 
-## Still running, not sure if running correctly from here
 
 ### Compare name similarity
 # Set a threshold for similarity
 threshold <- 0.8
 
-# Calculate string similarity for each row and add a new column
-name_similarity <- numeric(0)
 
+### Test several distances
+
+# Calculate string similarity for first and last names by row and add a new column
+firstname_similarity <- numeric(0)
+lastname_similarity <- numeric(0)
 
-# Iterate through rows and calculate string distances
+# Row-wise similarity of first and last names. NOTE(review): stringsim() is vectorised, so this loop could be two column-wise calls; 1:nrow() also yields c(1, 0) on an empty table
 for (i in 1:nrow(nsf_to_authors)) {
-  mag_name <- nsf_to_authors$mag_name[i]
-  NormalizedName <- nsf_to_authors$NormalizedName[i]
+  mag_firstname <- nsf_to_authors$mag_firstname[i]
+  nsf_firstname <- nsf_to_authors$nsf_firstname[i]
+  
+  # Similarity (not distance) of the first names, 1 = identical; "osa" = Optimal String Alignment (the default method)
+  first_row_similarity <- stringsim(
+    mag_firstname,
+    nsf_firstname,
+    method="osa" )
+  
+  # Append this row's similarity to the first-name results vector
+ firstname_similarity <- c(firstname_similarity, first_row_similarity)
+
+ # Same similarity comparison for the last names of this row
+    mag_lastname <- nsf_to_authors$mag_lastname[i]
+    nsf_lastname <- nsf_to_authors$nsf_lastname[i]
   
-  # Calculate string distance for this row
-  row_similarity <- stringsim(
-    mag_name,
-    NormalizedName
+    last_row_similarity <- stringsim(
+     mag_lastname,
+     nsf_lastname,
+     method="osa"
   )
   
-  # Append the calculated distance to the results vector
-  name_similarity <- c(name_similarity, row_similarity)
+  # Append this row's similarity to the last-name results vector
+  lastname_similarity <- c(lastname_similarity, last_row_similarity)
 }
 
 # Assign the calculated distances to a new column in data frame
-nsf_to_authors$name_similarity <- name_similarity
+nsf_to_authors$firstname_similarity <- firstname_similarity
+nsf_to_authors$lastname_similarity <- lastname_similarity
 
-# Filter observations where the names are above the threshold
+# Filter observations where the names are above the threshold: threshold seemed reasonable as it allows for a single typo
 similar_names <- nsf_to_authors %>%
-  filter(name_similarity >= threshold)
+  filter(firstname_similarity >= threshold & lastname_similarity >= threshold)
 
-# drop unnecessary variables
+# drop unnecessary variables and drop duplicates
 df <- similar_names %>%
-  select(GrantID, AuthorId, Position) %>% 
+  select(GrantID, AuthorId, Position, firstname_similarity, lastname_similarity) %>%
   distinct()
 
-# Write table to db: 
+# Write table to db:
 dbWriteTable(con, name = "links_nsf_mag", value = df, overwrite = TRUE)
 
 # close connection to db

From ae370fcda42a52be7e293fb195335eb0195d6f29 Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Tue, 26 Sep 2023 15:11:44 +0000
Subject: [PATCH 06/14] Comparison of methods for stringdistance

---
 .../main/prep_nsf/test_name_similarity.csv    | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 src/dataprep/main/prep_nsf/test_name_similarity.csv

diff --git a/src/dataprep/main/prep_nsf/test_name_similarity.csv b/src/dataprep/main/prep_nsf/test_name_similarity.csv
new file mode 100644
index 0000000..98f68a1
--- /dev/null
+++ b/src/dataprep/main/prep_nsf/test_name_similarity.csv
@@ -0,0 +1,101 @@
+"NormalizedName","PIFullName","mag_firstname","mag_lastname","mag_middlename","nsf_firstname","nsf_lastname","nsf_middlename","firstname_similarity_osa","lastname_similarity_osa","firstname_similarity_lv","lastname_similarity_lv","firstname_similarity_dl","lastname_similarity_dl","firstname_similarity_lcs","lastname_similarity_lcs","firstname_similarity_qgram","lastname_similarity_qgram","firstname_similarity_cosine","lastname_similarity_cosine","firstname_similarity_jac","lastname_similarity_jac","firstname_similarity_jw0","lastname_similarity_jw0","firstname_similarity_jw.1","lastname_similarity_jw.1","firstname_similarity_jw.2","lastname_similarity_jw.2"
+"jennifer fowler","jennifer w fowler","jennifer","fowler",NA,"jennifer","fowler","w",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"thomas colligan","jennifer w fowler","thomas","colligan",NA,"jennifer","fowler","w",0,0.25,0,0.25,0,0.25,0,0.285714285714286,0,0.285714285714286,0,0.387298334620742,0,0.181818181818182,0,0.527777777777778,0,0.527777777777778,0,0.527777777777778
+"jaxen godfrey","jennifer w fowler","jaxen","godfrey",NA,"jennifer","fowler","w",0.25,0.285714285714286,0.25,0.285714285714286,0.25,0.285714285714286,0.461538461538462,0.307692307692308,0.461538461538462,0.615384615384615,0.645497224367903,0.617213399848368,0.375,0.444444444444444,0.658333333333333,0.531746031746032,0.6925,0.531746031746032,0.726666666666667,0.531746031746032
+"carl spangrude","jennifer w fowler","carl","spangrude",NA,"jennifer","fowler","w",0,0.111111111111111,0,0.111111111111111,0,0.111111111111111,0.166666666666667,0.133333333333333,0.166666666666667,0.266666666666667,0.144337567297406,0.272165526975909,0.111111111111111,0.153846153846154,0,0.425925925925926,0,0.425925925925926,0,0.425925925925926
+"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"jiyou li","daqing wan","jiyou","li",NA,"daqing","wan",NA,0,0,0,0,0,0,0.181818181818182,0,0.181818181818182,0,0.182574185835055,0,0.1,0,0.455555555555555,0,0.455555555555555,0,0.455555555555555,0
+"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"krishna kaipa","daqing wan","krishna","kaipa",NA,"daqing","wan",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.461538461538462,0.25,0.462910049886276,0.436435780471985,0.3,0.166666666666667,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111
+"jun zhang","daqing wan","jun","zhang",NA,"daqing","wan",NA,0.166666666666667,0.4,0.166666666666667,0.4,0.166666666666667,0.4,0.222222222222222,0.5,0.222222222222222,0.5,0.235702260395516,0.516397779494322,0.125,0.333333333333333,0.5,0.688888888888889,0.5,0.688888888888889,0.5,0.688888888888889
+"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"jun zhang","daqing wan","jun","zhang",NA,"daqing","wan",NA,0.166666666666667,0.4,0.166666666666667,0.4,0.166666666666667,0.4,0.222222222222222,0.5,0.222222222222222,0.5,0.235702260395516,0.516397779494322,0.125,0.333333333333333,0.5,0.688888888888889,0.5,0.688888888888889,0.5,0.688888888888889
+"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"alicia marino","daqing wan","alicia","marino",NA,"daqing","wan",NA,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.333333333333333,0.444444444444444,0.333333333333333,0.444444444444444,0.516397779494322,0.471404520791032,0.25,0.285714285714286,0.555555555555555,0.666666666666667,0.555555555555555,0.666666666666667,0.555555555555555,0.666666666666667
+"angela robinson","daqing wan","angela","robinson",NA,"daqing","wan",NA,0,0.125,0,0.125,0,0.125,0.5,0.181818181818182,0.5,0.181818181818182,0.577350269189626,0.333333333333333,0.375,0.125,0.444444444444444,0.486111111111111,0.444444444444444,0.486111111111111,0.444444444444444,0.486111111111111
+"tim lai","daqing wan","tim","lai",NA,"daqing","wan",NA,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.222222222222222,0.333333333333333,0.222222222222222,0.333333333333333,0.235702260395516,0.333333333333333,0.125,0.2,0.5,0.555555555555555,0.5,0.555555555555555,0.5,0.555555555555555
+"nathan beckmann","nathan beckmann","nathan","beckmann",NA,"nathan","beckmann",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"brian c schwedock","nathan beckmann","brian","schwedock","c","nathan","beckmann",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.352941176470588,0.363636363636364,0.352941176470588,0.565685424949238,0.381385035698237,0.285714285714286,0.25,0.577777777777778,0.324074074074074,0.577777777777778,0.324074074074074,0.577777777777778,0.324074074074074
+"george yin","hongwei zhang","george","yin",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.461538461538462,0.25,0.461538461538462,0.25,0.597614304667197,0.258198889747161,0.375,0.142857142857143,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111
+"jayanthi rao","hongwei zhang","jayanthi","rao",NA,"hongwei","zhang",NA,0.125,0.2,0.125,0.2,0.125,0.2,0.266666666666667,0.25,0.4,0.25,0.358568582800318,0.258198889747161,0.272727272727273,0.142857142857143,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111
+"chuan li","hongwei zhang","chuan","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.333333333333333,0,0.333333333333333,0,0.338061701891406,0,0.2,0,0.561904761904762,0,0.561904761904762,0,0.561904761904762,0
+"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"le yi wang","hongwei zhang","le","wang","yi","hongwei","zhang",NA,0.142857142857143,0.6,0.142857142857143,0.6,0.142857142857143,0.6,0.222222222222222,0.666666666666667,0.222222222222222,0.666666666666667,0.267261241912424,0.670820393249937,0.125,0.5,0,0.783333333333333,0,0.783333333333333,0,0.783333333333333
+"george yin","hongwei zhang","george","yin",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.461538461538462,0.25,0.461538461538462,0.25,0.597614304667197,0.258198889747161,0.375,0.142857142857143,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111
+"jayanthi rao","hongwei zhang","jayanthi","rao",NA,"hongwei","zhang",NA,0.125,0.2,0.125,0.2,0.125,0.2,0.266666666666667,0.25,0.4,0.25,0.358568582800318,0.258198889747161,0.272727272727273,0.142857142857143,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111
+"chuan li","hongwei zhang","chuan","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.333333333333333,0,0.333333333333333,0,0.338061701891406,0,0.2,0,0.561904761904762,0,0.561904761904762,0,0.561904761904762,0
+"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"le yi wang","hongwei zhang","le","wang","yi","hongwei","zhang",NA,0.142857142857143,0.6,0.142857142857143,0.6,0.142857142857143,0.6,0.222222222222222,0.666666666666667,0.222222222222222,0.666666666666667,0.267261241912424,0.670820393249937,0.125,0.5,0,0.783333333333333,0,0.783333333333333,0,0.783333333333333
+"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0
+"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0
+"yang zheng","hongwei zhang","yang","zheng",NA,"hongwei","zhang",NA,0.285714285714286,0.8,0.285714285714286,0.8,0.285714285714286,0.8,0.363636363636364,0.8,0.363636363636364,0.8,0.377964473009227,0.8,0.222222222222222,0.666666666666667,0.595238095238095,0.866666666666667,0.595238095238095,0.893333333333333,0.595238095238095,0.92
+"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"jianqiang wang","hongwei zhang","jianqiang","wang",NA,"hongwei","zhang",NA,0.111111111111111,0.6,0.111111111111111,0.6,0.111111111111111,0.6,0.25,0.666666666666667,0.375,0.666666666666667,0.487950036474267,0.670820393249937,0.3,0.5,0.502645502645503,0.783333333333333,0.502645502645503,0.783333333333333,0.502645502645503,0.783333333333333
+"xiaohui qin","hongwei zhang","xiaohui","qin",NA,"hongwei","zhang",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.285714285714286,0.25,0.428571428571429,0.25,0.50395263067897,0.258198889747161,0.3,0.142857142857143,0.523809523809524,0.511111111111111,0.523809523809524,0.511111111111111,0.523809523809524,0.511111111111111
+"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0
+"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0
+"george yin","hongwei zhang","george","yin",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.461538461538462,0.25,0.461538461538462,0.25,0.597614304667197,0.258198889747161,0.375,0.142857142857143,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111
+"le yi wang","hongwei zhang","le","wang","yi","hongwei","zhang",NA,0.142857142857143,0.6,0.142857142857143,0.6,0.142857142857143,0.6,0.222222222222222,0.666666666666667,0.222222222222222,0.666666666666667,0.267261241912424,0.670820393249937,0.125,0.5,0,0.783333333333333,0,0.783333333333333,0,0.783333333333333
+"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"thu nguyen","hongwei zhang","thu","nguyen",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.2,0.363636363636364,0.2,0.363636363636364,0.218217890235992,0.474341649025257,0.111111111111111,0.25,0.492063492063492,0.455555555555555,0.492063492063492,0.455555555555555,0.492063492063492,0.455555555555555
+"j karl hedrick","hongwei zhang","j","hedrick","karl","hongwei","zhang",NA,0,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0.447619047619048,0,0.447619047619048,0,0.447619047619048
+"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0
+"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0
+"feng gao","hongwei zhang","feng","gao",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.363636363636364,0.25,0.545454545454545,0.5,0.566946709513841,0.516397779494322,0.375,0.333333333333333,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111
+"yang zheng","hongwei zhang","yang","zheng",NA,"hongwei","zhang",NA,0.285714285714286,0.8,0.285714285714286,0.8,0.285714285714286,0.8,0.363636363636364,0.8,0.363636363636364,0.8,0.377964473009227,0.8,0.222222222222222,0.666666666666667,0.595238095238095,0.866666666666667,0.595238095238095,0.893333333333333,0.595238095238095,0.92
+"yujia wu","hongwei zhang","yujia","wu",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0,0,0,0,0,0
+"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"j karl hedrick","jing hua","j","hedrick","karl","jing","hua",NA,0.25,0.142857142857143,0.25,0.142857142857143,0.25,0.142857142857143,0.4,0.2,0.4,0.2,0.5,0.218217890235992,0.25,0.111111111111111,0.75,0.492063492063492,0.775,0.542857142857143,0.8,0.593650793650794
+"j karl hedrick","hongwei zhang","j","hedrick","karl","hongwei","zhang",NA,0,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0.447619047619048,0,0.447619047619048,0,0.447619047619048
+"j karl hedrick","jayanthi rao","j","hedrick","karl","jayanthi","rao",NA,0.125,0.142857142857143,0.125,0.142857142857143,0.125,0.142857142857143,0.222222222222222,0.2,0.222222222222222,0.2,0.316227766016838,0.218217890235992,0.142857142857143,0.111111111111111,0.708333333333333,0,0.7375,0,0.766666666666667,0
+"j karl hedrick","anthony holt","j","hedrick","karl","anthony","holt",NA,0,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.181818181818182,0,0.181818181818182,0,0.188982236504614,0,0.1,0,0.464285714285714,0,0.517857142857143,0,0.571428571428571
+"keqiang li","jing hua","keqiang","li",NA,"jing","hua",NA,0.428571428571429,0,0.428571428571429,0,0.428571428571429,0,0.545454545454545,0,0.545454545454545,0,0.566946709513841,0,0.375,0,0.464285714285714,0,0.464285714285714,0,0.464285714285714,0
+"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0
+"keqiang li","jayanthi rao","keqiang","li",NA,"jayanthi","rao",NA,0.125,0,0.125,0,0.125,0,0.266666666666667,0,0.4,0,0.478091443733757,0,0.272727272727273,0,0.511904761904762,0,0.511904761904762,0,0.511904761904762,0
+"keqiang li","anthony holt","keqiang","li",NA,"anthony","holt",NA,0.142857142857143,0.25,0.142857142857143,0.25,0.142857142857143,0.25,0.285714285714286,0.333333333333333,0.285714285714286,0.333333333333333,0.377964473009227,0.353553390593274,0.181818181818182,0.2,0.428571428571428,0,0.428571428571428,0,0.428571428571428,0
+"shengbo eben li","jing hua","shengbo","li","eben","jing","hua",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.363636363636364,0,0.363636363636364,0,0.377964473009227,0,0.222222222222222,0,0.595238095238095,0,0.595238095238095,0,0.595238095238095,0
+"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0
+"shengbo eben li","jayanthi rao","shengbo","li","eben","jayanthi","rao",NA,0.125,0,0.125,0,0.125,0,0.133333333333333,0,0.266666666666667,0,0.239045721866879,0,0.166666666666667,0,0.422619047619048,0,0.422619047619048,0,0.422619047619048,0
+"shengbo eben li","anthony holt","shengbo","li","eben","anthony","holt",NA,0,0.25,0,0.25,0,0.25,0.285714285714286,0.333333333333333,0.428571428571429,0.333333333333333,0.50395263067897,0.353553390593274,0.3,0.2,0.507936507936508,0,0.507936507936508,0,0.507936507936508,0
+"feng gao","jing hua","feng","gao",NA,"jing","hua",NA,0.5,0,0.5,0,0.5,0,0.5,0.333333333333333,0.5,0.333333333333333,0.5,0.333333333333333,0.333333333333333,0.2,0.666666666666667,0,0.666666666666667,0,0.666666666666667,0
+"feng gao","hongwei zhang","feng","gao",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.363636363636364,0.25,0.545454545454545,0.5,0.566946709513841,0.516397779494322,0.375,0.333333333333333,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111
+"feng gao","jayanthi rao","feng","gao",NA,"jayanthi","rao",NA,0.125,0.666666666666667,0.125,0.666666666666667,0.125,0.666666666666667,0.166666666666667,0.666666666666667,0.166666666666667,0.666666666666667,0.158113883008419,0.666666666666667,0.1,0.5,0.458333333333333,0.777777777777778,0.458333333333333,0.777777777777778,0.458333333333333,0.777777777777778
+"feng gao","anthony holt","feng","gao",NA,"anthony","holt",NA,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.181818181818182,0.285714285714286,0.181818181818182,0.285714285714286,0.333333333333333,0.288675134594813,0.111111111111111,0.166666666666667,0.464285714285714,0.527777777777778,0.464285714285714,0.527777777777778,0.464285714285714,0.527777777777778
+"yang zheng","jing hua","yang","zheng",NA,"jing","hua",NA,0.5,0.2,0.5,0.2,0.5,0.2,0.5,0.25,0.5,0.25,0.5,0.258198889747161,0.333333333333333,0.142857142857143,0.666666666666667,0.511111111111111,0.666666666666667,0.511111111111111,0.666666666666667,0.511111111111111
+"yang zheng","hongwei zhang","yang","zheng",NA,"hongwei","zhang",NA,0.285714285714286,0.8,0.285714285714286,0.8,0.285714285714286,0.8,0.363636363636364,0.8,0.363636363636364,0.8,0.377964473009227,0.8,0.222222222222222,0.666666666666667,0.595238095238095,0.866666666666667,0.595238095238095,0.893333333333333,0.595238095238095,0.92
+"yang zheng","jayanthi rao","yang","zheng",NA,"jayanthi","rao",NA,0.375,0,0.375,0,0.375,0,0.5,0,0.5,0,0.632455532033676,0,0.375,0,0.597222222222222,0,0.597222222222222,0,0.597222222222222,0
+"yang zheng","anthony holt","yang","zheng",NA,"anthony","holt",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.363636363636364,0.222222222222222,0.545454545454545,0.222222222222222,0.666666666666667,0.223606797749979,0.428571428571429,0.125,0.595238095238095,0.483333333333333,0.595238095238095,0.483333333333333,0.595238095238095,0.483333333333333
+"yujia wu","jing hua","yujia","wu",NA,"jing","hua",NA,0.2,0.333333333333333,0.2,0.333333333333333,0.2,0.333333333333333,0.444444444444444,0.4,0.444444444444444,0.4,0.447213595499958,0.408248290463863,0.285714285714286,0.25,0,0.611111111111111,0,0.611111111111111,0,0.611111111111111
+"yujia wu","hongwei zhang","yujia","wu",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0,0,0,0,0,0
+"yujia wu","jayanthi rao","yujia","wu",NA,"jayanthi","rao",NA,0.125,0,0.125,0,0.125,0,0.307692307692308,0,0.615384615384615,0,0.707106781186547,0,0.5,0,0.491666666666667,0,0.491666666666667,0,0.491666666666667,0
+"yujia wu","anthony holt","yujia","wu",NA,"anthony","holt",NA,0,0,0,0,0,0,0.166666666666667,0,0.333333333333333,0,0.298142396999972,0,0.222222222222222,0,0,0,0,0,0,0
+"hongwei zhang","jing hua","hongwei","zhang",NA,"jing","hua",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.363636363636364,0.5,0.545454545454545,0.5,0.566946709513841,0.516397779494322,0.375,0.333333333333333,0.595238095238095,0.688888888888889,0.595238095238095,0.688888888888889,0.595238095238095,0.688888888888889
+"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"hongwei zhang","jayanthi rao","hongwei","zhang",NA,"jayanthi","rao",NA,0.125,0.2,0.125,0.2,0.125,0.2,0.266666666666667,0.25,0.4,0.25,0.358568582800318,0.258198889747161,0.272727272727273,0.142857142857143,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111
+"hongwei zhang","anthony holt","hongwei","zhang",NA,"anthony","holt",NA,0,0.2,0,0.2,0,0.2,0.428571428571429,0.222222222222222,0.428571428571429,0.222222222222222,0.50395263067897,0.223606797749979,0.3,0.125,0.428571428571428,0.483333333333333,0.428571428571428,0.483333333333333,0.428571428571428,0.483333333333333
+"kumaresh singh","adrian sandu","kumaresh","singh",NA,"adrian","sandu",NA,0.125,0.4,0.125,0.4,0.125,0.4,0.285714285714286,0.4,0.285714285714286,0.4,0.375,0.4,0.181818181818182,0.25,0.527777777777778,0.6,0.527777777777778,0.64,0.527777777777778,0.68
+"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"mohamed jardak","adrian sandu","mohamed","jardak",NA,"adrian","sandu",NA,0,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0.307692307692308,0.363636363636364,0.307692307692308,0.363636363636364,0.353553390593274,0.474341649025257,0.222222222222222,0.25,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778
+"meemong lee","adrian sandu","meemong","lee",NA,"adrian","sandu",NA,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.153846153846154,0,0.153846153846154,0,0.106600358177805,0,0.111111111111111,0,0.436507936507937,0,0.436507936507937,0,0.436507936507937,0
+"kumaresh singh","adrian sandu","kumaresh","singh",NA,"adrian","sandu",NA,0.125,0.4,0.125,0.4,0.125,0.4,0.285714285714286,0.4,0.285714285714286,0.4,0.375,0.4,0.181818181818182,0.25,0.527777777777778,0.6,0.527777777777778,0.64,0.527777777777778,0.68
+"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"kevin w bowman","adrian sandu","kevin","bowman","w","adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.363636363636364,0.363636363636364,0.363636363636364,0.316227766016838,0.365148371670111,0.25,0.222222222222222,0.577777777777778,0,0.577777777777778,0,0.577777777777778,0
+"alexandru cioaca","adrian sandu","alexandru","cioaca",NA,"adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.4,0.181818181818182,0.666666666666667,0.181818181818182,0.746202507244636,0.282842712474619,0.444444444444444,0.125,0.611111111111111,0.455555555555555,0.65,0.455555555555555,0.688888888888889,0.455555555555555
+"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"elias d nino ruiz","adrian sandu","elias","ruiz","d","adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.222222222222222,0.363636363636364,0.222222222222222,0.474341649025257,0.223606797749979,0.25,0.125,0.577777777777778,0,0.577777777777778,0,0.577777777777778,0
+"mohamed jardak","adrian sandu","mohamed","jardak",NA,"adrian","sandu",NA,0,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0.307692307692308,0.363636363636364,0.307692307692308,0.363636363636364,0.353553390593274,0.474341649025257,0.222222222222222,0.25,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778
+"meemong lee","adrian sandu","meemong","lee",NA,"adrian","sandu",NA,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.153846153846154,0,0.153846153846154,0,0.106600358177805,0,0.111111111111111,0,0.436507936507937,0,0.436507936507937,0,0.436507936507937,0
+"kumaresh singh","adrian sandu","kumaresh","singh",NA,"adrian","sandu",NA,0.125,0.4,0.125,0.4,0.125,0.4,0.285714285714286,0.4,0.285714285714286,0.4,0.375,0.4,0.181818181818182,0.25,0.527777777777778,0.6,0.527777777777778,0.64,0.527777777777778,0.68
+"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"kevin w bowman","adrian sandu","kevin","bowman","w","adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.363636363636364,0.363636363636364,0.363636363636364,0.316227766016838,0.365148371670111,0.25,0.222222222222222,0.577777777777778,0,0.577777777777778,0,0.577777777777778,0
+"ahmed attia","adrian sandu","ahmed","attia",NA,"adrian","sandu",NA,0.166666666666667,0,0.166666666666667,0,0.166666666666667,0,0.363636363636364,0.2,0.363636363636364,0.2,0.474341649025257,0.298142396999972,0.25,0.142857142857143,0.455555555555555,0.466666666666667,0.51,0.466666666666667,0.564444444444444,0.466666666666667
+"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"vishwas rao","adrian sandu","vishwas","rao",NA,"adrian","sandu",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.307692307692308,0.25,0.353553390593274,0.258198889747161,0.222222222222222,0.142857142857143,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111
+"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"vishwas rao","adrian sandu","vishwas","rao",NA,"adrian","sandu",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.307692307692308,0.25,0.353553390593274,0.258198889747161,0.222222222222222,0.142857142857143,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111
+"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"haiyan cheng","adrian sandu","haiyan","cheng",NA,"adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.666666666666667,0.2,0.666666666666667,0.2,0.75,0.2,0.428571428571429,0.111111111111111,0.777777777777778,0.466666666666667,0.777777777777778,0.466666666666667,0.777777777777778,0.466666666666667
+"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"ahmed attia","adrian sandu","ahmed","attia",NA,"adrian","sandu",NA,0.166666666666667,0,0.166666666666667,0,0.166666666666667,0,0.363636363636364,0.2,0.363636363636364,0.2,0.474341649025257,0.298142396999972,0.25,0.142857142857143,0.455555555555555,0.466666666666667,0.51,0.466666666666667,0.564444444444444,0.466666666666667
+"vishwas rao","adrian sandu","vishwas","rao",NA,"adrian","sandu",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.307692307692308,0.25,0.353553390593274,0.258198889747161,0.222222222222222,0.142857142857143,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111

From 47aa945b5a5bbf8c8d2e95815da080c3f75936bd Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Thu, 28 Sep 2023 12:58:53 +0000
Subject: [PATCH 07/14] Added info on processing stage + updated pipeline

---
 .../main/prep_nsf/link_scinetnsf_to_mag.R     | 60 +++++++++++++++++--
 src/dataprep/pipeline.sh                      |  7 ++-
 2 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index 2e5cfff..1aecf7d 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -3,6 +3,11 @@
 # only those links with a similar name (similarity >=0.8) are loaded into db
 # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder
 
+# Initialize variables for counting rows and timestamp
+row_count <- 0
+start_time <- Sys.time()
+cat(sprintf("Started at", start_time))
+
 packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist")
 lapply(packages, library, character.only = TRUE)
 
@@ -13,6 +18,7 @@ db_file  <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
 con <- DBI::dbConnect(RSQLite::SQLite(), db_file)
 cat("The database connection is: \n")
 src_dbi(con)
+cat("Connected to db...\n")
 
 # Create table with all links between NSF-grant and authors via papers
 
@@ -73,6 +79,7 @@ firstname_similarity <- numeric(0)
 lastname_similarity <- numeric(0)
 
 # Iterate through rows and calculate string distances for first and last names separately
+cat("Start comparing names...\n")
 for (i in 1:nrow(nsf_to_authors)) {
   mag_firstname <- nsf_to_authors$mag_firstname[i]
   nsf_firstname <- nsf_to_authors$nsf_firstname[i]
@@ -83,8 +90,7 @@ for (i in 1:nrow(nsf_to_authors)) {
     nsf_firstname,
     method="osa" )
   
-  # Append the calculated distance to the results vector for first name
- firstname_similarity <- c(firstname_similarity, first_row_similarity)
+
 
  # Calculate string distance for last name by row
     mag_lastname <- nsf_to_authors$mag_lastname[i]
@@ -96,10 +102,42 @@ for (i in 1:nrow(nsf_to_authors)) {
      method="osa"
   )
   
-  # Append the calculated distance to the results vector for last name
+  # Append the calculated distances to the results vector 
+  firstname_similarity <- c(firstname_similarity, first_row_similarity)
   lastname_similarity <- c(lastname_similarity, last_row_similarity)
+
+  # Increment row count
+  row_count <- row_count + 1
+  
+  # Report progress after every 500,000th row
+  if (row_count %% 500000 == 0) {
+    # Elapsed minutes; explicit unit because difftime's default unit varies
+    elapsed_time <- difftime(Sys.time(), start_time, units = "mins")
+    elapsed_time <- as.numeric(elapsed_time)
+    
+    # Calculate percentage of data processed
+    percent_processed <- (row_count / nrow(nsf_to_authors)) * 100
+    
+    # Log row count, share of rows processed, and elapsed minutes
+    cat(sprintf(
+      "Processed %d rows (%.2f%%) in %.2f minutes.\n",
+      row_count,
+      percent_processed,
+      elapsed_time
+    ))
+  }
 }
 
+# Elapsed minutes; explicit unit because difftime's default unit varies
+elapsed_time <- difftime(Sys.time(), start_time, units = "mins")
+elapsed_time <- as.numeric(elapsed_time)
+percent_processed <- (row_count / nrow(nsf_to_authors)) * 100
+cat(sprintf(
+  "Processed all rows (%.2f%%) in %.2f minutes.\n",
+  percent_processed,
+  elapsed_time
+))
+
 # Assign the calculated distances to a new column in data frame
 nsf_to_authors$firstname_similarity <- firstname_similarity
 nsf_to_authors$lastname_similarity <- lastname_similarity
@@ -114,7 +152,19 @@ df <- similar_names %>%
   distinct()
 
 # Write table to db:
-dbWriteTable(con, name = "links_nsf_mag", value = df, overwrite = TRUE)
+cat("Starting data upload to the database...\n")
+#dbWriteTable(con, name = "links_nsf_mag2", value = df, overwrite = TRUE)
+cat("Data upload to the database is complete.\n")
+
+# Some info
+# Measure total runtime now, in minutes, instead of reusing the loop value
+final_elapsed_time <- difftime(Sys.time(), start_time, units = "mins")
+elapsed_time <- as.numeric(final_elapsed_time)
+cat(sprintf(
+  "Complete. Total time elapsed: %.2f minutes.\n",
+  elapsed_time
+))
 
 # close connection to db
-DBI::dbDisconnect(con)
\ No newline at end of file
+DBI::dbDisconnect(con)
+cat("Disconnected from db.\n")
\ No newline at end of file
diff --git a/src/dataprep/pipeline.sh b/src/dataprep/pipeline.sh
index 8431c27..2b09a04 100644
--- a/src/dataprep/pipeline.sh
+++ b/src/dataprep/pipeline.sh
@@ -143,9 +143,10 @@ Rscript -e "rmarkdown::render('$script_path/reports/quality_linking_advisors.Rmd
 # ## 3. Link NSF grants to MAG advisors
 bash $script_path/link/grants.sh $logfile_path
 
-# XXX adapt for grants - use mona train
-#python -m $script_path.link.write_csv_links --linking_type "advisors" --train_name "christoph_degree0" \
-#    &> $logfile_path/write_csv_links_advisors.log
+# XXX adapt for grants: links ScisciNet (nsf) to mag
+
+Rscript $script_path/prep_nsf/link_scinetnsf_to_mag.R \
+   &> $logfile_path/write_csv_links_grants.log
     
 Rscript -e "rmarkdown::render('$script_path/reports/quality_linking_grants.Rmd', output_dir = '$output_path')" \
     &> $logfile_path/quality_linking_grants.log

From 85820d7822937c284da71de9b7c85a5fe155385a Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Thu, 28 Sep 2023 13:03:55 +0000
Subject: [PATCH 08/14] just uncommented db-upload

---
 src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index 1aecf7d..36339a5 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -153,7 +153,7 @@ df <- similar_names %>%
 
 # Write table to db:
 cat("Starting data upload to the database...\n")
-#dbWriteTable(con, name = "links_nsf_mag2", value = df, overwrite = TRUE)
+dbWriteTable(con, name = "links_nsf_mag2", value = df, overwrite = TRUE)
 cat("Data upload to the database is complete.\n")
 
 # Some info

From ea8c1f4f0f22790eebcdfda794a5ff974862acc4 Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Fri, 29 Sep 2023 11:02:31 +0000
Subject: [PATCH 09/14] Optimized similarity check of names

---
 .../main/prep_nsf/link_scinetnsf_to_mag.R     | 173 ++++++++++--------
 1 file changed, 94 insertions(+), 79 deletions(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index 36339a5..c167eb3 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -4,11 +4,10 @@
 # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder
 
 # Initialize variables for counting rows and timestamp
-row_count <- 0
 start_time <- Sys.time()
-cat(sprintf("Started at", start_time))
+cat(sprintf("Started at %s \n", start_time))
 
-packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist")
+packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist", "purrr", "furrr")
 lapply(packages, library, character.only = TRUE)
 
 datapath <- "/mnt/ssd/"
@@ -23,7 +22,7 @@ cat("Connected to db...\n")
 # Create table with all links between NSF-grant and authors via papers
 
 NSF_to_Authors <- tbl(con, sql("
-                  select a. PaperID, a.Type, a.GrantID, b.AuthorId
+                  select a. PaperID, a.GrantID, b.AuthorId
                         ,c.NormalizedName, d.Position, d.PIFullName
                                       from scinet_links_nsf as a
                                       inner join (
@@ -44,6 +43,9 @@ NSF_to_Authors <- tbl(con, sql("
                                "))
 
 nsf_to_authors <- collect(NSF_to_Authors)
+nsf_to_authors <- nsf_to_authors %>%
+  filter(!is.na(PIFullName) & !is.na(NormalizedName))
+cat("Loaded dataset. \n")
 
 # Create separate variables for first and last name for both nsf and mag names
 nsf_to_authors <- nsf_to_authors %>%
@@ -65,105 +67,118 @@ nsf_to_authors <- nsf_to_authors %>%
                      word(PIFullName, 2), NA_character_)
   )
 
-
-
 ### Compare name similarity
 # Set a threshold for similarity
-threshold <- 0.8
-
+threshold <- 0.7
 
-### Test several distances
+### Create function to calculate name similarity (filtering happens per chunk below)
 
-# Calculate string similarity for first and last names by row and add a new column
-firstname_similarity <- numeric(0)
-lastname_similarity <- numeric(0)
+fct_similarity <- function(row) {
+  mag_firstname <- row$mag_firstname
+  nsf_firstname <- row$nsf_firstname
 
-# Iterate through rows and calculate string distances for first and last names separately
-cat("Start comparing names...\n")
-for (i in 1:nrow(nsf_to_authors)) {
-  mag_firstname <- nsf_to_authors$mag_firstname[i]
-  nsf_firstname <- nsf_to_authors$nsf_firstname[i]
-  
-  # Calculate string distance for first name by row using Optimal String Alignment (default)
+  # Calculate string distances by row using Optimal String Alignment (default)
   first_row_similarity <- stringsim(
     mag_firstname,
     nsf_firstname,
     method="osa" )
   
-
-
- # Calculate string distance for last name by row
-    mag_lastname <- nsf_to_authors$mag_lastname[i]
-    nsf_lastname <- nsf_to_authors$nsf_lastname[i]
   
-    last_row_similarity <- stringsim(
-     mag_lastname,
-     nsf_lastname,
-     method="osa"
+  mag_lastname <- row$mag_lastname
+  nsf_lastname <- row$nsf_lastname
+  
+  last_row_similarity <- stringsim(
+    mag_lastname,
+    nsf_lastname,
+    method = "osa"
   )
   
-  # Append the calculated distances to the results vector 
-  firstname_similarity <- c(firstname_similarity, first_row_similarity)
-  lastname_similarity <- c(lastname_similarity, last_row_similarity)
+    return(data.frame(firstname_similarity = first_row_similarity, lastname_similarity = last_row_similarity))
+}
+
+# Split the data into chunks of 50,000 rows
+chunk_size <- 50000
+chunks <- split(nsf_to_authors, ceiling(seq_len(nrow(nsf_to_authors)) / chunk_size))
+
+# Load the furrr package for parallel processing
+plan(multisession)
 
-  # Increment row count
-  row_count <- row_count + 1
+# Initialize variables for progress tracking
+total_chunks <- length(chunks)
+processed_chunks <- 0
+
+# Process and save each chunk as individual CSV files
+for (i in seq_along(chunks)) {
+  chunk <- chunks[[i]]
+  
+  # Calculate similarities for the whole chunk at once (stringsim is vectorized)
+  row_similarities <- fct_similarity(chunk) %>%
+    mutate(id = row_number())
+  
+  # Filter rows that meet the threshold criteria
+  chunk <- chunk %>%
+    mutate(id = row_number()) %>%
+    left_join(row_similarities, by = "id") %>%
+    filter(firstname_similarity >= threshold & lastname_similarity >= threshold) %>%
+    select(GrantID, AuthorId, Position, mag_firstname, nsf_firstname, firstname_similarity, mag_lastname, nsf_lastname, lastname_similarity) %>%
+    distinct()
   
-  # Progress after each 500,000th row
-  if (row_count %% 50 == 0) {
-    # Calculate elapsed time
-    elapsed_time <- Sys.time() - start_time 
-    elapsed_time <- as.numeric(elapsed_time)
-    
-    # Calculate percentage of data processed
-    percent_processed <- (row_count / nrow(nsf_to_authors)) * 100
-    
-    # Some information
+  
+  # Define the output file path
+  output_file <- file.path("/mnt/ssd/chunks_nsf_links", paste0("chunk_", i, ".csv"))
+  
+  # Write the chunk to a CSV file
+  write.csv(chunk, file = output_file, row.names = FALSE)
+  
+  # Update progress
+  processed_chunks <- processed_chunks + 1
+  percent_processed <- (processed_chunks / total_chunks) * 100
+  elapsed_time <- as.numeric(Sys.time() - start_time)
+
+  # Convert elapsed time to minutes and potentially hours
+  elapsed_minutes <- elapsed_time / 60
+  if (elapsed_minutes >= 60) {
+    elapsed_hours <- floor(elapsed_minutes / 60)
+    elapsed_minutes <- elapsed_minutes %% 60
+
+  # Display progress information
     cat(sprintf(
-      "Processed %d rows (%.2f%%) in %2.f minutes.\n",
-      row_count, 
-      percent_processed, 
-      elapsed_time
+      "Processed %d out of %d chunks (%.2f%%) in %d hours and %.2f minutes.\n",
+      processed_chunks, total_chunks, percent_processed, elapsed_hours, elapsed_minutes
+    ))
+  } else {
+    cat(sprintf(
+      "Processed %d out of %d chunks (%.2f%%) in %.2f minutes.\n",
+      processed_chunks, total_chunks, percent_processed, elapsed_minutes
     ))
   }
 }
 
-elapsed_time <- Sys.time() - start_time
-elapsed_time <- as.numeric(elapsed_time)
-
-percent_processed <- (row_count / nrow(nsf_to_authors)) * 100
-cat(sprintf(
-  "Processed all rows (%.2f%%) in %2.f minutes.\n",
-  percent_processed, 
-  elapsed_time
-))
-
-# Assign the calculated distances to a new column in data frame
-nsf_to_authors$firstname_similarity <- firstname_similarity
-nsf_to_authors$lastname_similarity <- lastname_similarity
-
-# Filter observations where the names are above the threshold: threshold seemed reasonable as it allows for a single typo
-similar_names <- nsf_to_authors %>%
-  filter(firstname_similarity >= threshold & lastname_similarity >= threshold)
-
-# drop unnecessary variables and drop duplicates
-df <- similar_names %>%
-  select(GrantID, AuthorId, Position, firstname_similarity, lastname_similarity) %>%
-  distinct()
-
-# Write table to db:
-cat("Starting data upload to the database...\n")
-dbWriteTable(con, name = "links_nsf_mag2", value = df, overwrite = TRUE)
-cat("Data upload to the database is complete.\n")
+# Clean up the furrr plan
+plan(NULL)
 
 # Some info
 final_elapsed_time <- Sys.time() - start_time
-elapsed_time <- as.numeric(elapsed_time)
+final_elapsed_time <- as.numeric(final_elapsed_time)
+
+# Convert elapsed time to minutes and potentially hours
+final_elapsed_minutes <- final_elapsed_time / 60
+if (final_elapsed_minutes >= 60) {
+  final_elapsed_hours <- floor(final_elapsed_minutes / 60)
+  final_elapsed_minutes <- final_elapsed_minutes %% 60
+  
+  # Display progress information
+  cat(sprintf(
+    "Complete. Total elapsed time: %d hours and %.2f minutes.\n",
+    final_elapsed_hours, final_elapsed_minutes
+  ))
+} else {
+  cat(sprintf(
+    "Complete. Total elapsed time: %.2f minutes.\n",
+    elapsed_minutes
+  ))
+}
 
-cat(sprintf(
-  "Complete. Total time elapsed: %2.f minutes.\n",
-  elapsed_time
-))
 
 # close connection to db
 DBI::dbDisconnect(con)

From 1a93552426d4e644b04852c6f4b14e807cb3a598 Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Mon, 2 Oct 2023 14:48:12 +0000
Subject: [PATCH 10/14] Changed restriction for name similarity

---
 .../main/prep_nsf/link_scinetnsf_to_mag.R     | 50 ++++++++-----------
 1 file changed, 20 insertions(+), 30 deletions(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index c167eb3..845cf4c 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -43,6 +43,7 @@ NSF_to_Authors <- tbl(con, sql("
                                "))
 
 nsf_to_authors <- collect(NSF_to_Authors)
+
 nsf_to_authors <- nsf_to_authors %>%
   filter(!is.na(PIFullName) & !is.na(NormalizedName))
 cat("Loaded dataset. \n")
@@ -66,7 +67,7 @@ nsf_to_authors <- nsf_to_authors %>%
                      word(PIFullName, 2) != word(PIFullName, -1),
                      word(PIFullName, 2), NA_character_)
   )
-
+cat("Separated names in dataset. \n")
 ### Compare name similarity
 # Set a threshold for similarity
 threshold <- 0.7
@@ -101,13 +102,17 @@ chunk_size <- 50000
 chunks <- split(nsf_to_authors, ceiling(seq_len(nrow(nsf_to_authors)) / chunk_size))
 
 # Load the furrr package for parallel processing
-plan(multisession)
+plan(multisession, workers=16)
 
 # Initialize variables for progress tracking
 total_chunks <- length(chunks)
 processed_chunks <- 0
 
 # Process and save each chunk as individual CSV files
+ time <- Sys.time()  # moment chunk processing begins (not the script start)
+ cat(sprintf(
+  "Start processing %d chunks at %s \n", total_chunks, time))
+
 for (i in seq_along(chunks)) {
   chunk <- chunks[[i]]
   
@@ -119,13 +124,19 @@ for (i in seq_along(chunks)) {
   chunk <- chunk %>%
     mutate(id = row_number()) %>%
     left_join(row_similarities, by = "id") %>%
-    filter(firstname_similarity >= threshold & lastname_similarity >= threshold) %>%
+    filter(
+      (firstname_similarity >= threshold & lastname_similarity >= threshold) | (lastname_similarity == 1.0 & substr(mag_firstname, 1, 1) == substr(nsf_firstname, 1, 1)) ) %>%
     select(GrantID, AuthorId, Position, mag_firstname, nsf_firstname, firstname_similarity, mag_lastname, nsf_lastname, lastname_similarity) %>%
     distinct()
-  
+    
   
   # Define the output file path
   output_file <- file.path("/mnt/ssd/chunks_nsf_links", paste0("chunk_", i, ".csv"))
+
+# Remove the output file if it exists
+if (file.exists(output_file)) {
+  file.remove(output_file)
+}
   
   # Write the chunk to a CSV file
   write.csv(chunk, file = output_file, row.names = FALSE)
@@ -135,23 +146,12 @@ for (i in seq_along(chunks)) {
   percent_processed <- (processed_chunks / total_chunks) * 100
   elapsed_time <- as.numeric(Sys.time() - start_time)
 
-  # Convert elapsed time to minutes and potentially hours
-  elapsed_minutes <- elapsed_time / 60
-  if (elapsed_minutes >= 60) {
-    elapsed_hours <- floor(elapsed_minutes / 60)
-    elapsed_minutes <- elapsed_minutes %% 60
 
   # Display progress information
     cat(sprintf(
-      "Processed %d out of %d chunks (%.2f%%) in %d hours and %.2f minutes.\n",
-      processed_chunks, total_chunks, percent_processed, elapsed_hours, elapsed_minutes
+      "Processed %d out of %d chunks (%.2f%%) in %.2f.\n",
+      processed_chunks, total_chunks, percent_processed, elapsed_time
     ))
-  } else {
-    cat(sprintf(
-      "Processed %d out of %d chunks (%.2f%%) in %.2f minutes.\n",
-      processed_chunks, total_chunks, percent_processed, elapsed_minutes
-    ))
-  }
 }
 
 # Clean up the furrr plan
@@ -162,22 +162,12 @@ final_elapsed_time <- Sys.time() - start_time
 final_elapsed_time <- as.numeric(final_elapsed_time)
 
 # Convert elapsed time to minutes and potentially hours
-final_elapsed_minutes <- final_elapsed_time / 60
-if (final_elapsed_minutes >= 60) {
-  final_elapsed_hours <- floor(final_elapsed_minutes / 60)
-  final_elapsed_minutes <- final_elapsed_minutes %% 60
-  
+
   # Display progress information
   cat(sprintf(
-    "Complete. Total elapsed time: %d hours and %.2f minutes.\n",
-    final_elapsed_hours, final_elapsed_minutes
+    "Complete. Total elapsed time: %.2f.\n",
+    final_elapsed_time
   ))
-} else {
-  cat(sprintf(
-    "Complete. Total elapsed time: %.2f minutes.\n",
-    elapsed_minutes
-  ))
-}
 
 
 # close connection to db

From 63c89460ccc17a82b6a5120c074a51f5bc3c429d Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Tue, 3 Oct 2023 15:00:37 +0000
Subject: [PATCH 11/14] Loaded chunk-csv files into one file

---
 .../main/prep_nsf/link_scinetnsf_to_mag.R     | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index 845cf4c..7dae5fe 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -172,4 +172,24 @@ final_elapsed_time <- as.numeric(final_elapsed_time)
 
 # close connection to db
 DBI::dbDisconnect(con)
-cat("Disconnected from db.\n")
\ No newline at end of file
+cat("Disconnected from db.\n")
+
+# Apend tables together
+# Initialize an empty data frame to store the appended data
+links_nsf_mag <- data.frame()
+
+# Loop through the file names and append the data
+for (i in 1:1072) {
+  # Construct the file path for each chunk
+  
+  # Load the CSV file
+  chunk_data <- read.csv(output_file, header = TRUE)
+  
+  # Append the chunk data to the appended_data data frame
+  links_nsf_mag <- rbind(links_nsf_mag, chunk_data)
+}
+links_nsf_mag <- links_nsf_mag%>%
+  distinct()
+
+# Write the appended data to a single CSV file
+#write.csv(links_nsf_mag, "links_nsf_mag.csv", row.names = FALSE)

From 6ba244da618225e9b03d7b2039cb5770dff92b0b Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Fri, 6 Oct 2023 18:49:57 +0000
Subject: [PATCH 12/14] added upload to db

---
 .../main/prep_nsf/link_scinetnsf_to_mag.R     | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index 7dae5fe..aad281a 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -170,9 +170,6 @@ final_elapsed_time <- as.numeric(final_elapsed_time)
   ))
 
 
-# close connection to db
-DBI::dbDisconnect(con)
-cat("Disconnected from db.\n")
 
 # Apend tables together
 # Initialize an empty data frame to store the appended data
@@ -183,13 +180,26 @@ for (i in 1:1072) {
   # Construct the file path for each chunk
   
   # Load the CSV file
-  chunk_data <- read.csv(output_file, header = TRUE)
+  chunk_data <- read.csv(paste0("/mnt/ssd/chunks_nsf_links/chunk_", i, ".csv"), header = TRUE, colClasses = c(GrantID = "character"))
   
   # Append the chunk data to the appended_data data frame
   links_nsf_mag <- rbind(links_nsf_mag, chunk_data)
 }
-links_nsf_mag <- links_nsf_mag%>%
+
+# drop unnecessary variables and drop duplicates
+links_nsf_mag <- links_nsf_mag %>%
+  select(GrantID, AuthorId, Position, firstname_similarity, lastname_similarity) %>%
   distinct()
 
+
 # Write the appended data to a single CSV file
 #write.csv(links_nsf_mag, "links_nsf_mag.csv", row.names = FALSE)
+
+
+# Write table to db:
+dbWriteTable(con, name = "links_nsf_mag", value = links_nsf_mag, overwrite = TRUE)
+cat("Uploaded to db.\n")
+
+# close connection to db
+DBI::dbDisconnect(con)
+cat("Disconnected from db.\n")
\ No newline at end of file

From 94bda097036dd8fbb8bf651a6fbac8dbbd4e3e28 Mon Sep 17 00:00:00 2001
From: Christoph <christophhedtrich@gmail.com>
Date: Mon, 9 Oct 2023 01:25:27 +0000
Subject: [PATCH 13/14] handle chunks correctly, change AuthorId to integer64
 for correct writing to db

---
 .../main/prep_nsf/link_scinetnsf_to_mag.R      | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index aad281a..97eedc1 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -7,7 +7,7 @@
 start_time <- Sys.time()
 cat(sprintf("Started at %s \n", start_time))
 
-packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist", "purrr", "furrr")
+packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist", "purrr", "furrr", "bit64")
 lapply(packages, library, character.only = TRUE)
 
 datapath <- "/mnt/ssd/"
@@ -171,17 +171,19 @@ final_elapsed_time <- as.numeric(final_elapsed_time)
 
 
 
-# Apend tables together
+# Append tables together
 # Initialize an empty data frame to store the appended data
 links_nsf_mag <- data.frame()
 
+chunks <- list.files("/mnt/ssd/chunks_nsf_links/", pattern = "\\.csv$", full.names = TRUE)  # pattern is a regex, not a glob
 # Loop through the file names and append the data
-for (i in 1:1072) {
-  # Construct the file path for each chunk
-  
+for (chunk in chunks) {
   # Load the CSV file
-  chunk_data <- read.csv(paste0("/mnt/ssd/chunks_nsf_links/chunk_", i, ".csv"), header = TRUE, colClasses = c(GrantID = "character"))
-  
+  chunk_data <- read.csv(chunk, 
+                         header = TRUE, 
+                         colClasses = c(GrantID = "character", AuthorId = "character")
+                         ) 
+  chunk_data <- chunk_data %>% mutate(AuthorId = as.integer64(AuthorId))
   # Append the chunk data to the appended_data data frame
   links_nsf_mag <- rbind(links_nsf_mag, chunk_data)
 }
@@ -202,4 +204,4 @@ cat("Uploaded to db.\n")
 
 # close connection to db
 DBI::dbDisconnect(con)
-cat("Disconnected from db.\n")
\ No newline at end of file
+cat("Disconnected from db.\n")

From ac571fc1c5a026dba8d12451d4ceee4fdc94b28d Mon Sep 17 00:00:00 2001
From: mona <m.dapfer@outlook.de>
Date: Mon, 9 Oct 2023 14:01:03 +0000
Subject: [PATCH 14/14] dropped a line which was not true anymore

---
 src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
index 97eedc1..15fe32d 100644
--- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -1,6 +1,5 @@
 # Link SciSciNet_Links_NSF table with Paper_Author_Affiliations, Authors, and NSF_Investigator 
 # Keeps only those with link between NSF grant and author ID.
-# only those links with a similar name (similarity >=0.8) are loaded into db
 # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder
 
 # Initialize variables for counting rows and timestamp