From 3c91d73f3e1bbc622ac2361ee92a5d8d7f4042c5 Mon Sep 17 00:00:00 2001 From: mona Date: Fri, 22 Sep 2023 08:12:19 +0000 Subject: [PATCH 01/14] Download SciSciNet_NSF and link to mag authors --- .../main/prep_nsf/link_scinetnsf_to_mag.R | 58 +++++++++++ .../main/prep_nsf/scinet_data_to_db.py | 97 +++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R create mode 100644 src/dataprep/main/prep_nsf/scinet_data_to_db.py diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R new file mode 100644 index 0000000..7170733 --- /dev/null +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -0,0 +1,58 @@ +# Link SciSciNet_Links_NSF table with Paper_Author_Affiliations, Authors, and NSF_Investigator +# Keeps only those with link between NSF grant and author ID. +# Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder + +packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "ggplot2", "stringdist", "DBI") +lapply(packages, library, character.only = TRUE) + +datapath <- "/mnt/ssd/" +db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite") +sciscinet_path <- paste0(datapath,"sciscinet_data/") + + +#filepath_nsf=paste0(sciscinet_path,"SciSciNet_Link_NSF.tsv") + +con <- DBI::dbConnect(RSQLite::SQLite(), db_file) +cat("The database connection is: \n") +src_dbi(con) + +# Create table with all links between NSF-grant and authors via papers + +NSF_to_Authors <- tbl(con, sql(" + select a. PaperID, a.Type, a.GrantID, b.AuthorId, b.OriginalAuthor + ,c.NormalizedName, Position, FirstName, LastName + from scinet as a + inner join ( + select PaperId AS PaperID, AuthorId, OriginalAuthor + from PaperAuthorAffiliations + )b + using (PaperID) + inner join ( + select AuthorId, NormalizedName + from Authors + ) c + using (AuthorId) + inner join ( + select GrantID, Position, FirstName, LastName + from NSF_Investigator + ) d + using (GrantID) + ")) + +nsf_to_authors <- collect(NSF_to_Authors) + +# Split the "NormalizedName" column into "nsf_firstname" and "nsf_lastname" columns +nsf_to_authors <- nsf_to_authors %>% + separate(NormalizedName, into = c("nsf_firstname", "nsf_lastname"), sep = " ", extra = "merge") + +nsf_author_links <- subset(nsf_to_authors, select = -c(OriginalAuthor, NormalizedName, Type, PaperID)) %>% + mutate(name_similarity = stringdist::stringdistmatrix(paste(nsf_firstname, nsf_lastname, sep = " "), paste(FirstName, LastName, sep = " "))) + +# Set a threshold for similarity (e.g., 0.8 means 80% similarity) +threshold <- 0.8 + +# Filter observations where the names are similar or above the threshold +similar_names <- nsf_author_links %>% + filter(name_similarity >= threshold) + +DBI::dbDisconnect(con) \ No newline at end of file diff --git a/src/dataprep/main/prep_nsf/scinet_data_to_db.py b/src/dataprep/main/prep_nsf/scinet_data_to_db.py new file mode 100644 index 0000000..a5990b9 --- /dev/null +++ b/src/dataprep/main/prep_nsf/scinet_data_to_db.py @@ -0,0 +1,97 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# %% +""" +Download SciSciNet table SciSciNet_Link_NSF with links between NSF-grants and papers +Upload into db +link to Paper_Author_Affiliations, Authors, NSF_Investigator in R file: test_sciscinet_data.R in same folder + + +Create table in database: +- SciSciNet_Link_NSF + +SciSciNet_Link_NSF schema is: + +GrantID INTEGER, PaperID INTEGER + + +unique index on Grantid and PaperID (multiple paperIDs per GrantID) +""" + +import subprocess +import sqlite3 as sqlite +import argparse +import os +from os import listdir +from os.path import isfile, join +import pandas as pd +import numpy as np +import re +import sys +import requests + +sys.path.append('/home/mona/mag_sample/src/dataprep/') + +from helpers.variables import db_file, datapath, databasepath +from helpers.functions import analyze_db + +scinet_path = os.path.join(datapath, "sciscinet_data/") +filepath_nsf = os.path.join(scinet_path, "SciSciNet_Link_NSF.tsv") + + + +# Download file + +url_nsf = "https://ndownloader.figstatic.com/files/36139242" +response = requests.get(url_nsf) +with open(filepath_nsf, "wb") as file: + file.write(response.content) +print("Downloaded data") + + +# ## Read files in loop and dump to db + +def load_scinet(filepath): + df = pd.read_csv(filepath, + sep="\t", + names=["NSF_Award_Number", "PaperID", "Type", "Diff_ZScore"]) + df.drop_duplicates(inplace=True) + + # Create the GrantID column by removing non-numeric characters and formatting + + df['GrantID'] = df['NSF_Award_Number'].str.extract(r'-(\d+)') + + return df + +files = [f for f in listdir(scinet_path) if isfile(join(scinet_path, f))] + + +con = sqlite.connect(database = db_file, isolation_level= None) +with con: + for (i,f) in enumerate(files): + df = load_scinet(scinet_path+f) + #print(df.head()) + if i==0: + if_exists_opt="replace" + else: + if_exists_opt="append" + + df.to_sql("scinet", + con=con, + if_exists=if_exists_opt, + index=False, + schema= """NSF_Award_Number TEXT + , PaperID INTEGER + , Type TEXT + , Diff_ZScore NUMERIC + , GrantID TEXT + """ + ) + + # Make index and clean up + con.execute("CREATE UNIQUE INDEX idx_scinet ON scinet (GrantID ASC, PaperID ASC)") + + analyze_db(con) + +con.close() \ No newline at end of file From b2c08e00727dd73650c4f77feb6a1b2557fc4367 Mon Sep 17 00:00:00 2001 From: mona Date: Mon, 25 Sep 2023 10:11:12 +0000 Subject: [PATCH 02/14] Some improvements in loading to db, linking --- .../main/prep_nsf/link_scinetnsf_to_mag.R | 50 +++++++++++++++---- .../main/prep_nsf/scinet_data_to_db.py | 20 ++++---- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index 7170733..e2285c7 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -2,12 +2,17 @@ # Keeps only those with link between NSF grant and author ID. # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder -packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "ggplot2", "stringdist", "DBI") + +# Note: Not sure if calculating string distance now works correctly + + + +packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist") lapply(packages, library, character.only = TRUE) datapath <- "/mnt/ssd/" db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite") -sciscinet_path <- paste0(datapath,"sciscinet_data/") +#sciscinet_path <- paste0(datapath,"sciscinet_data/") #filepath_nsf=paste0(sciscinet_path,"SciSciNet_Link_NSF.tsv") @@ -21,7 +26,7 @@ src_dbi(con) NSF_to_Authors <- tbl(con, sql(" select a. PaperID, a.Type, a.GrantID, b.AuthorId, b.OriginalAuthor ,c.NormalizedName, Position, FirstName, LastName - from scinet as a + from scinet_links_nsf as a inner join ( select PaperId AS PaperID, AuthorId, OriginalAuthor from PaperAuthorAffiliations @@ -41,18 +46,41 @@ NSF_to_Authors <- tbl(con, sql(" nsf_to_authors <- collect(NSF_to_Authors) -# Split the "NormalizedName" column into "nsf_firstname" and "nsf_lastname" columns -nsf_to_authors <- nsf_to_authors %>% - separate(NormalizedName, into = c("nsf_firstname", "nsf_lastname"), sep = " ", extra = "merge") +# Create a variable with the full name from mag +nsf_to_authors$mag_name <- paste(nsf_to_authors$FirstName, nsf_to_authors$LastName, sep = " ") -nsf_author_links <- subset(nsf_to_authors, select = -c(OriginalAuthor, NormalizedName, Type, PaperID)) %>% - mutate(name_similarity = stringdist::stringdistmatrix(paste(nsf_firstname, nsf_lastname, sep = " "), paste(FirstName, LastName, sep = " "))) +## Still running, not sure if running correctly from here -# Set a threshold for similarity (e.g., 0.8 means 80% similarity) +### Compare name similarity +# Set a threshold for similarity threshold <- 0.8 -# Filter observations where the names are similar or above the threshold -similar_names <- nsf_author_links %>% +# Calculate string similarity for each row and add a new column +name_similarity <- numeric(0) + +# Iterate through rows and calculate string distances +for (i in 1:nrow(nsf_to_authors)) { + mag_name <- nsf_to_authors$mag_name[i] + NormalizedName <- nsf_to_authors$NormalizedName[i] + + # Calculate string distance for this row + row_similarity <- stringdistmatrix( + mag_name, + NormalizedName + ) + + # Append the calculated distance to the results vector + name_similarity <- c(name_similarity, row_similarity) +} + +# Assign the calculated distances to a new column in data frame +nsf_to_authors$name_similarity <- name_similarity + +# Filter observations where the names are above the threshold +similar_names <- nsf_to_authors %>% filter(name_similarity >= threshold) +# To do: write to db (keep only necessary variables: GrantID, AuthorID, Position, Paper ID(?)) + +# close connection to db DBI::dbDisconnect(con) \ No newline at end of file diff --git a/src/dataprep/main/prep_nsf/scinet_data_to_db.py b/src/dataprep/main/prep_nsf/scinet_data_to_db.py index a5990b9..61986f0 100644 --- a/src/dataprep/main/prep_nsf/scinet_data_to_db.py +++ b/src/dataprep/main/prep_nsf/scinet_data_to_db.py @@ -9,14 +9,14 @@ Create table in database: -- SciSciNet_Link_NSF +- scinet_links_nsf SciSciNet_Link_NSF schema is: -GrantID INTEGER, PaperID INTEGER +GrantID TEXT, PaperID INTEGER, Type TEXT -unique index on Grantid and PaperID (multiple paperIDs per GrantID) +unique index on Grantid and PaperID (multiple PaperIDs per GrantID) """ import subprocess @@ -55,12 +55,14 @@ def load_scinet(filepath): df = pd.read_csv(filepath, sep="\t", - names=["NSF_Award_Number", "PaperID", "Type", "Diff_ZScore"]) + names=["NSF_Award_Number", "PaperID", "Type", "Diff_ZScore"], + skiprows=1) df.drop_duplicates(inplace=True) # Create the GrantID column by removing non-numeric characters and formatting df['GrantID'] = df['NSF_Award_Number'].str.extract(r'-(\d+)') + df = df.drop(columns=['NSF_Award_Number', 'Diff_ZScore']) return df @@ -77,21 +79,19 @@ def load_scinet(filepath): else: if_exists_opt="append" - df.to_sql("scinet", + df.to_sql("scinet_links_nsf", con=con, if_exists=if_exists_opt, index=False, - schema= """NSF_Award_Number TEXT - , PaperID INTEGER + schema= """ PaperID INTEGER , Type TEXT - , Diff_ZScore NUMERIC , GrantID TEXT """ ) # Make index and clean up - con.execute("CREATE UNIQUE INDEX idx_scinet ON scinet (GrantID ASC, PaperID ASC)") + con.execute("CREATE UNIQUE INDEX idx_scinet_grantpaper ON scinet_links_nsf (GrantID ASC, PaperID ASC)") analyze_db(con) -con.close() \ No newline at end of file +con.close() From 5a17e7fad574142dd4d9372226f747a13662f21a Mon Sep 17 00:00:00 2001 From: mona Date: Mon, 25 Sep 2023 11:24:37 +0000 Subject: [PATCH 03/14] Updated code for name similarity btw nsf and mag --- src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index e2285c7..c768a9d 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -58,13 +58,14 @@ threshold <- 0.8 # Calculate string similarity for each row and add a new column name_similarity <- numeric(0) + # Iterate through rows and calculate string distances for (i in 1:nrow(nsf_to_authors)) { mag_name <- nsf_to_authors$mag_name[i] NormalizedName <- nsf_to_authors$NormalizedName[i] # Calculate string distance for this row - row_similarity <- stringdistmatrix( + row_similarity <- stringsim( mag_name, NormalizedName ) @@ -80,7 +81,13 @@ nsf_to_authors$name_similarity <- name_similarity similar_names <- nsf_to_authors %>% filter(name_similarity >= threshold) -# To do: write to db (keep only necessary variables: GrantID, AuthorID, Position, Paper ID(?)) +# drop unnecessary variables +df <- similar_names %>% + select(GrantID, AuthorId, Position) %>% + distinct() + +# Write table to db: +dbWriteTable(con, name = "links_nsf_mag", value = df, overwrite = TRUE) # close connection to db DBI::dbDisconnect(con) \ No newline at end of file From 64e10d92600d20118e73aa4e74c6229a1d60ba86 Mon Sep 17 00:00:00 2001 From: mona Date: Tue, 26 Sep 2023 07:19:10 +0000 Subject: [PATCH 04/14] Final code to download nsf links and upload to db --- .../main/prep_nsf/scinet_data_to_db.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/dataprep/main/prep_nsf/scinet_data_to_db.py b/src/dataprep/main/prep_nsf/scinet_data_to_db.py index 61986f0..b42fc31 100644 --- a/src/dataprep/main/prep_nsf/scinet_data_to_db.py +++ b/src/dataprep/main/prep_nsf/scinet_data_to_db.py @@ -13,7 +13,7 @@ SciSciNet_Link_NSF schema is: -GrantID TEXT, PaperID INTEGER, Type TEXT +GrantID TEXT, PaperID INTEGER, Type TEXT, Diff_ZScore NUMERIC unique index on Grantid and PaperID (multiple PaperIDs per GrantID) @@ -28,10 +28,10 @@ import pandas as pd import numpy as np import re -import sys -import requests +#import sys +#import requests -sys.path.append('/home/mona/mag_sample/src/dataprep/') +#sys.path.append('/home/mona/mag_sample/src/dataprep/') from helpers.variables import db_file, datapath, databasepath from helpers.functions import analyze_db @@ -50,8 +50,8 @@ print("Downloaded data") -# ## Read files in loop and dump to db - +# ## Read file and dump to db +# remove first row as it only contains column names that can't be overwritten def load_scinet(filepath): df = pd.read_csv(filepath, sep="\t", @@ -59,15 +59,22 @@ def load_scinet(filepath): skiprows=1) df.drop_duplicates(inplace=True) - # Create the GrantID column by removing non-numeric characters and formatting + # Create a GrantID variable in same format as previously used by removing non-numeric characters from NSF_Award_Number + # drop NSF_Award_Number as we only need GrantID df['GrantID'] = df['NSF_Award_Number'].str.extract(r'-(\d+)') - df = df.drop(columns=['NSF_Award_Number', 'Diff_ZScore']) + df = df.drop(columns=['NSF_Award_Number']) + + #Check that all rows will be uploaded into db: in raw file 1309518 rows + num_observations = df.shape[0] + print(num_observations, "rows of 1309518 rows in the raw file will be loaded into the db") return df files = [f for f in listdir(scinet_path) if isfile(join(scinet_path, f))] + + con = sqlite.connect(database = db_file, isolation_level= None) with con: @@ -86,10 +93,12 @@ def load_scinet(filepath): schema= """ PaperID INTEGER , Type TEXT , GrantID TEXT + , Diff_ZScore NUMERIC """ ) - # Make index and clean up + # Make index and clean up: + # Serves as check that only unique observations part of the dataframe con.execute("CREATE UNIQUE INDEX idx_scinet_grantpaper ON scinet_links_nsf (GrantID ASC, PaperID ASC)") analyze_db(con) From 5ac96321a99e5b48013bd0f1faa7c682f91f5f11 Mon Sep 17 00:00:00 2001 From: mona Date: Tue, 26 Sep 2023 15:10:59 +0000 Subject: [PATCH 05/14] Final code to link mag and nsf + upload to db --- .../main/prep_nsf/link_scinetnsf_to_mag.R | 99 ++++++++++++------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index c768a9d..2e5cfff 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -1,36 +1,29 @@ # Link SciSciNet_Links_NSF table with Paper_Author_Affiliations, Authors, and NSF_Investigator # Keeps only those with link between NSF grant and author ID. +# only those links with a similar name (similarity >=0.8) are loaded into db # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder - -# Note: Not sure if calculating string distance now works correctly - - - packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist") lapply(packages, library, character.only = TRUE) datapath <- "/mnt/ssd/" db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite") -#sciscinet_path <- paste0(datapath,"sciscinet_data/") - -#filepath_nsf=paste0(sciscinet_path,"SciSciNet_Link_NSF.tsv") con <- DBI::dbConnect(RSQLite::SQLite(), db_file) cat("The database connection is: \n") src_dbi(con) -# Create table with all links between NSF-grant and authors via papers +# Create table with all links between NSF-grant and authors via papers NSF_to_Authors <- tbl(con, sql(" - select a. PaperID, a.Type, a.GrantID, b.AuthorId, b.OriginalAuthor - ,c.NormalizedName, Position, FirstName, LastName + select a. PaperID, a.Type, a.GrantID, b.AuthorId + ,c.NormalizedName, d.Position, d.PIFullName from scinet_links_nsf as a inner join ( - select PaperId AS PaperID, AuthorId, OriginalAuthor - from PaperAuthorAffiliations - )b + select PaperId AS PaperID, AuthorId + from PaperAuthorAffiliations + )b using (PaperID) inner join ( select AuthorId, NormalizedName @@ -38,55 +31,89 @@ NSF_to_Authors <- tbl(con, sql(" ) c using (AuthorId) inner join ( - select GrantID, Position, FirstName, LastName + select GrantID, Position, PIFullName from NSF_Investigator - ) d + ) d using (GrantID) ")) nsf_to_authors <- collect(NSF_to_Authors) -# Create a variable with the full name from mag -nsf_to_authors$mag_name <- paste(nsf_to_authors$FirstName, nsf_to_authors$LastName, sep = " ") +# Create separate variables for first and last name for both nsf and mag names +nsf_to_authors <- nsf_to_authors %>% + mutate( + mag_firstname = word(NormalizedName, 1), + mag_lastname = word(NormalizedName, -1), + mag_middlename = ifelse(str_count(NormalizedName, "\\s+") >= 2 & + word(NormalizedName, 2) != word(NormalizedName, -1), + word(NormalizedName, 2), NA_character_) + ) + + +nsf_to_authors <- nsf_to_authors %>% + mutate( + nsf_firstname = word(PIFullName, 1), + nsf_lastname = word(PIFullName, -1), + nsf_middlename = ifelse(str_count(PIFullName, "\\s+") >= 2 & + word(PIFullName, 2) != word(PIFullName, -1), + word(PIFullName, 2), NA_character_) + ) + -## Still running, not sure if running correctly from here ### Compare name similarity # Set a threshold for similarity threshold <- 0.8 -# Calculate string similarity for each row and add a new column -name_similarity <- numeric(0) +### Test several distances + +# Calculate string similarity for first and last names by row and add a new column +firstname_similarity <- numeric(0) +lastname_similarity <- numeric(0) -# Iterate through rows and calculate string distances +# Iterate through rows and calculate string distances for first and last names separately for (i in 1:nrow(nsf_to_authors)) { - mag_name <- nsf_to_authors$mag_name[i] - NormalizedName <- nsf_to_authors$NormalizedName[i] + mag_firstname <- nsf_to_authors$mag_firstname[i] + nsf_firstname <- nsf_to_authors$nsf_firstname[i] + + # Calculate string distance for first name by row using Optimal String Alignment (default) + first_row_similarity <- stringsim( + mag_firstname, + nsf_firstname, + method="osa" ) + + # Append the calculated distance to the results vector for first name + firstname_similarity <- c(firstname_similarity, first_row_similarity) + + # Calculate string distance for last name by row + mag_lastname <- nsf_to_authors$mag_lastname[i] + nsf_lastname <- nsf_to_authors$nsf_lastname[i] - # Calculate string distance for this row - row_similarity <- stringsim( - mag_name, - NormalizedName + last_row_similarity <- stringsim( + mag_lastname, + nsf_lastname, + method="osa" ) - # Append the calculated distance to the results vector - name_similarity <- c(name_similarity, row_similarity) + # Append the calculated distance to the results vector for last name + lastname_similarity <- c(lastname_similarity, last_row_similarity) } # Assign the calculated distances to a new column in data frame -nsf_to_authors$name_similarity <- name_similarity +nsf_to_authors$firstname_similarity <- firstname_similarity +nsf_to_authors$lastname_similarity <- lastname_similarity -# Filter observations where the names are above the threshold +# Filter observations where the names are above the threshold: threshold seemed reasonable as it allows for a single typo similar_names <- nsf_to_authors %>% - filter(name_similarity >= threshold) + filter(firstname_similarity >= threshold & lastname_similarity >= threshold) -# drop unnecessary variables +# drop unnecessary variables and drop duplicates df <- similar_names %>% - select(GrantID, AuthorId, Position) %>% + select(GrantID, AuthorId, Position, firstname_similarity, lastname_similarity) %>% distinct() -# Write table to db: +# Write table to db: dbWriteTable(con, name = "links_nsf_mag", value = df, overwrite = TRUE) # close connection to db From ae370fcda42a52be7e293fb195335eb0195d6f29 Mon Sep 17 00:00:00 2001 From: mona Date: Tue, 26 Sep 2023 15:11:44 +0000 Subject: [PATCH 06/14] Comparison of methods for stringdistance --- .../main/prep_nsf/test_name_similarity.csv | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 src/dataprep/main/prep_nsf/test_name_similarity.csv diff --git a/src/dataprep/main/prep_nsf/test_name_similarity.csv b/src/dataprep/main/prep_nsf/test_name_similarity.csv new file mode 100644 index 0000000..98f68a1 --- /dev/null +++ b/src/dataprep/main/prep_nsf/test_name_similarity.csv @@ -0,0 +1,101 @@ +"NormalizedName","PIFullName","mag_firstname","mag_lastname","mag_middlename","nsf_firstname","nsf_lastname","nsf_middlename","firstname_similarity_osa","lastname_similarity_osa","firstname_similarity_lv","lastname_similarity_lv","firstname_similarity_dl","lastname_similarity_dl","firstname_similarity_lcs","lastname_similarity_lcs","firstname_similarity_qgram","lastname_similarity_qgram","firstname_similarity_cosine","lastname_similarity_cosine","firstname_similarity_jac","lastname_similarity_jac","firstname_similarity_jw0","lastname_similarity_jw0","firstname_similarity_jw.1","lastname_similarity_jw.1","firstname_similarity_jw.2","lastname_similarity_jw.2" +"jennifer fowler","jennifer w fowler","jennifer","fowler",NA,"jennifer","fowler","w",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"thomas colligan","jennifer w fowler","thomas","colligan",NA,"jennifer","fowler","w",0,0.25,0,0.25,0,0.25,0,0.285714285714286,0,0.285714285714286,0,0.387298334620742,0,0.181818181818182,0,0.527777777777778,0,0.527777777777778,0,0.527777777777778 +"jaxen godfrey","jennifer w fowler","jaxen","godfrey",NA,"jennifer","fowler","w",0.25,0.285714285714286,0.25,0.285714285714286,0.25,0.285714285714286,0.461538461538462,0.307692307692308,0.461538461538462,0.615384615384615,0.645497224367903,0.617213399848368,0.375,0.444444444444444,0.658333333333333,0.531746031746032,0.6925,0.531746031746032,0.726666666666667,0.531746031746032 +"carl spangrude","jennifer w fowler","carl","spangrude",NA,"jennifer","fowler","w",0,0.111111111111111,0,0.111111111111111,0,0.111111111111111,0.166666666666667,0.133333333333333,0.166666666666667,0.266666666666667,0.144337567297406,0.272165526975909,0.111111111111111,0.153846153846154,0,0.425925925925926,0,0.425925925925926,0,0.425925925925926 +"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"jiyou li","daqing wan","jiyou","li",NA,"daqing","wan",NA,0,0,0,0,0,0,0.181818181818182,0,0.181818181818182,0,0.182574185835055,0,0.1,0,0.455555555555555,0,0.455555555555555,0,0.455555555555555,0 +"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"krishna kaipa","daqing wan","krishna","kaipa",NA,"daqing","wan",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.461538461538462,0.25,0.462910049886276,0.436435780471985,0.3,0.166666666666667,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111 +"jun zhang","daqing wan","jun","zhang",NA,"daqing","wan",NA,0.166666666666667,0.4,0.166666666666667,0.4,0.166666666666667,0.4,0.222222222222222,0.5,0.222222222222222,0.5,0.235702260395516,0.516397779494322,0.125,0.333333333333333,0.5,0.688888888888889,0.5,0.688888888888889,0.5,0.688888888888889 +"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"jun zhang","daqing wan","jun","zhang",NA,"daqing","wan",NA,0.166666666666667,0.4,0.166666666666667,0.4,0.166666666666667,0.4,0.222222222222222,0.5,0.222222222222222,0.5,0.235702260395516,0.516397779494322,0.125,0.333333333333333,0.5,0.688888888888889,0.5,0.688888888888889,0.5,0.688888888888889 +"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"alicia marino","daqing wan","alicia","marino",NA,"daqing","wan",NA,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.333333333333333,0.444444444444444,0.333333333333333,0.444444444444444,0.516397779494322,0.471404520791032,0.25,0.285714285714286,0.555555555555555,0.666666666666667,0.555555555555555,0.666666666666667,0.555555555555555,0.666666666666667 +"angela robinson","daqing wan","angela","robinson",NA,"daqing","wan",NA,0,0.125,0,0.125,0,0.125,0.5,0.181818181818182,0.5,0.181818181818182,0.577350269189626,0.333333333333333,0.375,0.125,0.444444444444444,0.486111111111111,0.444444444444444,0.486111111111111,0.444444444444444,0.486111111111111 +"tim lai","daqing wan","tim","lai",NA,"daqing","wan",NA,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.222222222222222,0.333333333333333,0.222222222222222,0.333333333333333,0.235702260395516,0.333333333333333,0.125,0.2,0.5,0.555555555555555,0.5,0.555555555555555,0.5,0.555555555555555 +"nathan beckmann","nathan beckmann","nathan","beckmann",NA,"nathan","beckmann",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"brian c schwedock","nathan beckmann","brian","schwedock","c","nathan","beckmann",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.352941176470588,0.363636363636364,0.352941176470588,0.565685424949238,0.381385035698237,0.285714285714286,0.25,0.577777777777778,0.324074074074074,0.577777777777778,0.324074074074074,0.577777777777778,0.324074074074074 +"george yin","hongwei zhang","george","yin",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.461538461538462,0.25,0.461538461538462,0.25,0.597614304667197,0.258198889747161,0.375,0.142857142857143,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111 +"jayanthi rao","hongwei zhang","jayanthi","rao",NA,"hongwei","zhang",NA,0.125,0.2,0.125,0.2,0.125,0.2,0.266666666666667,0.25,0.4,0.25,0.358568582800318,0.258198889747161,0.272727272727273,0.142857142857143,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111 +"chuan li","hongwei zhang","chuan","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.333333333333333,0,0.333333333333333,0,0.338061701891406,0,0.2,0,0.561904761904762,0,0.561904761904762,0,0.561904761904762,0 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"le yi wang","hongwei zhang","le","wang","yi","hongwei","zhang",NA,0.142857142857143,0.6,0.142857142857143,0.6,0.142857142857143,0.6,0.222222222222222,0.666666666666667,0.222222222222222,0.666666666666667,0.267261241912424,0.670820393249937,0.125,0.5,0,0.783333333333333,0,0.783333333333333,0,0.783333333333333 +"george yin","hongwei zhang","george","yin",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.461538461538462,0.25,0.461538461538462,0.25,0.597614304667197,0.258198889747161,0.375,0.142857142857143,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111 +"jayanthi rao","hongwei zhang","jayanthi","rao",NA,"hongwei","zhang",NA,0.125,0.2,0.125,0.2,0.125,0.2,0.266666666666667,0.25,0.4,0.25,0.358568582800318,0.258198889747161,0.272727272727273,0.142857142857143,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111 +"chuan li","hongwei zhang","chuan","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.333333333333333,0,0.333333333333333,0,0.338061701891406,0,0.2,0,0.561904761904762,0,0.561904761904762,0,0.561904761904762,0 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"le yi wang","hongwei zhang","le","wang","yi","hongwei","zhang",NA,0.142857142857143,0.6,0.142857142857143,0.6,0.142857142857143,0.6,0.222222222222222,0.666666666666667,0.222222222222222,0.666666666666667,0.267261241912424,0.670820393249937,0.125,0.5,0,0.783333333333333,0,0.783333333333333,0,0.783333333333333 +"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0 +"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0 +"yang zheng","hongwei zhang","yang","zheng",NA,"hongwei","zhang",NA,0.285714285714286,0.8,0.285714285714286,0.8,0.285714285714286,0.8,0.363636363636364,0.8,0.363636363636364,0.8,0.377964473009227,0.8,0.222222222222222,0.666666666666667,0.595238095238095,0.866666666666667,0.595238095238095,0.893333333333333,0.595238095238095,0.92 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"jianqiang wang","hongwei zhang","jianqiang","wang",NA,"hongwei","zhang",NA,0.111111111111111,0.6,0.111111111111111,0.6,0.111111111111111,0.6,0.25,0.666666666666667,0.375,0.666666666666667,0.487950036474267,0.670820393249937,0.3,0.5,0.502645502645503,0.783333333333333,0.502645502645503,0.783333333333333,0.502645502645503,0.783333333333333 +"xiaohui qin","hongwei zhang","xiaohui","qin",NA,"hongwei","zhang",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.285714285714286,0.25,0.428571428571429,0.25,0.50395263067897,0.258198889747161,0.3,0.142857142857143,0.523809523809524,0.511111111111111,0.523809523809524,0.511111111111111,0.523809523809524,0.511111111111111 +"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0 +"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0 +"george yin","hongwei zhang","george","yin",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.461538461538462,0.25,0.461538461538462,0.25,0.597614304667197,0.258198889747161,0.375,0.142857142857143,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111 +"le yi wang","hongwei zhang","le","wang","yi","hongwei","zhang",NA,0.142857142857143,0.6,0.142857142857143,0.6,0.142857142857143,0.6,0.222222222222222,0.666666666666667,0.222222222222222,0.666666666666667,0.267261241912424,0.670820393249937,0.125,0.5,0,0.783333333333333,0,0.783333333333333,0,0.783333333333333 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"thu nguyen","hongwei zhang","thu","nguyen",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.2,0.363636363636364,0.2,0.363636363636364,0.218217890235992,0.474341649025257,0.111111111111111,0.25,0.492063492063492,0.455555555555555,0.492063492063492,0.455555555555555,0.492063492063492,0.455555555555555 +"j karl hedrick","hongwei zhang","j","hedrick","karl","hongwei","zhang",NA,0,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0.447619047619048,0,0.447619047619048,0,0.447619047619048 +"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0 +"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0 +"feng gao","hongwei zhang","feng","gao",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.363636363636364,0.25,0.545454545454545,0.5,0.566946709513841,0.516397779494322,0.375,0.333333333333333,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111 +"yang zheng","hongwei zhang","yang","zheng",NA,"hongwei","zhang",NA,0.285714285714286,0.8,0.285714285714286,0.8,0.285714285714286,0.8,0.363636363636364,0.8,0.363636363636364,0.8,0.377964473009227,0.8,0.222222222222222,0.666666666666667,0.595238095238095,0.866666666666667,0.595238095238095,0.893333333333333,0.595238095238095,0.92 +"yujia wu","hongwei zhang","yujia","wu",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0,0,0,0,0,0 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"j karl hedrick","jing hua","j","hedrick","karl","jing","hua",NA,0.25,0.142857142857143,0.25,0.142857142857143,0.25,0.142857142857143,0.4,0.2,0.4,0.2,0.5,0.218217890235992,0.25,0.111111111111111,0.75,0.492063492063492,0.775,0.542857142857143,0.8,0.593650793650794 +"j karl hedrick","hongwei zhang","j","hedrick","karl","hongwei","zhang",NA,0,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0.447619047619048,0,0.447619047619048,0,0.447619047619048 +"j karl hedrick","jayanthi rao","j","hedrick","karl","jayanthi","rao",NA,0.125,0.142857142857143,0.125,0.142857142857143,0.125,0.142857142857143,0.222222222222222,0.2,0.222222222222222,0.2,0.316227766016838,0.218217890235992,0.142857142857143,0.111111111111111,0.708333333333333,0,0.7375,0,0.766666666666667,0 +"j karl hedrick","anthony holt","j","hedrick","karl","anthony","holt",NA,0,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.181818181818182,0,0.181818181818182,0,0.188982236504614,0,0.1,0,0.464285714285714,0,0.517857142857143,0,0.571428571428571 +"keqiang li","jing hua","keqiang","li",NA,"jing","hua",NA,0.428571428571429,0,0.428571428571429,0,0.428571428571429,0,0.545454545454545,0,0.545454545454545,0,0.566946709513841,0,0.375,0,0.464285714285714,0,0.464285714285714,0,0.464285714285714,0 +"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0 +"keqiang li","jayanthi rao","keqiang","li",NA,"jayanthi","rao",NA,0.125,0,0.125,0,0.125,0,0.266666666666667,0,0.4,0,0.478091443733757,0,0.272727272727273,0,0.511904761904762,0,0.511904761904762,0,0.511904761904762,0 +"keqiang li","anthony holt","keqiang","li",NA,"anthony","holt",NA,0.142857142857143,0.25,0.142857142857143,0.25,0.142857142857143,0.25,0.285714285714286,0.333333333333333,0.285714285714286,0.333333333333333,0.377964473009227,0.353553390593274,0.181818181818182,0.2,0.428571428571428,0,0.428571428571428,0,0.428571428571428,0 +"shengbo eben li","jing hua","shengbo","li","eben","jing","hua",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.363636363636364,0,0.363636363636364,0,0.377964473009227,0,0.222222222222222,0,0.595238095238095,0,0.595238095238095,0,0.595238095238095,0 +"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0 +"shengbo eben li","jayanthi rao","shengbo","li","eben","jayanthi","rao",NA,0.125,0,0.125,0,0.125,0,0.133333333333333,0,0.266666666666667,0,0.239045721866879,0,0.166666666666667,0,0.422619047619048,0,0.422619047619048,0,0.422619047619048,0 +"shengbo eben li","anthony holt","shengbo","li","eben","anthony","holt",NA,0,0.25,0,0.25,0,0.25,0.285714285714286,0.333333333333333,0.428571428571429,0.333333333333333,0.50395263067897,0.353553390593274,0.3,0.2,0.507936507936508,0,0.507936507936508,0,0.507936507936508,0 +"feng gao","jing hua","feng","gao",NA,"jing","hua",NA,0.5,0,0.5,0,0.5,0,0.5,0.333333333333333,0.5,0.333333333333333,0.5,0.333333333333333,0.333333333333333,0.2,0.666666666666667,0,0.666666666666667,0,0.666666666666667,0 +"feng gao","hongwei zhang","feng","gao",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.363636363636364,0.25,0.545454545454545,0.5,0.566946709513841,0.516397779494322,0.375,0.333333333333333,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111 +"feng gao","jayanthi rao","feng","gao",NA,"jayanthi","rao",NA,0.125,0.666666666666667,0.125,0.666666666666667,0.125,0.666666666666667,0.166666666666667,0.666666666666667,0.166666666666667,0.666666666666667,0.158113883008419,0.666666666666667,0.1,0.5,0.458333333333333,0.777777777777778,0.458333333333333,0.777777777777778,0.458333333333333,0.777777777777778 +"feng gao","anthony holt","feng","gao",NA,"anthony","holt",NA,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.181818181818182,0.285714285714286,0.181818181818182,0.285714285714286,0.333333333333333,0.288675134594813,0.111111111111111,0.166666666666667,0.464285714285714,0.527777777777778,0.464285714285714,0.527777777777778,0.464285714285714,0.527777777777778 +"yang zheng","jing hua","yang","zheng",NA,"jing","hua",NA,0.5,0.2,0.5,0.2,0.5,0.2,0.5,0.25,0.5,0.25,0.5,0.258198889747161,0.333333333333333,0.142857142857143,0.666666666666667,0.511111111111111,0.666666666666667,0.511111111111111,0.666666666666667,0.511111111111111 +"yang zheng","hongwei zhang","yang","zheng",NA,"hongwei","zhang",NA,0.285714285714286,0.8,0.285714285714286,0.8,0.285714285714286,0.8,0.363636363636364,0.8,0.363636363636364,0.8,0.377964473009227,0.8,0.222222222222222,0.666666666666667,0.595238095238095,0.866666666666667,0.595238095238095,0.893333333333333,0.595238095238095,0.92 +"yang zheng","jayanthi rao","yang","zheng",NA,"jayanthi","rao",NA,0.375,0,0.375,0,0.375,0,0.5,0,0.5,0,0.632455532033676,0,0.375,0,0.597222222222222,0,0.597222222222222,0,0.597222222222222,0 +"yang zheng","anthony holt","yang","zheng",NA,"anthony","holt",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.363636363636364,0.222222222222222,0.545454545454545,0.222222222222222,0.666666666666667,0.223606797749979,0.428571428571429,0.125,0.595238095238095,0.483333333333333,0.595238095238095,0.483333333333333,0.595238095238095,0.483333333333333 +"yujia wu","jing hua","yujia","wu",NA,"jing","hua",NA,0.2,0.333333333333333,0.2,0.333333333333333,0.2,0.333333333333333,0.444444444444444,0.4,0.444444444444444,0.4,0.447213595499958,0.408248290463863,0.285714285714286,0.25,0,0.611111111111111,0,0.611111111111111,0,0.611111111111111 +"yujia wu","hongwei zhang","yujia","wu",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0,0,0,0,0,0 +"yujia wu","jayanthi rao","yujia","wu",NA,"jayanthi","rao",NA,0.125,0,0.125,0,0.125,0,0.307692307692308,0,0.615384615384615,0,0.707106781186547,0,0.5,0,0.491666666666667,0,0.491666666666667,0,0.491666666666667,0 +"yujia wu","anthony holt","yujia","wu",NA,"anthony","holt",NA,0,0,0,0,0,0,0.166666666666667,0,0.333333333333333,0,0.298142396999972,0,0.222222222222222,0,0,0,0,0,0,0 +"hongwei zhang","jing hua","hongwei","zhang",NA,"jing","hua",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.363636363636364,0.5,0.545454545454545,0.5,0.566946709513841,0.516397779494322,0.375,0.333333333333333,0.595238095238095,0.688888888888889,0.595238095238095,0.688888888888889,0.595238095238095,0.688888888888889 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"hongwei zhang","jayanthi rao","hongwei","zhang",NA,"jayanthi","rao",NA,0.125,0.2,0.125,0.2,0.125,0.2,0.266666666666667,0.25,0.4,0.25,0.358568582800318,0.258198889747161,0.272727272727273,0.142857142857143,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111 +"hongwei zhang","anthony holt","hongwei","zhang",NA,"anthony","holt",NA,0,0.2,0,0.2,0,0.2,0.428571428571429,0.222222222222222,0.428571428571429,0.222222222222222,0.50395263067897,0.223606797749979,0.3,0.125,0.428571428571428,0.483333333333333,0.428571428571428,0.483333333333333,0.428571428571428,0.483333333333333 +"kumaresh singh","adrian sandu","kumaresh","singh",NA,"adrian","sandu",NA,0.125,0.4,0.125,0.4,0.125,0.4,0.285714285714286,0.4,0.285714285714286,0.4,0.375,0.4,0.181818181818182,0.25,0.527777777777778,0.6,0.527777777777778,0.64,0.527777777777778,0.68 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"mohamed jardak","adrian sandu","mohamed","jardak",NA,"adrian","sandu",NA,0,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0.307692307692308,0.363636363636364,0.307692307692308,0.363636363636364,0.353553390593274,0.474341649025257,0.222222222222222,0.25,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778 +"meemong lee","adrian sandu","meemong","lee",NA,"adrian","sandu",NA,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.153846153846154,0,0.153846153846154,0,0.106600358177805,0,0.111111111111111,0,0.436507936507937,0,0.436507936507937,0,0.436507936507937,0 +"kumaresh singh","adrian sandu","kumaresh","singh",NA,"adrian","sandu",NA,0.125,0.4,0.125,0.4,0.125,0.4,0.285714285714286,0.4,0.285714285714286,0.4,0.375,0.4,0.181818181818182,0.25,0.527777777777778,0.6,0.527777777777778,0.64,0.527777777777778,0.68 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"kevin w bowman","adrian sandu","kevin","bowman","w","adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.363636363636364,0.363636363636364,0.363636363636364,0.316227766016838,0.365148371670111,0.25,0.222222222222222,0.577777777777778,0,0.577777777777778,0,0.577777777777778,0 +"alexandru cioaca","adrian sandu","alexandru","cioaca",NA,"adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.4,0.181818181818182,0.666666666666667,0.181818181818182,0.746202507244636,0.282842712474619,0.444444444444444,0.125,0.611111111111111,0.455555555555555,0.65,0.455555555555555,0.688888888888889,0.455555555555555 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"elias d nino ruiz","adrian sandu","elias","ruiz","d","adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.222222222222222,0.363636363636364,0.222222222222222,0.474341649025257,0.223606797749979,0.25,0.125,0.577777777777778,0,0.577777777777778,0,0.577777777777778,0 +"mohamed jardak","adrian sandu","mohamed","jardak",NA,"adrian","sandu",NA,0,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0.307692307692308,0.363636363636364,0.307692307692308,0.363636363636364,0.353553390593274,0.474341649025257,0.222222222222222,0.25,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778 +"meemong lee","adrian sandu","meemong","lee",NA,"adrian","sandu",NA,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.153846153846154,0,0.153846153846154,0,0.106600358177805,0,0.111111111111111,0,0.436507936507937,0,0.436507936507937,0,0.436507936507937,0 +"kumaresh singh","adrian sandu","kumaresh","singh",NA,"adrian","sandu",NA,0.125,0.4,0.125,0.4,0.125,0.4,0.285714285714286,0.4,0.285714285714286,0.4,0.375,0.4,0.181818181818182,0.25,0.527777777777778,0.6,0.527777777777778,0.64,0.527777777777778,0.68 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"kevin w bowman","adrian sandu","kevin","bowman","w","adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.363636363636364,0.363636363636364,0.363636363636364,0.316227766016838,0.365148371670111,0.25,0.222222222222222,0.577777777777778,0,0.577777777777778,0,0.577777777777778,0 +"ahmed attia","adrian sandu","ahmed","attia",NA,"adrian","sandu",NA,0.166666666666667,0,0.166666666666667,0,0.166666666666667,0,0.363636363636364,0.2,0.363636363636364,0.2,0.474341649025257,0.298142396999972,0.25,0.142857142857143,0.455555555555555,0.466666666666667,0.51,0.466666666666667,0.564444444444444,0.466666666666667 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"vishwas rao","adrian sandu","vishwas","rao",NA,"adrian","sandu",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.307692307692308,0.25,0.353553390593274,0.258198889747161,0.222222222222222,0.142857142857143,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"vishwas rao","adrian sandu","vishwas","rao",NA,"adrian","sandu",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.307692307692308,0.25,0.353553390593274,0.258198889747161,0.222222222222222,0.142857142857143,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"haiyan cheng","adrian sandu","haiyan","cheng",NA,"adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.666666666666667,0.2,0.666666666666667,0.2,0.75,0.2,0.428571428571429,0.111111111111111,0.777777777777778,0.466666666666667,0.777777777777778,0.466666666666667,0.777777777777778,0.466666666666667 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"ahmed attia","adrian sandu","ahmed","attia",NA,"adrian","sandu",NA,0.166666666666667,0,0.166666666666667,0,0.166666666666667,0,0.363636363636364,0.2,0.363636363636364,0.2,0.474341649025257,0.298142396999972,0.25,0.142857142857143,0.455555555555555,0.466666666666667,0.51,0.466666666666667,0.564444444444444,0.466666666666667 +"vishwas rao","adrian sandu","vishwas","rao",NA,"adrian","sandu",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.307692307692308,0.25,0.353553390593274,0.258198889747161,0.222222222222222,0.142857142857143,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111 From 47aa945b5a5bbf8c8d2e95815da080c3f75936bd Mon Sep 17 00:00:00 2001 From: mona Date: Thu, 28 Sep 2023 12:58:53 +0000 Subject: [PATCH 07/14] Added info on processing stage + updated pipeline --- .../main/prep_nsf/link_scinetnsf_to_mag.R | 60 +++++++++++++++++-- src/dataprep/pipeline.sh | 7 ++- 2 files changed, 59 insertions(+), 8 deletions(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index 2e5cfff..1aecf7d 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -3,6 +3,11 @@ # only those links with a similar name (similarity >=0.8) are loaded into db # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder +# Initialize variables for counting rows and timestamp +row_count <- 0 +start_time <- Sys.time() +cat(sprintf("Started at", start_time)) + packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist") lapply(packages, library, character.only = TRUE) @@ -13,6 +18,7 @@ db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite") con <- DBI::dbConnect(RSQLite::SQLite(), db_file) cat("The database connection is: \n") src_dbi(con) +cat("Connected to db...\n") # Create table with all links between NSF-grant and authors via papers @@ -73,6 +79,7 @@ firstname_similarity <- numeric(0) lastname_similarity <- numeric(0) # Iterate through rows and calculate string distances for first and last names separately +cat("Start comparing names...\n") for (i in 1:nrow(nsf_to_authors)) { mag_firstname <- nsf_to_authors$mag_firstname[i] nsf_firstname <- nsf_to_authors$nsf_firstname[i] @@ -83,8 +90,7 @@ for (i in 1:nrow(nsf_to_authors)) { nsf_firstname, method="osa" ) - # Append the calculated distance to the results vector for first name - firstname_similarity <- c(firstname_similarity, first_row_similarity) + # Calculate string distance for last name by row mag_lastname <- nsf_to_authors$mag_lastname[i] @@ -96,10 +102,42 @@ for (i in 1:nrow(nsf_to_authors)) { method="osa" ) - # Append the calculated distance to the results vector for last name + # Append the calculated distances to the results vector + firstname_similarity <- c(firstname_similarity, first_row_similarity) lastname_similarity <- c(lastname_similarity, last_row_similarity) + + # Increment row count + row_count <- row_count + 1 + + # Progress after each 500,000th row + if (row_count %% 50 == 0) { + # Calculate elapsed time + elapsed_time <- Sys.time() - start_time + elapsed_time <- as.numeric(elapsed_time) + + # Calculate percentage of data processed + percent_processed <- (row_count / nrow(nsf_to_authors)) * 100 + + # Some information + cat(sprintf( + "Processed %d rows (%.2f%%) in %2.f minutes.\n", + row_count, + percent_processed, + elapsed_time + )) + } } +elapsed_time <- Sys.time() - start_time +elapsed_time <- as.numeric(elapsed_time) + +percent_processed <- (row_count / nrow(nsf_to_authors)) * 100 +cat(sprintf( + "Processed all rows (%.2f%%) in %2.f minutes.\n", + percent_processed, + elapsed_time +)) + # Assign the calculated distances to a new column in data frame nsf_to_authors$firstname_similarity <- firstname_similarity nsf_to_authors$lastname_similarity <- lastname_similarity @@ -114,7 +152,19 @@ df <- similar_names %>% distinct() # Write table to db: -dbWriteTable(con, name = "links_nsf_mag", value = df, overwrite = TRUE) +cat("Starting data upload to the database...\n") +#dbWriteTable(con, name = "links_nsf_mag2", value = df, overwrite = TRUE) +cat("Data upload to the database is complete.\n") + +# Some info +final_elapsed_time <- Sys.time() - start_time +elapsed_time <- as.numeric(elapsed_time) + +cat(sprintf( + "Complete. Total time elapsed: %2.f minutes.\n", + elapsed_time +)) # close connection to db -DBI::dbDisconnect(con) \ No newline at end of file +DBI::dbDisconnect(con) +cat("Disconnected from db.\n") \ No newline at end of file diff --git a/src/dataprep/pipeline.sh b/src/dataprep/pipeline.sh index 8431c27..2b09a04 100644 --- a/src/dataprep/pipeline.sh +++ b/src/dataprep/pipeline.sh @@ -143,9 +143,10 @@ Rscript -e "rmarkdown::render('$script_path/reports/quality_linking_advisors.Rmd # ## 3. Link NSF grants to MAG advisors bash $script_path/link/grants.sh $logfile_path -# XXX adapt for grants - use mona train -#python -m $script_path.link.write_csv_links --linking_type "advisors" --train_name "christoph_degree0" \ -# &> $logfile_path/write_csv_links_advisors.log +# XXX adapt for grants: links ScisciNet (nsf) to mag + +Rscript $script_path.prep_nsf.link_scinetnsf_to_mag.R &> \ + &> $logfile_path/write_csv_links_grants.log Rscript -e "rmarkdown::render('$script_path/reports/quality_linking_grants.Rmd', output_dir = '$output_path')" \ &> $logfile_path/quality_linking_grants.log From 85820d7822937c284da71de9b7c85a5fe155385a Mon Sep 17 00:00:00 2001 From: mona Date: Thu, 28 Sep 2023 13:03:55 +0000 Subject: [PATCH 08/14] just uncommented db-upload --- src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index 1aecf7d..36339a5 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -153,7 +153,7 @@ df <- similar_names %>% # Write table to db: cat("Starting data upload to the database...\n") -#dbWriteTable(con, name = "links_nsf_mag2", value = df, overwrite = TRUE) +dbWriteTable(con, name = "links_nsf_mag2", value = df, overwrite = TRUE) cat("Data upload to the database is complete.\n") # Some info From ea8c1f4f0f22790eebcdfda794a5ff974862acc4 Mon Sep 17 00:00:00 2001 From: mona Date: Fri, 29 Sep 2023 11:02:31 +0000 Subject: [PATCH 09/14] Optimized similarity check of names --- .../main/prep_nsf/link_scinetnsf_to_mag.R | 173 ++++++++++-------- 1 file changed, 94 insertions(+), 79 deletions(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index 36339a5..c167eb3 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -4,11 +4,10 @@ # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder # Initialize variables for counting rows and timestamp -row_count <- 0 start_time <- Sys.time() -cat(sprintf("Started at", start_time)) +cat(sprintf("Started at %s \n", start_time)) -packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist") +packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist", "purrr", "furrr") lapply(packages, library, character.only = TRUE) datapath <- "/mnt/ssd/" @@ -23,7 +22,7 @@ cat("Connected to db...\n") # Create table with all links between NSF-grant and authors via papers NSF_to_Authors <- tbl(con, sql(" - select a. PaperID, a.Type, a.GrantID, b.AuthorId + select a. PaperID, a.GrantID, b.AuthorId ,c.NormalizedName, d.Position, d.PIFullName from scinet_links_nsf as a inner join ( @@ -44,6 +43,9 @@ NSF_to_Authors <- tbl(con, sql(" ")) nsf_to_authors <- collect(NSF_to_Authors) +nsf_to_authors <- nsf_to_authors %>% + filter(!is.na(PIFullName) & !is.na(NormalizedName)) +cat("Loaded dataset. \n") # Create separate variables for first and last name for both nsf and mag names nsf_to_authors <- nsf_to_authors %>% @@ -65,105 +67,118 @@ nsf_to_authors <- nsf_to_authors %>% word(PIFullName, 2), NA_character_) ) - - ### Compare name similarity # Set a threshold for similarity -threshold <- 0.8 - +threshold <- 0.7 -### Test several distances +### Create function to calculate similarity and filter -# Calculate string similarity for first and last names by row and add a new column -firstname_similarity <- numeric(0) -lastname_similarity <- numeric(0) +fct_similarity <- function(row) { + mag_firstname <- row$mag_firstname + nsf_firstname <- row$nsf_firstname -# Iterate through rows and calculate string distances for first and last names separately -cat("Start comparing names...\n") -for (i in 1:nrow(nsf_to_authors)) { - mag_firstname <- nsf_to_authors$mag_firstname[i] - nsf_firstname <- nsf_to_authors$nsf_firstname[i] - - # Calculate string distance for first name by row using Optimal String Alignment (default) + # Calculate string distances by row using Optimal String Alignment (default) first_row_similarity <- stringsim( mag_firstname, nsf_firstname, method="osa" ) - - - # Calculate string distance for last name by row - mag_lastname <- nsf_to_authors$mag_lastname[i] - nsf_lastname <- nsf_to_authors$nsf_lastname[i] - last_row_similarity <- stringsim( - mag_lastname, - nsf_lastname, - method="osa" + mag_lastname <- row$mag_lastname + nsf_lastname <- row$nsf_lastname + + last_row_similarity <- stringsim( + mag_lastname, + nsf_lastname, + method = "osa" ) - # Append the calculated distances to the results vector - firstname_similarity <- c(firstname_similarity, first_row_similarity) - lastname_similarity <- c(lastname_similarity, last_row_similarity) + return(data.frame(firstname_similarity = first_row_similarity, lastname_similarity = last_row_similarity)) +} + +# Split the data into chunks of 50,000 rows +chunk_size <- 50000 +chunks <- split(nsf_to_authors, ceiling(seq_len(nrow(nsf_to_authors)) / chunk_size)) + +# Load the furrr package for parallel processing +plan(multisession) - # Increment row count - row_count <- row_count + 1 +# Initialize variables for progress tracking +total_chunks <- length(chunks) +processed_chunks <- 0 + +# Process and save each chunk as individual CSV files +for (i in seq_along(chunks)) { + chunk <- chunks[[i]] + + # Calculate similarity and filter rows row by row + row_similarities <- purrr::map_df(1:nrow(chunk), ~fct_similarity(chunk[.x, ])) %>% + mutate(id = row_number()) + + # Filter rows that meet the threshold criteria + chunk <- chunk %>% + mutate(id = row_number()) %>% + left_join(row_similarities, by = "id") %>% + filter(firstname_similarity >= threshold & lastname_similarity >= threshold) %>% + select(GrantID, AuthorId, Position, mag_firstname, nsf_firstname, firstname_similarity, mag_lastname, nsf_lastname, lastname_similarity) %>% + distinct() - # Progress after each 500,000th row - if (row_count %% 50 == 0) { - # Calculate elapsed time - elapsed_time <- Sys.time() - start_time - elapsed_time <- as.numeric(elapsed_time) - - # Calculate percentage of data processed - percent_processed <- (row_count / nrow(nsf_to_authors)) * 100 - - # Some information + + # Define the output file path + output_file <- file.path("/mnt/ssd/chunks_nsf_links", paste0("chunk_", i, ".csv")) + + # Write the chunk to a CSV file + write.csv(chunk, file = output_file, row.names = FALSE) + + # Update progress + processed_chunks <- processed_chunks + 1 + percent_processed <- (processed_chunks / total_chunks) * 100 + elapsed_time <- as.numeric(Sys.time() - start_time) + + # Convert elapsed time to minutes and potentially hours + elapsed_minutes <- elapsed_time / 60 + if (elapsed_minutes >= 60) { + elapsed_hours <- floor(elapsed_minutes / 60) + elapsed_minutes <- elapsed_minutes %% 60 + + # Display progress information cat(sprintf( - "Processed %d rows (%.2f%%) in %2.f minutes.\n", - row_count, - percent_processed, - elapsed_time + "Processed %d out of %d chunks (%.2f%%) in %d hours and %.2f minutes.\n", + processed_chunks, total_chunks, percent_processed, elapsed_hours, elapsed_minutes + )) + } else { + cat(sprintf( + "Processed %d out of %d chunks (%.2f%%) in %.2f minutes.\n", + processed_chunks, total_chunks, percent_processed, elapsed_minutes )) } } -elapsed_time <- Sys.time() - start_time -elapsed_time <- as.numeric(elapsed_time) - -percent_processed <- (row_count / nrow(nsf_to_authors)) * 100 -cat(sprintf( - "Processed all rows (%.2f%%) in %2.f minutes.\n", - percent_processed, - elapsed_time -)) - -# Assign the calculated distances to a new column in data frame -nsf_to_authors$firstname_similarity <- firstname_similarity -nsf_to_authors$lastname_similarity <- lastname_similarity - -# Filter observations where the names are above the threshold: threshold seemed reasonable as it allows for a single typo -similar_names <- nsf_to_authors %>% - filter(firstname_similarity >= threshold & lastname_similarity >= threshold) - -# drop unnecessary variables and drop duplicates -df <- similar_names %>% - select(GrantID, AuthorId, Position, firstname_similarity, lastname_similarity) %>% - distinct() - -# Write table to db: -cat("Starting data upload to the database...\n") -dbWriteTable(con, name = "links_nsf_mag2", value = df, overwrite = TRUE) -cat("Data upload to the database is complete.\n") +# Clean up the furrr plan +plan(NULL) # Some info final_elapsed_time <- Sys.time() - start_time -elapsed_time <- as.numeric(elapsed_time) +final_elapsed_time <- as.numeric(final_elapsed_time) + +# Convert elapsed time to minutes and potentially hours +final_elapsed_minutes <- final_elapsed_time / 60 +if (final_elapsed_minutes >= 60) { + final_elapsed_hours <- floor(final_elapsed_minutes / 60) + final_elapsed_minutes <- final_elapsed_minutes %% 60 + + # Display progress information + cat(sprintf( + "Complete. Total elapsed time: %d hours and %.2f minutes.\n", + final_elapsed_hours, final_elapsed_minutes + )) +} else { + cat(sprintf( + "Complete. Total elapsed time: %.2f minutes.\n", + elapsed_minutes + )) +} -cat(sprintf( - "Complete. Total time elapsed: %2.f minutes.\n", - elapsed_time -)) # close connection to db DBI::dbDisconnect(con) From 1a93552426d4e644b04852c6f4b14e807cb3a598 Mon Sep 17 00:00:00 2001 From: mona Date: Mon, 2 Oct 2023 14:48:12 +0000 Subject: [PATCH 10/14] Changed restriction for name similarity --- .../main/prep_nsf/link_scinetnsf_to_mag.R | 50 ++++++++----------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index c167eb3..845cf4c 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -43,6 +43,7 @@ NSF_to_Authors <- tbl(con, sql(" ")) nsf_to_authors <- collect(NSF_to_Authors) + nsf_to_authors <- nsf_to_authors %>% filter(!is.na(PIFullName) & !is.na(NormalizedName)) cat("Loaded dataset. \n") @@ -66,7 +67,7 @@ nsf_to_authors <- nsf_to_authors %>% word(PIFullName, 2) != word(PIFullName, -1), word(PIFullName, 2), NA_character_) ) - +cat("Separated names in dataset. \n") ### Compare name similarity # Set a threshold for similarity threshold <- 0.7 @@ -101,13 +102,17 @@ chunk_size <- 50000 chunks <- split(nsf_to_authors, ceiling(seq_len(nrow(nsf_to_authors)) / chunk_size)) # Load the furrr package for parallel processing -plan(multisession) +plan(multisession, workers=16) # Initialize variables for progress tracking total_chunks <- length(chunks) processed_chunks <- 0 # Process and save each chunk as individual CSV files + time <- Sys.time() + cat(sprintf( + "Start processing %d chunks at %s \n", total_chunks, start_time)) + for (i in seq_along(chunks)) { chunk <- chunks[[i]] @@ -119,13 +124,19 @@ for (i in seq_along(chunks)) { chunk <- chunk %>% mutate(id = row_number()) %>% left_join(row_similarities, by = "id") %>% - filter(firstname_similarity >= threshold & lastname_similarity >= threshold) %>% + filter( + (firstname_similarity >= threshold & lastname_similarity >= threshold) | (lastname_similarity == 1.0 & substr(mag_firstname, 1, 1) == substr(nsf_firstname, 1, 1)) ) %>% select(GrantID, AuthorId, Position, mag_firstname, nsf_firstname, firstname_similarity, mag_lastname, nsf_lastname, lastname_similarity) %>% distinct() - + # Define the output file path output_file <- file.path("/mnt/ssd/chunks_nsf_links", paste0("chunk_", i, ".csv")) + +# Remove the output file if it exists +if (file.exists(output_file)) { + file.remove(output_file) +} # Write the chunk to a CSV file write.csv(chunk, file = output_file, row.names = FALSE) @@ -135,23 +146,12 @@ for (i in seq_along(chunks)) { percent_processed <- (processed_chunks / total_chunks) * 100 elapsed_time <- as.numeric(Sys.time() - start_time) - # Convert elapsed time to minutes and potentially hours - elapsed_minutes <- elapsed_time / 60 - if (elapsed_minutes >= 60) { - elapsed_hours <- floor(elapsed_minutes / 60) - elapsed_minutes <- elapsed_minutes %% 60 # Display progress information cat(sprintf( - "Processed %d out of %d chunks (%.2f%%) in %d hours and %.2f minutes.\n", - processed_chunks, total_chunks, percent_processed, elapsed_hours, elapsed_minutes + "Processed %d out of %d chunks (%.2f%%) in %.2f.\n", + processed_chunks, total_chunks, percent_processed, elapsed_time )) - } else { - cat(sprintf( - "Processed %d out of %d chunks (%.2f%%) in %.2f minutes.\n", - processed_chunks, total_chunks, percent_processed, elapsed_minutes - )) - } } # Clean up the furrr plan @@ -162,22 +162,12 @@ final_elapsed_time <- Sys.time() - start_time final_elapsed_time <- as.numeric(final_elapsed_time) # Convert elapsed time to minutes and potentially hours -final_elapsed_minutes <- final_elapsed_time / 60 -if (final_elapsed_minutes >= 60) { - final_elapsed_hours <- floor(final_elapsed_minutes / 60) - final_elapsed_minutes <- final_elapsed_minutes %% 60 - + # Display progress information cat(sprintf( - "Complete. Total elapsed time: %d hours and %.2f minutes.\n", - final_elapsed_hours, final_elapsed_minutes + "Complete. Total elapsed time: %.2f.\n", + final_elapsed_time )) -} else { - cat(sprintf( - "Complete. Total elapsed time: %.2f minutes.\n", - elapsed_minutes - )) -} # close connection to db From 63c89460ccc17a82b6a5120c074a51f5bc3c429d Mon Sep 17 00:00:00 2001 From: mona Date: Tue, 3 Oct 2023 15:00:37 +0000 Subject: [PATCH 11/14] Loaded chunk-csv files into one file --- .../main/prep_nsf/link_scinetnsf_to_mag.R | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index 845cf4c..7dae5fe 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -172,4 +172,24 @@ final_elapsed_time <- as.numeric(final_elapsed_time) # close connection to db DBI::dbDisconnect(con) -cat("Disconnected from db.\n") \ No newline at end of file +cat("Disconnected from db.\n") + +# Apend tables together +# Initialize an empty data frame to store the appended data +links_nsf_mag <- data.frame() + +# Loop through the file names and append the data +for (i in 1:1072) { + # Construct the file path for each chunk + + # Load the CSV file + chunk_data <- read.csv(output_file, header = TRUE) + + # Append the chunk data to the appended_data data frame + links_nsf_mag <- rbind(links_nsf_mag, chunk_data) +} +links_nsf_mag <- links_nsf_mag%>% + distinct() + +# Write the appended data to a single CSV file +#write.csv(links_nsf_mag, "links_nsf_mag.csv", row.names = FALSE) From 6ba244da618225e9b03d7b2039cb5770dff92b0b Mon Sep 17 00:00:00 2001 From: mona Date: Fri, 6 Oct 2023 18:49:57 +0000 Subject: [PATCH 12/14] added upload to db --- .../main/prep_nsf/link_scinetnsf_to_mag.R | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index 7dae5fe..aad281a 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -170,9 +170,6 @@ final_elapsed_time <- as.numeric(final_elapsed_time) )) -# close connection to db -DBI::dbDisconnect(con) -cat("Disconnected from db.\n") # Apend tables together # Initialize an empty data frame to store the appended data @@ -183,13 +180,26 @@ for (i in 1:1072) { # Construct the file path for each chunk # Load the CSV file - chunk_data <- read.csv(output_file, header = TRUE) + chunk_data <- read.csv(paste0("/mnt/ssd/chunks_nsf_links/chunk_", i, ".csv"), header = TRUE, colClasses = c(GrantID = "character")) # Append the chunk data to the appended_data data frame links_nsf_mag <- rbind(links_nsf_mag, chunk_data) } -links_nsf_mag <- links_nsf_mag%>% + +# drop unnecessary variables and drop duplicates +links_nsf_mag <- links_nsf_mag %>% + select(GrantID, AuthorId, Position, firstname_similarity, lastname_similarity) %>% distinct() + # Write the appended data to a single CSV file #write.csv(links_nsf_mag, "links_nsf_mag.csv", row.names = FALSE) + + +# Write table to db: +dbWriteTable(con, name = "links_nsf_mag", value = links_nsf_mag, overwrite = TRUE) +cat("Uploaded to db.\n") + +# close connection to db +DBI::dbDisconnect(con) +cat("Disconnected from db.\n") \ No newline at end of file From 94bda097036dd8fbb8bf651a6fbac8dbbd4e3e28 Mon Sep 17 00:00:00 2001 From: Christoph Date: Mon, 9 Oct 2023 01:25:27 +0000 Subject: [PATCH 13/14] handle chunks correctly, change AuthorId to integer64 for correct writing to db --- .../main/prep_nsf/link_scinetnsf_to_mag.R | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index aad281a..97eedc1 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -7,7 +7,7 @@ start_time <- Sys.time() cat(sprintf("Started at %s \n", start_time)) -packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist", "purrr", "furrr") +packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist", "purrr", "furrr", "bit64") lapply(packages, library, character.only = TRUE) datapath <- "/mnt/ssd/" @@ -171,17 +171,19 @@ final_elapsed_time <- as.numeric(final_elapsed_time) -# Apend tables together +# Append tables together # Initialize an empty data frame to store the appended data links_nsf_mag <- data.frame() +chunks <- list.files("/mnt/ssd/chunks_nsf_links/", pattern = "*.csv", full.names = TRUE) # Loop through the file names and append the data -for (i in 1:1072) { - # Construct the file path for each chunk - +for (chunk in chunks) { # Load the CSV file - chunk_data <- read.csv(paste0("/mnt/ssd/chunks_nsf_links/chunk_", i, ".csv"), header = TRUE, colClasses = c(GrantID = "character")) - + chunk_data <- read.csv(chunk, + header = TRUE, + colClasses = c(GrantID = "character", AuthorId = "character") + ) + chunk_data <- chunk_data %>% mutate(AuthorId = as.integer64(AuthorId)) # Append the chunk data to the appended_data data frame links_nsf_mag <- rbind(links_nsf_mag, chunk_data) } @@ -202,4 +204,4 @@ cat("Uploaded to db.\n") # close connection to db DBI::dbDisconnect(con) -cat("Disconnected from db.\n") \ No newline at end of file +cat("Disconnected from db.\n") From ac571fc1c5a026dba8d12451d4ceee4fdc94b28d Mon Sep 17 00:00:00 2001 From: mona Date: Mon, 9 Oct 2023 14:01:03 +0000 Subject: [PATCH 14/14] droped a line which was not true anymore --- src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R index 97eedc1..15fe32d 100644 --- a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R +++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R @@ -1,6 +1,5 @@ # Link SciSciNet_Links_NSF table with Paper_Author_Affiliations, Authors, and NSF_Investigator # Keeps only those with link between NSF grant and author ID. -# only those links with a similar name (similarity >=0.8) are loaded into db # Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder # Initialize variables for counting rows and timestamp