diff --git a/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
new file mode 100644
index 0000000..15fe32d
--- /dev/null
+++ b/src/dataprep/main/prep_nsf/link_scinetnsf_to_mag.R
@@ -0,0 +1,173 @@
+# Link SciSciNet_Links_NSF table with Paper_Author_Affiliations, Authors, and NSF_Investigator
+# Keeps only those with link between NSF grant and author ID.
+# Data downloaded and uploaded into db in: scinet_data_to_db.py in same folder
+#
+# Output: database table `links_nsf_mag` with the distinct combinations of
+# GrantID, AuthorId, Position, firstname_similarity and lastname_similarity.
+# Per-chunk intermediate results are written as CSV files to `chunk_dir`.
+
+start_time <- Sys.time()
+cat(sprintf("Started at %s \n", start_time))
+
+packages <- c("tidyverse", "broom", "dbplyr", "RSQLite", "stringdist", "purrr", "furrr", "bit64")
+# invisible(): keep the list returned by lapply() out of the log
+invisible(lapply(packages, library, character.only = TRUE))
+
+datapath <- "/mnt/ssd/"
+db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
+chunk_dir <- "/mnt/ssd/chunks_nsf_links"
+
+# Seconds elapsed since `since`. The unit is fixed explicitly because a
+# difftime picks its unit (secs, mins, hours, ...) from its magnitude, so
+# as.numeric() on the raw difference gives numbers that are not comparable.
+elapsed_secs <- function(since) {
+  as.numeric(difftime(Sys.time(), since, units = "secs"))
+}
+
+con <- DBI::dbConnect(RSQLite::SQLite(), db_file)
+cat("The database connection is: \n")
+src_dbi(con)
+cat("Connected to db...\n")
+
+# Create table with all links between NSF-grant and authors via papers
+NSF_to_Authors <- tbl(con, sql("
+  select a. PaperID, a.GrantID, b.AuthorId
+    ,c.NormalizedName, d.Position, d.PIFullName
+  from scinet_links_nsf as a
+  inner join (
+    select PaperId AS PaperID, AuthorId
+    from PaperAuthorAffiliations
+  )b
+  using (PaperID)
+  inner join (
+    select AuthorId, NormalizedName
+    from Authors
+  ) c
+  using (AuthorId)
+  inner join (
+    select GrantID, Position, PIFullName
+    from NSF_Investigator
+  ) d
+  using (GrantID)
+"))
+
+nsf_to_authors <- collect(NSF_to_Authors) %>%
+  filter(!is.na(PIFullName) & !is.na(NormalizedName))
+cat("Loaded dataset. \n")
+
+# Create separate variables for first and last name for both nsf and mag names.
+# The middle name is the second word, provided the name has at least three
+# words and the second word differs from the last one.
+nsf_to_authors <- nsf_to_authors %>%
+  mutate(
+    mag_firstname = word(NormalizedName, 1),
+    mag_lastname = word(NormalizedName, -1),
+    mag_middlename = ifelse(
+      str_count(NormalizedName, "\\s+") >= 2 &
+        word(NormalizedName, 2) != word(NormalizedName, -1),
+      word(NormalizedName, 2), NA_character_
+    ),
+    nsf_firstname = word(PIFullName, 1),
+    nsf_lastname = word(PIFullName, -1),
+    nsf_middlename = ifelse(
+      str_count(PIFullName, "\\s+") >= 2 &
+        word(PIFullName, 2) != word(PIFullName, -1),
+      word(PIFullName, 2), NA_character_
+    )
+  )
+cat("Separated names in dataset. \n")
+
+### Compare name similarity
+# Minimum similarity required on both first and last name
+threshold <- 0.7
+
+# Similarity of MAG and NSF first and last names.
+# `row` is a data frame with columns mag_firstname, nsf_firstname,
+# mag_lastname and nsf_lastname; it may hold one row or many, because
+# stringsim() compares its two arguments element by element.
+# Returns a data.frame with one row per input row and the columns
+# firstname_similarity and lastname_similarity (optimal string alignment).
+fct_similarity <- function(row) {
+  data.frame(
+    firstname_similarity = stringsim(row$mag_firstname, row$nsf_firstname, method = "osa"),
+    lastname_similarity = stringsim(row$mag_lastname, row$nsf_lastname, method = "osa")
+  )
+}
+
+if (nrow(nsf_to_authors) == 0) {
+  stop("No NSF-to-author links with non-missing names were loaded.", call. = FALSE)
+}
+
+# Split the data into chunks of 50,000 rows
+chunk_size <- 50000
+chunks <- split(nsf_to_authors, ceiling(seq_len(nrow(nsf_to_authors)) / chunk_size))
+total_chunks <- length(chunks)
+
+# Only the files written in this run are read back below, so chunk files
+# left over from an earlier run cannot leak into the result.
+dir.create(chunk_dir, recursive = TRUE, showWarnings = FALSE)
+chunk_files <- file.path(chunk_dir, paste0("chunk_", seq_along(chunks), ".csv"))
+
+cat(sprintf(
+  "Start processing %d chunks at %s \n", total_chunks, format(Sys.time())
+))
+
+# No future/furrr plan is set up: the similarity computation is vectorised,
+# so there is nothing to distribute over worker processes.
+for (i in seq_along(chunks)) {
+  chunk <- chunks[[i]]
+
+  # Keep pairs where both names are similar enough, or where the last name
+  # is identical and the first names share their initial
+  matched <- chunk %>%
+    bind_cols(fct_similarity(chunk)) %>%
+    filter(
+      (firstname_similarity >= threshold & lastname_similarity >= threshold) |
+        (lastname_similarity == 1.0 &
+          substr(mag_firstname, 1, 1) == substr(nsf_firstname, 1, 1))
+    ) %>%
+    select(
+      GrantID, AuthorId, Position,
+      mag_firstname, nsf_firstname, firstname_similarity,
+      mag_lastname, nsf_lastname, lastname_similarity
+    ) %>%
+    distinct()
+
+  # write.csv() replaces an existing file of the same name
+  write.csv(matched, file = chunk_files[i], row.names = FALSE)
+
+  cat(sprintf(
+    "Processed %d out of %d chunks (%.2f%%) in %.2f.\n",
+    i, total_chunks, (i / total_chunks) * 100, elapsed_secs(start_time)
+  ))
+}
+
+cat(sprintf(
+  "Complete. Total elapsed time: %.2f.\n",
+  elapsed_secs(start_time)
+))
+
+# Read one chunk file. GrantID is read as text and AuthorId goes through
+# text to integer64, so neither is converted to a double on the way.
+read_chunk <- function(path) {
+  chunk_data <- read.csv(
+    path,
+    header = TRUE,
+    colClasses = c(GrantID = "character", AuthorId = "character")
+  )
+  mutate(chunk_data, AuthorId = as.integer64(AuthorId))
+}
+
+# Append the chunks, drop unnecessary variables and drop duplicates.
+# rbind() is called once on all chunks instead of growing a data frame.
+links_nsf_mag <- do.call(rbind, lapply(chunk_files, read_chunk)) %>%
+  select(GrantID, AuthorId, Position, firstname_similarity, lastname_similarity) %>%
+  distinct()
+
+# Write table to db:
+dbWriteTable(con, name = "links_nsf_mag", value = links_nsf_mag, overwrite = TRUE)
+cat("Uploaded to db.\n")
+
+# close connection to db
+DBI::dbDisconnect(con)
+cat("Disconnected from db.\n")
diff --git a/src/dataprep/main/prep_nsf/scinet_data_to_db.py b/src/dataprep/main/prep_nsf/scinet_data_to_db.py
new file mode 100644
index 0000000..b42fc31
--- /dev/null
+++ b/src/dataprep/main/prep_nsf/scinet_data_to_db.py
@@ -0,0 +1,106 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# %%
+"""
+Download SciSciNet table SciSciNet_Link_NSF with links between NSF-grants and papers
+Upload into db
+link to Paper_Author_Affiliations, Authors, NSF_Investigator in R file: link_scinetnsf_to_mag.R in same folder
+
+
+Create table in database:
+- scinet_links_nsf
+
+SciSciNet_Link_NSF schema is:
+
+GrantID TEXT, PaperID INTEGER, Type TEXT, Diff_ZScore NUMERIC
+
+
+unique index on Grantid and PaperID (multiple PaperIDs per GrantID)
+"""
+
+import subprocess
+import sqlite3 as sqlite
+import argparse
+import os
+from os import listdir
+from os.path import isfile, join
+import pandas as pd
+import numpy as np
+import re
+#import sys
+import requests
+
+#sys.path.append('/home/mona/mag_sample/src/dataprep/')
+
+from helpers.variables import db_file, datapath, databasepath
+from helpers.functions import analyze_db
+
+scinet_path = os.path.join(datapath, "sciscinet_data/")
+filepath_nsf = os.path.join(scinet_path, "SciSciNet_Link_NSF.tsv")
+
+
+# Download file
+# raise_for_status() stops the script on an HTTP error instead of saving the error body as data
+url_nsf = "https://ndownloader.figstatic.com/files/36139242"
+response = requests.get(url_nsf)
+response.raise_for_status()
+with open(filepath_nsf, "wb") as file:
+    file.write(response.content)
+print("Downloaded data")
+
+
+# ## Read file and dump to db
+# remove first row as it only contains column names that can't be overwritten
+def load_scinet(filepath):
+    """Read a SciSciNet_Link_NSF tsv file; return it de-duplicated, with GrantID replacing NSF_Award_Number."""
+    df = pd.read_csv(filepath,
+                     sep="\t",
+                     names=["NSF_Award_Number", "PaperID", "Type", "Diff_ZScore"],
+                     skiprows=1)
+    df.drop_duplicates(inplace=True)
+
+    # Create a GrantID variable in same format as previously used by removing non-numeric characters from NSF_Award_Number
+    # drop NSF_Award_Number as we only need GrantID
+    df['GrantID'] = df['NSF_Award_Number'].str.extract(r'-(\d+)', expand=False)
+    df = df.drop(columns=['NSF_Award_Number'])
+
+    #Check that all rows will be uploaded into db: in raw file 1309518 rows
+    num_observations = df.shape[0]
+    print(num_observations, "rows of 1309518 rows in the raw file will be loaded into the db")
+
+    return df
+
+# load_scinet() assigns the SciSciNet_Link_NSF column names, so only that file is loaded
+files = [f for f in listdir(scinet_path)
+         if isfile(join(scinet_path, f)) and f == os.path.basename(filepath_nsf)]
+
+
+con = sqlite.connect(database = db_file, isolation_level= None)
+with con:
+    for (i,f) in enumerate(files):
+        df = load_scinet(scinet_path+f)
+        #print(df.head())
+        if i==0:
+            if_exists_opt="replace"
+        else:
+            if_exists_opt="append"
+
+        df.to_sql("scinet_links_nsf",
+                  con=con,
+                  if_exists=if_exists_opt,
+                  index=False,
+                  # schema= names a database schema; the column types go in dtype=
+                  dtype={"PaperID": "INTEGER",
+                         "Type": "TEXT",
+                         "GrantID": "TEXT",
+                         "Diff_ZScore": "NUMERIC"}
+                  )
+
+    # Make index and clean up:
+    # Serves as check that only unique observations part of the dataframe
+    con.execute("CREATE UNIQUE INDEX idx_scinet_grantpaper ON scinet_links_nsf (GrantID ASC, PaperID ASC)")
+
+    analyze_db(con)
+
+con.close()
diff --git a/src/dataprep/main/prep_nsf/test_name_similarity.csv b/src/dataprep/main/prep_nsf/test_name_similarity.csv
new file mode 100644
index 0000000..98f68a1
--- /dev/null
+++ b/src/dataprep/main/prep_nsf/test_name_similarity.csv
@@ -0,0 +1,101 @@
+"NormalizedName","PIFullName","mag_firstname","mag_lastname","mag_middlename","nsf_firstname","nsf_lastname","nsf_middlename","firstname_similarity_osa","lastname_similarity_osa","firstname_similarity_lv","lastname_similarity_lv","firstname_similarity_dl","lastname_similarity_dl","firstname_similarity_lcs","lastname_similarity_lcs","firstname_similarity_qgram","lastname_similarity_qgram","firstname_similarity_cosine","lastname_similarity_cosine","firstname_similarity_jac","lastname_similarity_jac","firstname_similarity_jw0","lastname_similarity_jw0","firstname_similarity_jw.1","lastname_similarity_jw.1","firstname_similarity_jw.2","lastname_similarity_jw.2"
+"jennifer fowler","jennifer w fowler","jennifer","fowler",NA,"jennifer","fowler","w",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+"thomas colligan","jennifer w fowler","thomas","colligan",NA,"jennifer","fowler","w",0,0.25,0,0.25,0,0.25,0,0.285714285714286,0,0.285714285714286,0,0.387298334620742,0,0.181818181818182,0,0.527777777777778,0,0.527777777777778,0,0.527777777777778
+"jaxen godfrey","jennifer w fowler","jaxen","godfrey",NA,"jennifer","fowler","w",0.25,0.285714285714286,0.25,0.285714285714286,0.25,0.285714285714286,0.461538461538462,0.307692307692308,0.461538461538462,0.615384615384615,0.645497224367903,0.617213399848368,0.375,0.444444444444444,0.658333333333333,0.531746031746032,0.6925,0.531746031746032,0.726666666666667,0.531746031746032
+"carl spangrude","jennifer w 
fowler","carl","spangrude",NA,"jennifer","fowler","w",0,0.111111111111111,0,0.111111111111111,0,0.111111111111111,0.166666666666667,0.133333333333333,0.166666666666667,0.266666666666667,0.144337567297406,0.272165526975909,0.111111111111111,0.153846153846154,0,0.425925925925926,0,0.425925925925926,0,0.425925925925926 +"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"jiyou li","daqing wan","jiyou","li",NA,"daqing","wan",NA,0,0,0,0,0,0,0.181818181818182,0,0.181818181818182,0,0.182574185835055,0,0.1,0,0.455555555555555,0,0.455555555555555,0,0.455555555555555,0 +"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"krishna kaipa","daqing wan","krishna","kaipa",NA,"daqing","wan",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.461538461538462,0.25,0.462910049886276,0.436435780471985,0.3,0.166666666666667,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111 +"jun zhang","daqing wan","jun","zhang",NA,"daqing","wan",NA,0.166666666666667,0.4,0.166666666666667,0.4,0.166666666666667,0.4,0.222222222222222,0.5,0.222222222222222,0.5,0.235702260395516,0.516397779494322,0.125,0.333333333333333,0.5,0.688888888888889,0.5,0.688888888888889,0.5,0.688888888888889 +"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"jun zhang","daqing wan","jun","zhang",NA,"daqing","wan",NA,0.166666666666667,0.4,0.166666666666667,0.4,0.166666666666667,0.4,0.222222222222222,0.5,0.222222222222222,0.5,0.235702260395516,0.516397779494322,0.125,0.333333333333333,0.5,0.688888888888889,0.5,0.688888888888889,0.5,0.688888888888889 +"daqing wan","daqing wan","daqing","wan",NA,"daqing","wan",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"alicia marino","daqing 
wan","alicia","marino",NA,"daqing","wan",NA,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.333333333333333,0.444444444444444,0.333333333333333,0.444444444444444,0.516397779494322,0.471404520791032,0.25,0.285714285714286,0.555555555555555,0.666666666666667,0.555555555555555,0.666666666666667,0.555555555555555,0.666666666666667 +"angela robinson","daqing wan","angela","robinson",NA,"daqing","wan",NA,0,0.125,0,0.125,0,0.125,0.5,0.181818181818182,0.5,0.181818181818182,0.577350269189626,0.333333333333333,0.375,0.125,0.444444444444444,0.486111111111111,0.444444444444444,0.486111111111111,0.444444444444444,0.486111111111111 +"tim lai","daqing wan","tim","lai",NA,"daqing","wan",NA,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.166666666666667,0.333333333333333,0.222222222222222,0.333333333333333,0.222222222222222,0.333333333333333,0.235702260395516,0.333333333333333,0.125,0.2,0.5,0.555555555555555,0.5,0.555555555555555,0.5,0.555555555555555 +"nathan beckmann","nathan beckmann","nathan","beckmann",NA,"nathan","beckmann",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"brian c schwedock","nathan beckmann","brian","schwedock","c","nathan","beckmann",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.352941176470588,0.363636363636364,0.352941176470588,0.565685424949238,0.381385035698237,0.285714285714286,0.25,0.577777777777778,0.324074074074074,0.577777777777778,0.324074074074074,0.577777777777778,0.324074074074074 +"george yin","hongwei zhang","george","yin",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.461538461538462,0.25,0.461538461538462,0.25,0.597614304667197,0.258198889747161,0.375,0.142857142857143,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111 +"jayanthi rao","hongwei 
zhang","jayanthi","rao",NA,"hongwei","zhang",NA,0.125,0.2,0.125,0.2,0.125,0.2,0.266666666666667,0.25,0.4,0.25,0.358568582800318,0.258198889747161,0.272727272727273,0.142857142857143,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111 +"chuan li","hongwei zhang","chuan","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.333333333333333,0,0.333333333333333,0,0.338061701891406,0,0.2,0,0.561904761904762,0,0.561904761904762,0,0.561904761904762,0 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"le yi wang","hongwei zhang","le","wang","yi","hongwei","zhang",NA,0.142857142857143,0.6,0.142857142857143,0.6,0.142857142857143,0.6,0.222222222222222,0.666666666666667,0.222222222222222,0.666666666666667,0.267261241912424,0.670820393249937,0.125,0.5,0,0.783333333333333,0,0.783333333333333,0,0.783333333333333 +"george yin","hongwei zhang","george","yin",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.461538461538462,0.25,0.461538461538462,0.25,0.597614304667197,0.258198889747161,0.375,0.142857142857143,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111 +"jayanthi rao","hongwei zhang","jayanthi","rao",NA,"hongwei","zhang",NA,0.125,0.2,0.125,0.2,0.125,0.2,0.266666666666667,0.25,0.4,0.25,0.358568582800318,0.258198889747161,0.272727272727273,0.142857142857143,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111 +"chuan li","hongwei zhang","chuan","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.333333333333333,0,0.333333333333333,0,0.338061701891406,0,0.2,0,0.561904761904762,0,0.561904761904762,0,0.561904761904762,0 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"le yi wang","hongwei 
zhang","le","wang","yi","hongwei","zhang",NA,0.142857142857143,0.6,0.142857142857143,0.6,0.142857142857143,0.6,0.222222222222222,0.666666666666667,0.222222222222222,0.666666666666667,0.267261241912424,0.670820393249937,0.125,0.5,0,0.783333333333333,0,0.783333333333333,0,0.783333333333333 +"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0 +"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0 +"yang zheng","hongwei zhang","yang","zheng",NA,"hongwei","zhang",NA,0.285714285714286,0.8,0.285714285714286,0.8,0.285714285714286,0.8,0.363636363636364,0.8,0.363636363636364,0.8,0.377964473009227,0.8,0.222222222222222,0.666666666666667,0.595238095238095,0.866666666666667,0.595238095238095,0.893333333333333,0.595238095238095,0.92 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"jianqiang wang","hongwei zhang","jianqiang","wang",NA,"hongwei","zhang",NA,0.111111111111111,0.6,0.111111111111111,0.6,0.111111111111111,0.6,0.25,0.666666666666667,0.375,0.666666666666667,0.487950036474267,0.670820393249937,0.3,0.5,0.502645502645503,0.783333333333333,0.502645502645503,0.783333333333333,0.502645502645503,0.783333333333333 +"xiaohui qin","hongwei zhang","xiaohui","qin",NA,"hongwei","zhang",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.285714285714286,0.25,0.428571428571429,0.25,0.50395263067897,0.258198889747161,0.3,0.142857142857143,0.523809523809524,0.511111111111111,0.523809523809524,0.511111111111111,0.523809523809524,0.511111111111111 +"keqiang li","hongwei 
zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0 +"shengbo eben li","hongwei zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0 +"george yin","hongwei zhang","george","yin",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.461538461538462,0.25,0.461538461538462,0.25,0.597614304667197,0.258198889747161,0.375,0.142857142857143,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111,0.642857142857143,0.511111111111111 +"le yi wang","hongwei zhang","le","wang","yi","hongwei","zhang",NA,0.142857142857143,0.6,0.142857142857143,0.6,0.142857142857143,0.6,0.222222222222222,0.666666666666667,0.222222222222222,0.666666666666667,0.267261241912424,0.670820393249937,0.125,0.5,0,0.783333333333333,0,0.783333333333333,0,0.783333333333333 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"thu nguyen","hongwei zhang","thu","nguyen",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.2,0.363636363636364,0.2,0.363636363636364,0.218217890235992,0.474341649025257,0.111111111111111,0.25,0.492063492063492,0.455555555555555,0.492063492063492,0.455555555555555,0.492063492063492,0.455555555555555 +"j karl hedrick","hongwei zhang","j","hedrick","karl","hongwei","zhang",NA,0,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0.447619047619048,0,0.447619047619048,0,0.447619047619048 +"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0 +"shengbo eben li","hongwei 
zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0 +"feng gao","hongwei zhang","feng","gao",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.363636363636364,0.25,0.545454545454545,0.5,0.566946709513841,0.516397779494322,0.375,0.333333333333333,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111 +"yang zheng","hongwei zhang","yang","zheng",NA,"hongwei","zhang",NA,0.285714285714286,0.8,0.285714285714286,0.8,0.285714285714286,0.8,0.363636363636364,0.8,0.363636363636364,0.8,0.377964473009227,0.8,0.222222222222222,0.666666666666667,0.595238095238095,0.866666666666667,0.595238095238095,0.893333333333333,0.595238095238095,0.92 +"yujia wu","hongwei zhang","yujia","wu",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0,0,0,0,0,0 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"j karl hedrick","jing hua","j","hedrick","karl","jing","hua",NA,0.25,0.142857142857143,0.25,0.142857142857143,0.25,0.142857142857143,0.4,0.2,0.4,0.2,0.5,0.218217890235992,0.25,0.111111111111111,0.75,0.492063492063492,0.775,0.542857142857143,0.8,0.593650793650794 +"j karl hedrick","hongwei zhang","j","hedrick","karl","hongwei","zhang",NA,0,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0.447619047619048,0,0.447619047619048,0,0.447619047619048 +"j karl hedrick","jayanthi 
rao","j","hedrick","karl","jayanthi","rao",NA,0.125,0.142857142857143,0.125,0.142857142857143,0.125,0.142857142857143,0.222222222222222,0.2,0.222222222222222,0.2,0.316227766016838,0.218217890235992,0.142857142857143,0.111111111111111,0.708333333333333,0,0.7375,0,0.766666666666667,0 +"j karl hedrick","anthony holt","j","hedrick","karl","anthony","holt",NA,0,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.181818181818182,0,0.181818181818182,0,0.188982236504614,0,0.1,0,0.464285714285714,0,0.517857142857143,0,0.571428571428571 +"keqiang li","jing hua","keqiang","li",NA,"jing","hua",NA,0.428571428571429,0,0.428571428571429,0,0.428571428571429,0,0.545454545454545,0,0.545454545454545,0,0.566946709513841,0,0.375,0,0.464285714285714,0,0.464285714285714,0,0.464285714285714,0 +"keqiang li","hongwei zhang","keqiang","li",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.285714285714286,0,0.571428571428571,0,0.571428571428571,0,0.4,0,0,0,0,0,0,0 +"keqiang li","jayanthi rao","keqiang","li",NA,"jayanthi","rao",NA,0.125,0,0.125,0,0.125,0,0.266666666666667,0,0.4,0,0.478091443733757,0,0.272727272727273,0,0.511904761904762,0,0.511904761904762,0,0.511904761904762,0 +"keqiang li","anthony holt","keqiang","li",NA,"anthony","holt",NA,0.142857142857143,0.25,0.142857142857143,0.25,0.142857142857143,0.25,0.285714285714286,0.333333333333333,0.285714285714286,0.333333333333333,0.377964473009227,0.353553390593274,0.181818181818182,0.2,0.428571428571428,0,0.428571428571428,0,0.428571428571428,0 +"shengbo eben li","jing hua","shengbo","li","eben","jing","hua",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.363636363636364,0,0.363636363636364,0,0.377964473009227,0,0.222222222222222,0,0.595238095238095,0,0.595238095238095,0,0.595238095238095,0 +"shengbo eben li","hongwei 
zhang","shengbo","li","eben","hongwei","zhang",NA,0.285714285714286,0,0.285714285714286,0,0.285714285714286,0,0.428571428571429,0,0.714285714285714,0,0.714285714285714,0,0.555555555555556,0,0.619047619047619,0,0.619047619047619,0,0.619047619047619,0 +"shengbo eben li","jayanthi rao","shengbo","li","eben","jayanthi","rao",NA,0.125,0,0.125,0,0.125,0,0.133333333333333,0,0.266666666666667,0,0.239045721866879,0,0.166666666666667,0,0.422619047619048,0,0.422619047619048,0,0.422619047619048,0 +"shengbo eben li","anthony holt","shengbo","li","eben","anthony","holt",NA,0,0.25,0,0.25,0,0.25,0.285714285714286,0.333333333333333,0.428571428571429,0.333333333333333,0.50395263067897,0.353553390593274,0.3,0.2,0.507936507936508,0,0.507936507936508,0,0.507936507936508,0 +"feng gao","jing hua","feng","gao",NA,"jing","hua",NA,0.5,0,0.5,0,0.5,0,0.5,0.333333333333333,0.5,0.333333333333333,0.5,0.333333333333333,0.333333333333333,0.2,0.666666666666667,0,0.666666666666667,0,0.666666666666667,0 +"feng gao","hongwei zhang","feng","gao",NA,"hongwei","zhang",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.363636363636364,0.25,0.545454545454545,0.5,0.566946709513841,0.516397779494322,0.375,0.333333333333333,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111,0.595238095238095,0.511111111111111 +"feng gao","jayanthi rao","feng","gao",NA,"jayanthi","rao",NA,0.125,0.666666666666667,0.125,0.666666666666667,0.125,0.666666666666667,0.166666666666667,0.666666666666667,0.166666666666667,0.666666666666667,0.158113883008419,0.666666666666667,0.1,0.5,0.458333333333333,0.777777777777778,0.458333333333333,0.777777777777778,0.458333333333333,0.777777777777778 +"feng gao","anthony 
holt","feng","gao",NA,"anthony","holt",NA,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.181818181818182,0.285714285714286,0.181818181818182,0.285714285714286,0.333333333333333,0.288675134594813,0.111111111111111,0.166666666666667,0.464285714285714,0.527777777777778,0.464285714285714,0.527777777777778,0.464285714285714,0.527777777777778 +"yang zheng","jing hua","yang","zheng",NA,"jing","hua",NA,0.5,0.2,0.5,0.2,0.5,0.2,0.5,0.25,0.5,0.25,0.5,0.258198889747161,0.333333333333333,0.142857142857143,0.666666666666667,0.511111111111111,0.666666666666667,0.511111111111111,0.666666666666667,0.511111111111111 +"yang zheng","hongwei zhang","yang","zheng",NA,"hongwei","zhang",NA,0.285714285714286,0.8,0.285714285714286,0.8,0.285714285714286,0.8,0.363636363636364,0.8,0.363636363636364,0.8,0.377964473009227,0.8,0.222222222222222,0.666666666666667,0.595238095238095,0.866666666666667,0.595238095238095,0.893333333333333,0.595238095238095,0.92 +"yang zheng","jayanthi rao","yang","zheng",NA,"jayanthi","rao",NA,0.375,0,0.375,0,0.375,0,0.5,0,0.5,0,0.632455532033676,0,0.375,0,0.597222222222222,0,0.597222222222222,0,0.597222222222222,0 +"yang zheng","anthony holt","yang","zheng",NA,"anthony","holt",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.363636363636364,0.222222222222222,0.545454545454545,0.222222222222222,0.666666666666667,0.223606797749979,0.428571428571429,0.125,0.595238095238095,0.483333333333333,0.595238095238095,0.483333333333333,0.595238095238095,0.483333333333333 +"yujia wu","jing hua","yujia","wu",NA,"jing","hua",NA,0.2,0.333333333333333,0.2,0.333333333333333,0.2,0.333333333333333,0.444444444444444,0.4,0.444444444444444,0.4,0.447213595499958,0.408248290463863,0.285714285714286,0.25,0,0.611111111111111,0,0.611111111111111,0,0.611111111111111 +"yujia wu","hongwei zhang","yujia","wu",NA,"hongwei","zhang",NA,0,0,0,0,0,0,0.166666666666667,0,0.166666666666667,0,0.169030850945703,0,0.0909090909090909,0,0,0,0,0,0,0 +"yujia wu","jayanthi 
rao","yujia","wu",NA,"jayanthi","rao",NA,0.125,0,0.125,0,0.125,0,0.307692307692308,0,0.615384615384615,0,0.707106781186547,0,0.5,0,0.491666666666667,0,0.491666666666667,0,0.491666666666667,0 +"yujia wu","anthony holt","yujia","wu",NA,"anthony","holt",NA,0,0,0,0,0,0,0.166666666666667,0,0.333333333333333,0,0.298142396999972,0,0.222222222222222,0,0,0,0,0,0,0 +"hongwei zhang","jing hua","hongwei","zhang",NA,"jing","hua",NA,0.285714285714286,0.2,0.285714285714286,0.2,0.285714285714286,0.2,0.363636363636364,0.5,0.545454545454545,0.5,0.566946709513841,0.516397779494322,0.375,0.333333333333333,0.595238095238095,0.688888888888889,0.595238095238095,0.688888888888889,0.595238095238095,0.688888888888889 +"hongwei zhang","hongwei zhang","hongwei","zhang",NA,"hongwei","zhang",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"hongwei zhang","jayanthi rao","hongwei","zhang",NA,"jayanthi","rao",NA,0.125,0.2,0.125,0.2,0.125,0.2,0.266666666666667,0.25,0.4,0.25,0.358568582800318,0.258198889747161,0.272727272727273,0.142857142857143,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111,0.511904761904762,0.511111111111111 +"hongwei zhang","anthony holt","hongwei","zhang",NA,"anthony","holt",NA,0,0.2,0,0.2,0,0.2,0.428571428571429,0.222222222222222,0.428571428571429,0.222222222222222,0.50395263067897,0.223606797749979,0.3,0.125,0.428571428571428,0.483333333333333,0.428571428571428,0.483333333333333,0.428571428571428,0.483333333333333 +"kumaresh singh","adrian sandu","kumaresh","singh",NA,"adrian","sandu",NA,0.125,0.4,0.125,0.4,0.125,0.4,0.285714285714286,0.4,0.285714285714286,0.4,0.375,0.4,0.181818181818182,0.25,0.527777777777778,0.6,0.527777777777778,0.64,0.527777777777778,0.68 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"mohamed jardak","adrian 
sandu","mohamed","jardak",NA,"adrian","sandu",NA,0,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0.307692307692308,0.363636363636364,0.307692307692308,0.363636363636364,0.353553390593274,0.474341649025257,0.222222222222222,0.25,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778 +"meemong lee","adrian sandu","meemong","lee",NA,"adrian","sandu",NA,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.153846153846154,0,0.153846153846154,0,0.106600358177805,0,0.111111111111111,0,0.436507936507937,0,0.436507936507937,0,0.436507936507937,0 +"kumaresh singh","adrian sandu","kumaresh","singh",NA,"adrian","sandu",NA,0.125,0.4,0.125,0.4,0.125,0.4,0.285714285714286,0.4,0.285714285714286,0.4,0.375,0.4,0.181818181818182,0.25,0.527777777777778,0.6,0.527777777777778,0.64,0.527777777777778,0.68 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"kevin w bowman","adrian sandu","kevin","bowman","w","adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.363636363636364,0.363636363636364,0.363636363636364,0.316227766016838,0.365148371670111,0.25,0.222222222222222,0.577777777777778,0,0.577777777777778,0,0.577777777777778,0 +"alexandru cioaca","adrian sandu","alexandru","cioaca",NA,"adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.4,0.181818181818182,0.666666666666667,0.181818181818182,0.746202507244636,0.282842712474619,0.444444444444444,0.125,0.611111111111111,0.455555555555555,0.65,0.455555555555555,0.688888888888889,0.455555555555555 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"elias d nino ruiz","adrian 
sandu","elias","ruiz","d","adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.222222222222222,0.363636363636364,0.222222222222222,0.474341649025257,0.223606797749979,0.25,0.125,0.577777777777778,0,0.577777777777778,0,0.577777777777778,0 +"mohamed jardak","adrian sandu","mohamed","jardak",NA,"adrian","sandu",NA,0,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0.307692307692308,0.363636363636364,0.307692307692308,0.363636363636364,0.353553390593274,0.474341649025257,0.222222222222222,0.25,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778,0.436507936507937,0.577777777777778 +"meemong lee","adrian sandu","meemong","lee",NA,"adrian","sandu",NA,0.142857142857143,0,0.142857142857143,0,0.142857142857143,0,0.153846153846154,0,0.153846153846154,0,0.106600358177805,0,0.111111111111111,0,0.436507936507937,0,0.436507936507937,0,0.436507936507937,0 +"kumaresh singh","adrian sandu","kumaresh","singh",NA,"adrian","sandu",NA,0.125,0.4,0.125,0.4,0.125,0.4,0.285714285714286,0.4,0.285714285714286,0.4,0.375,0.4,0.181818181818182,0.25,0.527777777777778,0.6,0.527777777777778,0.64,0.527777777777778,0.68 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"kevin w bowman","adrian sandu","kevin","bowman","w","adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.363636363636364,0.363636363636364,0.363636363636364,0.363636363636364,0.316227766016838,0.365148371670111,0.25,0.222222222222222,0.577777777777778,0,0.577777777777778,0,0.577777777777778,0 +"ahmed attia","adrian sandu","ahmed","attia",NA,"adrian","sandu",NA,0.166666666666667,0,0.166666666666667,0,0.166666666666667,0,0.363636363636364,0.2,0.363636363636364,0.2,0.474341649025257,0.298142396999972,0.25,0.142857142857143,0.455555555555555,0.466666666666667,0.51,0.466666666666667,0.564444444444444,0.466666666666667 +"adrian sandu","adrian 
sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"vishwas rao","adrian sandu","vishwas","rao",NA,"adrian","sandu",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.307692307692308,0.25,0.353553390593274,0.258198889747161,0.222222222222222,0.142857142857143,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"vishwas rao","adrian sandu","vishwas","rao",NA,"adrian","sandu",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.307692307692308,0.25,0.353553390593274,0.258198889747161,0.222222222222222,0.142857142857143,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"haiyan cheng","adrian sandu","haiyan","cheng",NA,"adrian","sandu",NA,0.333333333333333,0,0.333333333333333,0,0.333333333333333,0,0.666666666666667,0.2,0.666666666666667,0.2,0.75,0.2,0.428571428571429,0.111111111111111,0.777777777777778,0.466666666666667,0.777777777777778,0.466666666666667,0.777777777777778,0.466666666666667 +"adrian sandu","adrian sandu","adrian","sandu",NA,"adrian","sandu",NA,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +"ahmed attia","adrian sandu","ahmed","attia",NA,"adrian","sandu",NA,0.166666666666667,0,0.166666666666667,0,0.166666666666667,0,0.363636363636364,0.2,0.363636363636364,0.2,0.474341649025257,0.298142396999972,0.25,0.142857142857143,0.455555555555555,0.466666666666667,0.51,0.466666666666667,0.564444444444444,0.466666666666667 +"vishwas rao","adrian 
sandu","vishwas","rao",NA,"adrian","sandu",NA,0.142857142857143,0.2,0.142857142857143,0.2,0.142857142857143,0.2,0.307692307692308,0.25,0.307692307692308,0.25,0.353553390593274,0.258198889747161,0.222222222222222,0.142857142857143,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111,0.53968253968254,0.511111111111111 diff --git a/src/dataprep/pipeline.sh b/src/dataprep/pipeline.sh index 8431c27..2b09a04 100644 --- a/src/dataprep/pipeline.sh +++ b/src/dataprep/pipeline.sh @@ -143,9 +143,10 @@ Rscript -e "rmarkdown::render('$script_path/reports/quality_linking_advisors.Rmd # ## 3. Link NSF grants to MAG advisors bash $script_path/link/grants.sh $logfile_path -# XXX adapt for grants - use mona train -#python -m $script_path.link.write_csv_links --linking_type "advisors" --train_name "christoph_degree0" \ -# &> $logfile_path/write_csv_links_advisors.log +# XXX adapt for grants: links ScisciNet (nsf) to mag + +Rscript $script_path.prep_nsf.link_scinetnsf_to_mag.R &> \ + &> $logfile_path/write_csv_links_grants.log Rscript -e "rmarkdown::render('$script_path/reports/quality_linking_grants.Rmd', output_dir = '$output_path')" \ &> $logfile_path/quality_linking_grants.log