-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqvr_processing.r
144 lines (140 loc) · 8.84 KB
/
qvr_processing.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
## Clean a QVR application export and add derived columns.
##
## Arguments:
##   the_data   data frame of QVR application records.
##   id_column  name of the column that identifies an application.
##   hd_branch  when TRUE (and a PCC column is present) add a `branch` column.
##
## Steps, each applied only when the columns it needs are present:
##   * exact duplicate rows are dropped; rows that share an id are collapsed to
##     one row per id, with every column that differs within an id rewritten as
##     the ";"-separated unique values for that id
##   * is_animal / is_human / is_trial / is_awarded / inst_type_desc / branch /
##     nofo_number / core_project_num are derived from the source columns
##   * column names are lower-cased and runs of "." become "_"
##   * "-" values become NA and columns that are entirely NA are removed
##
## Returns the processed data frame.
process_qvr_data <- function(the_data, id_column = "Appl.Id", hd_branch = TRUE) {
  the_data <- unique(the_data)
  id_values <- the_data[, id_column]

  if (anyDuplicated(id_values) > 0) {
    groups <- split(the_data, id_values)
    multi_row <- which(vapply(groups, nrow, integer(1)) > 1)
    # For every id with several rows, find the columns whose values differ.
    varying <- lapply(multi_row, function(g) {
      grp <- groups[[g]]
      differs <- vapply(grp, function(col) length(unique(col)) > 1, logical(1))
      colnames(grp)[differs]
    })
    dup_cols <- unique(unlist(varying))
    the_data <- the_data[!duplicated(id_values), ]
    ## groups is ordered by id_column, so the_data must be ordered the same way
    ## to ensure the collapsed values line up with the right rows
    the_data <- the_data[order(the_data[, id_column]), ]
    for (col in dup_cols) {
      the_data[, col] <- vapply(
        groups,
        function(grp) paste(unique(grp[, col]), collapse = ";"),
        character(1),
        USE.NAMES = FALSE
      )
    }
  }

  cols <- colnames(the_data)

  if ("Animal" %in% cols) {
    the_data$is_animal <- ifelse(grepl("^10$|^35$|^9[8-9]$|^N$", the_data$Animal), "N", "Y")
    the_data$is_animal[the_data$Animal == "-"] <- "U"
  }

  if ("Human" %in% cols) {
    the_data$is_human <- ifelse(grepl("^10$|^9[5-9]$|^N$", the_data$Human), "N", "Y")
    the_data$is_human[the_data$Human == "-"] <- "U"
  }

  if (all(c("Clinical.Trial", "Phase.3.Trials") %in% cols)) {
    # Vectorised: a row is a trial when either flag is "1" or "Y".
    in_trial <- the_data$Clinical.Trial %in% c("1", "Y") |
      the_data$Phase.3.Trials %in% c("1", "Y")
    the_data$is_trial <- ifelse(in_trial, "Y", "N")
  }

  if ("Stat.Grp" %in% cols) {
    the_data$is_awarded <- ifelse(the_data$Stat.Grp %in% c("A", "TP", "U", "T"), "Y", "N")
  }

  if ("Inst.Type" %in% cols) {
    inst_codes <- c(10, 20, 30, 40)
    inst_labels <- c(
      "Higher education", "Research organization",
      "Independent hospital", "Other education"
    )
    inst_desc <- inst_labels[match(the_data$Inst.Type, inst_codes)]
    # Any code outside the table (including NA) is labelled "Other".
    inst_desc[is.na(inst_desc)] <- "Other"
    the_data$inst_type_desc <- inst_desc
  }

  if (hd_branch == TRUE && "PCC" %in% cols) {
    # The branch code is the part of PCC before the first space or hyphen.
    branch <- gsub("[ -].+", "", the_data$PCC)
    # Recode listed codes to the branch named on the left; applied in order.
    branch_map <- list(
      CDBB = c("HLB", "CDB"),
      CRB = c("CE", "CD", "CRE", "CARE", "CRH", "CDDB"),
      DBCAB = c("GT", "DBGT", "DBSVB"),
      FI = c("RS"),
      IDDB = c("MRDD", "IDD"),
      MPIDB = c("PAMA"),
      NCMRR = c("CP", "ARMR", "BRMR", "TSR", "BSCD", "SMAD", "BSRT", "BSRE"),
      OPPTB = c("OPP"),
      PDB = c("DBS", "EA", "OHE"),
      PGNB = c("NE", "ENG"),
      PPB = c("PP"),
      PTCIB = c("PCCR")
    )
    for (new_code in names(branch_map)) {
      branch[branch %in% branch_map[[new_code]]] <- new_code
    }
    the_data$branch <- branch
  }

  if ("RFA.PA.Number" %in% cols) {
    nofo <- gsub("(PA)(\\d{2})", "\\1-\\2", the_data$RFA.PA.Number)
    nofo <- gsub("(PA[RS])(\\d{2})", "\\1-\\2", nofo)
    # OTA must be hyphenated before the generic two-letter RFA rule runs;
    # otherwise "OTA20" is matched as "TA20" and becomes "ORFA-TA-20".
    nofo <- gsub("(OTA)(\\d{2})", "\\1-\\2", nofo)
    nofo <- gsub("([A-Z]{2})(\\d{2})", "RFA-\\1-\\2", nofo)
    the_data$nofo_number <- nofo
  }

  if (all(c("Actv", "Project") %in% cols)) {
    the_data$core_project_num <- paste0(the_data$Actv, gsub("-.+", "", the_data$Project))
  }

  new_names <- tolower(colnames(the_data))
  new_names <- gsub("\\.+", "_", new_names)
  colnames(the_data) <- gsub("_$", "", new_names)

  the_data[the_data == "-"] <- NA

  all_na <- which(vapply(the_data, function(col) all(is.na(col)), logical(1)))
  # Only assign when there is something to drop; skips an empty-index assignment.
  if (length(all_na) > 0) {
    the_data[unname(all_na)] <- NULL
  }

  the_data
}
# Usage
# appls <- read.csv("qvr_application_data.csv", stringsAsFactors = FALSE)
# appls <- process_qvr_data(appls)
## Before running this on a file, open the file in Notepad++ and replace every [SUB] box character with a space " ".
## This function also assumes the abstract data has exactly two columns [appl_id, abstracts], so it only works for data in that structure.
## Repair a two-column abstracts CSV whose abstract text contains stray
## "," sequences that would be read as extra column breaks.
##
## Arguments:
##   filepath   path of the CSV file to clean.
##   overwrite  when TRUE the cleaned lines replace `filepath`; otherwise they
##              are written next to it with "_cleaned" inserted before the
##              file extension (or appended when there is no extension).
##
## The file is read as raw lines, converted to ASCII (unconvertible bytes
## become a space), every "," after the first one on a line is replaced by a
## space, and lines that do not already end in ", are given that ending.
## Emits the message "Done" when finished.
clean_abstracts <- function(filepath, overwrite = FALSE) {
  abstract_lines <- scan(filepath, what = "varchar", sep = "\n", quiet = TRUE, skipNul = TRUE)
  # iconv is vectorised, so no per-element apply is needed.
  abstract_lines <- iconv(abstract_lines, to = "ASCII", sub = " ")

  # Each pass removes the last surplus "," on a line, so repeat until none remain.
  extra_break <- "\",\".+\",\".+"
  while (any(grepl(extra_break, abstract_lines))) {
    abstract_lines <- gsub("(\",\".+)(\",\")(.+)", "\\1 \\3", abstract_lines)
  }

  abstract_lines <- gsub("(?<!\",)$", "\",", abstract_lines, perl = TRUE)

  if (overwrite == TRUE) {
    out_path <- filepath
  } else {
    # Always produce a different path than the input, so a file whose
    # extension is upper-case, unusually short/long, or missing is never
    # overwritten when overwrite = FALSE.
    ext_pattern <- "(\\.[[:alnum:]]+)$"
    if (grepl(ext_pattern, filepath)) {
      out_path <- sub(ext_pattern, "_cleaned\\1", filepath)
    } else {
      out_path <- paste0(filepath, "_cleaned")
    }
  }

  writeLines(abstract_lines, con = out_path)
  message("Done")
}
## what I'm essentially doing here is reading in the csv as a vector of character strings so that the extra commas don't hurt anything
## I then look for the "," string of characters after the first column (i.e. a second occurrence of that string, which indicates an incorrect column break)
## I then gsub the second "," substring out of the main string by retaining just the substrings on either side of the problem substring
## This means I need to do that multiple times if there are multiple "," substrings in the main string, thus the 'while' loop
## Standardise organisation names.
##
## Argument:
##   x  character vector of organisation names (upper-case abbreviations and
##      names are what the rules below match).
##
## The rules are regular-expression substitutions applied in the order listed:
## abbreviations are expanded first, then generic clean-up is done, and finally
## specific institution names are mapped to a canonical form. The later rules
## are written against the already-expanded text, so the order matters.
##
## Returns the cleaned character vector (visibly, so it prints at the console).
clean_org_names <- function(x) {
  rules <- rbind(
    # Abbreviation expansion
    c("\\bUNIV\\b", "UNIVERSITY"),
    c("\\bHOSP\\b", "HOSPITAL"),
    c("\\bRES\\b", "RESEARCH"),
    c("\\bINST\\b", "INSTITUTE"),
    c("\\bMED\\b", "MEDICAL"),
    c("\\bBR\\b", "BRANCH"),
    c("\\bCOLL\\b", "COLLEGE"),
    c("\\bHLTH\\b", "HEALTH"),
    c("\\bSCIS*\\b", "SCIENCE"),
    c("\\bTX\\b", "TEXAS"),
    c("\\bCTR\\b", "CENTER"),
    c("\\bSCH\\b", "SCHOOL"),
    c("\\bCOL\\b", "COLLEGE"),
    # Generic clean-up. The word boundary keeps ", INCORPORATED" from being
    # cut down to "ORPORATED".
    c(", INC\\b", ""),
    # Disabled rule kept for reference:
    # c(";LOAN REPAYMENT APPLICATIONS$", ""),
    c("\\(.+\\)", ""),
    c(" AT ", ", "),
    # Institution-specific canonical names
    c("COLUMBIA UNIVERSITY HEALTH SCIENCES", "COLUMBIA UNIVERSITY"),
    c("COLUMBIA UNIVERSITY NEW YORK MORNINGSIDE", "COLUMBIA UNIVERSITY"),
    c("MOUNT SINAI SCHOOL OF MEDICINE OF CUNY", "MOUNT SINAI SCHOOL OF MEDICINE"),
    c("CUNY GRADUATE SCHOOL AND UNIVERSITY CENTER", "CITY UNIVERSITY OF NEW YORK"),
    c("TRUSTEES OF ", ""),
    c("NEW YORK STATE PSYCHIATRIC INSTITUTE dba RESEARCH FOUNDATION FOR MENTAL HYGIENE", "NEW YORK STATE PSYCHIATRIC INSTITUTE"),
    c("UNIVERSITY OF VERMONT & ST AGRIC COLLEGE", "UNIVERSITY OF VERMONT"),
    c("REHABILITATION INSTITUTE OF CHICAGO D/B/A SHIRLEY RYAN ABILITYLAB", "REHABILITATION INSTITUTE OF CHICAGO"),
    c("SANFORD RESEARCH/USD", "SANFORD RESEARCH"),
    c("SANFORD BURNHAM PREBYS MEDICAL DISCOVERY INSTITUTE", "SANFORD RESEARCH"),
    c("UNIVERSITY OF CONNECTICUT SCHOOL OF MEDICAL/DNT", "UNIVERSITY OF CONNECTICUT"),
    c("UNIVERSITY OF MEDICAL/DENT OF NJ-NJ MEDICAL SCHOOL", "UNIVERSITY OF MEDICINE AND DENTISTRY OF NEW JERSEY"),
    c("UNIVERSITY OF MEDICAL/DENT NJ-R W JOHNSON MEDICAL SCHOOL", "UNIVERSITY OF MEDICINE AND DENTISTRY OF NEW JERSEY"),
    c("CHILDREN'S HOSPITAL PITTSBURGH/UPMC HEALTH SYS", "CHILDREN'S HOSPITAL PITTSBURGH"),
    c("LUNDQUIST INSTITUTE FOR BIOMEDICAL INNOVATION, HARBOR-UCLA MEDICAL CENTER", "LUNDQUIST INSTITUTE FOR BIOMEDICAL INNOVATION"),
    c("U.S. NATIONAL INSTITUTE/CHILD HEALTH/HUMAN DEV", "NATIONAL INSTITUTE CHILD HEALTH HUMAN"),
    c("EUNICE KENNEDY SHRIVER CENTER MTL RETARDATN", "NATIONAL INSTITUTE CHILD HEALTH HUMAN"),
    c("VIRGINIA POLYTECHNIC INSTITUTE AND ST UNIVERSITY", "VIRGINIA POLYTECHNIC INSTITUTE"),
    c("MAGEE-WOMEN'S HOSPITAL OF UPMC", "MAGEE-WOMEN'S HOSPITAL"),
    c("BOSTON UNIVERSITY MEDICAL CAMPUS", "BOSTON UNIVERSITY"),
    c("RUTGERS, THE STATE UNIVERSITY OF N.J.", "RUTGERS UNIVERSITY"),
    c("SLOAN-KETTERING INSTITUTE CAN RESEARCH", "SLOAN-KETTERING INSTITUTE CANCER RESEARCH"),
    c("TEMPLE UNIVERSITY OF THE COMMONWEALTH", "TEMPLE UNIVERSITY"),
    c("CLEVELAND CLINIC LERNER COM-CWRU", "CLEVELAND CLINIC"),
    c("LSU PENNINGTON BIOMEDICAL RESEARCH CENTER", "LOUISIANA STATE UNIVERSITY"),
    c("UNIVERSITY OF TEXAS MD ANDERSON CAN CENTER", "UNIVERSITY OF TEXAS MD ANDERSON CANCER CENTER"),
    c("RBHS-ROBERT WOOD JOHNSON MEDICAL SCHOOL", "ROBERT WOOD JOHNSON MEDICAL SCHOOL"),
    c("HENRY M. JACKSON FDN FOR THE ADV MIL/MEDICAL", "HENRY M. JACKSON FOUNDATION")
  )

  for (i in seq_len(nrow(rules))) {
    x <- gsub(rules[i, 1], rules[i, 2], x)
  }

  # Return the value itself rather than ending on an assignment, which would
  # return it invisibly.
  x
}