-
Notifications
You must be signed in to change notification settings - Fork 11
/
pubmedXML.R
85 lines (84 loc) · 5.14 KB
/
pubmedXML.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
## Clean PubMed XML returned from either the reutils or rentrez packages and
## save the cleaned XML to a new file.
##
## Every XML declaration, every PubmedArticleSet DOCTYPE and every
## <PubmedArticleSet> open/close tag found in the input is removed, and the
## remaining content is wrapped in a single prolog plus a single
## <PubmedArticleSet> root element.
##
## Arguments:
##   infile  - path of the raw XML file to read
##   outfile - path the cleaned XML is written to
## Returns: the cleaned XML as a single character string.
clean_api_xml <- function(infile, outfile) {
  ## prolog written at the top of the cleaned document (unchanged from the
  ## original implementation, so the output header stays the same)
  xml_decl <- "<?xml version=\"1.0\" ?>"
  doctype <- "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st January 2019//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd\">"

  ## read the whole file as one string; the size is given in bytes
  theData <- readChar(infile, file.info(infile)$size, useBytes = TRUE)

  ## strip any XML declaration, not only the exact `version="1.0" ?` spelling
  ## (e.g. one carrying an encoding attribute)
  theData <- gsub("<\\?xml\\s[^>]*\\?>", "", theData)
  ## strip the PubmedArticleSet DOCTYPE whatever DTD release it names;
  ## matching only the 2019 string left newer DOCTYPEs inside the document
  theData <- gsub("<!DOCTYPE\\s+PubmedArticleSet[^>]*>", "", theData, useBytes = TRUE)
  theData <- gsub("<PubmedArticleSet>", "", theData, fixed = TRUE)
  theData <- gsub("</PubmedArticleSet>", "", theData, fixed = TRUE)

  ## drop literal <U+XXXX> escapes, including the 8-digit <U+XXXXXXXX> form
  ## note: with some files this doesn't catch everything; potential issue
  ## with <OtherAbstract> tags especially
  theData <- gsub("<U\\+\\w{4,8}>", "", theData)

  ## re-wrap the content in exactly one prolog and one root element
  theData <- paste(xml_decl, doctype, "<PubmedArticleSet>", theData, "</PubmedArticleSet>", sep = "\n")

  ## convert to UTF-8, dropping anything that cannot be converted
  theData <- iconv(theData, to = "UTF-8", sub = "")
  writeLines(theData, outfile, sep = " ")
  theData
}
## Extract a data frame from the cleaned XML, one row per <PubmedArticle>.
## Note: does not handle <PubmedBookArticle> documents.
##
## Arguments:
##   theFile - the cleaned XML, passed straight to XML::xmlParse()
## Returns: a data frame of character columns
##   pmid, doi, authors, year, articletitle, journal, volume, issue, pages,
##   abstract, meshHeadings, chemNames, grantAgency, grantNumber,
##   grantCountry, nctID, ptype
## Fields that can repeat within a record are collapsed with "|".
## A field that is absent from a record is NA in every column.
extract_xml <- function(theFile) {
  ## kept as library() so scripts that rely on XML being attached after this
  ## call keep working
  library(XML)
  newData <- xmlParse(theFile)
  records <- getNodeSet(newData, "//PubmedArticle")

  ## text of every node matching `path` below `node`; character(0) when
  ## nothing matches (xpathSApply returns an empty list in that case)
  node_values <- function(node, path) {
    as.character(unlist(xpathSApply(node, path, xmlValue)))
  }

  ## one value per record: the first match, or NA when there is none, so the
  ## column length always equals the number of records
  first_value <- function(path) {
    vapply(records, function(record) {
      vals <- node_values(record, path)
      if (length(vals) > 0) vals[[1]] else NA_character_
    }, character(1), USE.NAMES = FALSE)
  }

  ## one string per record: all matches joined with "|", optionally
  ## de-duplicated first; NA when there is no match
  joined_values <- function(path, dedupe = FALSE) {
    vapply(records, function(record) {
      vals <- node_values(record, path)
      if (length(vals) == 0) {
        return(NA_character_)
      }
      if (dedupe) {
        vals <- unique(vals)
      }
      paste(vals, collapse = "|")
    }, character(1), USE.NAMES = FALSE)
  }

  pmid <- first_value("./MedlineCitation/PMID")
  doi <- first_value(".//ELocationID[@EIdType = \"doi\"]")

  ## build "LastName Initials" from each <Author> node on its own, so an
  ## author lacking one of the two parts cannot shift the parts of the others
  authors <- vapply(records, function(record) {
    author_nodes <- getNodeSet(record, ".//Author")
    if (length(author_nodes) == 0) {
      return(NA_character_)
    }
    author_names <- vapply(author_nodes, function(author) {
      name_parts <- c(node_values(author, "./LastName"), node_values(author, "./Initials"))
      paste(name_parts, collapse = " ")
    }, character(1), USE.NAMES = FALSE)
    ## authors with neither part (e.g. collective names) are skipped
    author_names <- author_names[nzchar(author_names)]
    if (length(author_names) == 0) {
      return(NA_character_)
    }
    paste(author_names, collapse = "|")
  }, character(1), USE.NAMES = FALSE)

  ## affiliations <- joined_values(".//Author/AffiliationInfo/Affiliation")

  ## prefer <Year>; fall back to <MedlineDate>, then keep only the leading
  ## token (text before the first space or hyphen)
  year <- first_value(".//PubDate/Year")
  medline_date <- first_value(".//PubDate/MedlineDate")
  no_year <- is.na(year)
  year[no_year] <- medline_date[no_year]
  year <- gsub(" .+", "", year)
  year <- gsub("-.+", "", year)

  articletitle <- first_value(".//ArticleTitle")
  journal <- first_value(".//ISOAbbreviation")
  volume <- first_value(".//JournalIssue/Volume")
  issue <- first_value(".//JournalIssue/Issue")
  pages <- first_value(".//MedlinePgn")

  abstract <- joined_values(".//Abstract/AbstractText")
  meshHeadings <- joined_values(".//DescriptorName")
  chemNames <- joined_values(".//NameOfSubstance")
  grantAgency <- joined_values(".//Grant/Agency", dedupe = TRUE)
  grantNumber <- joined_values(".//Grant/GrantID")
  grantCountry <- joined_values(".//Grant/Country", dedupe = TRUE)
  nctID <- joined_values(".//DataBank[DataBankName = 'ClinicalTrials.gov']/AccessionNumberList/AccessionNumber")
  ptype <- joined_values(".//PublicationType")

  theDF <- data.frame(pmid, doi, authors, year, articletitle, journal, volume, issue, pages, abstract, meshHeadings, chemNames, grantAgency, grantNumber, grantCountry, nctID, ptype, stringsAsFactors = FALSE)
  theDF
}