-
Notifications
You must be signed in to change notification settings - Fork 3
/
parse_plos_author_notes.Rmd
140 lines (112 loc) · 4.25 KB
/
parse_plos_author_notes.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
Build Relation between Author Initials and Roles
========================================================
Get the data from the PLOS API:
```{r}
# Load required packages
library(rplos)
library(stringr)
library(ggplot2)
# Ask the PLOS search API for every full-text research article, requesting
# only the DOI (`id`) and the `author_notes` field for each one.
result <- searchplos(
  terms = "*:*",
  fields = "id,author_notes",
  list('article_type:"research article"', 'doc_type:full'),
  limit = 100000,
  key = getOption('PlosApiKey')
)
```
Now we have to parse the author notes. The pattern we look for is
role: author list.
That is, each pair is a role, then a colon, then the authors' initials, ended by a period.
```{r}
# One "role: initials." pair: the role is everything up to a colon, the
# author list is everything after the colon up to the next period.
pair.rx <- '\\s*([^:]+):\\s+([^.:]+)[.]'
# One match matrix per article: column 1 is the whole match, column 2 the
# role, column 3 the whitespace-separated initials.
pair.matches <- str_match_all(result$author_notes, pair.rx)

# Long table with one row per (article id, role, author initials).
author.role.table <- do.call(rbind, mapply(function(id, m) {
  # Depending on the stringr version, "nothing matched" is reported either as
  # character(0) or as a zero-row matrix, and a missing (NA) note can come
  # back as a row of NAs. Dropping NA rows first lets a single emptiness
  # test cover all three cases.
  if (is.matrix(m)) {
    m <- m[!is.na(m[, 1]), , drop = FALSE]
  }
  if (length(m) == 0) {
    print(paste("no matches for", id))
    return(NULL)
  }
  # first column is the whole matched string
  role <- m[, 2]
  initials <- str_split(m[, 3], '\\s+')
  # Repeat each role once per author listed under it, so that role and
  # author line up row by row without building one data frame per pair.
  n.authors <- vapply(initials, length, integer(1))
  data.frame(id = id, role = rep(role, n.authors), author = unlist(initials))
}, result$id, pair.matches, SIMPLIFY = FALSE, USE.NAMES = FALSE))
```
# How many did we parse with this logic?
```{r}
# Fraction of downloaded articles that produced at least one parsed pair.
total.ids <- nrow(result)
parsed.ids <- length(unique(author.role.table$id))
parsed.ids / total.ids
```
Some of the exceptions are due to different conventions.
For example, PLOS Genetics seems to put the initials before the role, with no colon:
DTR, JD, TMB, MBM, and DBA conceived and designed the experiments. DTR, JD, and DBA performed the experiments. DTR, JD, LKV, and DBA analyzed the data. DTR, JD, HKT, JRF, MAP, and DBA contributed reagents/materials/analysis tools. DTR, JD, LKV, HKT, TMB, RPK, RF, MAP, NL, MBM, and DBA wrote the paper.
(from 10.1371/journal.pgen.002013)
# What if we just look at the main 5 roles?
```{r}
# The five contribution roles this analysis is restricted to.
main.roles <- c(
  'Conceived and designed the experiments',
  'Performed the experiments',
  'Analyzed the data',
  'Contributed reagents/materials/analysis tools',
  'Wrote the paper')
# Short label for each main role: its first word, lower-cased.
normalised.main.roles <- tolower(sub('^(\\w+).+$', '\\1', main.roles))
# Keep only the rows whose role is one of the main roles (compared with case
# and whitespace ignored) and attach the short label as `normalised.role`.
ar.main <- transform(
  subset(author.role.table,
         tolower(gsub('\\s', '', role)) %in% tolower(gsub('\\s', '', main.roles))),
  normalised.role = tolower(sub('^(\\w+).+$', '\\1', role)))
# Share of all parsed (role, author) rows that fall under a main role.
nrow(ar.main) / nrow(author.role.table)
```
```{r}
# Number of `author` entries per article id. Each (role, author) row counts
# once, so an author holding several roles is counted several times.
# NOTE(review): if distinct authors were intended, confirm and adjust.
ar.main.num.authors <- aggregate(
  author ~ id,
  data = ar.main,
  FUN = length
)
```
```{r}
# Count the author entries per article and role, then draw one histogram of
# those counts per role (one facet row each).
ar.main.author.counts <- aggregate(
  author ~ id + normalised.role,
  data = ar.main,
  FUN = length
)
m <- ggplot(ar.main.author.counts, aes(x = author)) +
  geom_histogram(binwidth = 1)
m + facet_grid(normalised.role ~ .)
```
```{r}
# For every (article, author) pair count the main-role rows it has, then
# plot how those counts are distributed.
ar.main.role.counts <- aggregate(
  normalised.role ~ id + author,
  data = ar.main,
  FUN = length
)
m <- ggplot(data = ar.main.role.counts, mapping = aes(x = normalised.role))
m + geom_histogram(binwidth = 1)
```
```{r}
# Pick the (article, author) pairs with exactly one main-role row and join
# them back to ar.main to recover which role that is; then plot how often
# each role is the only one.
authors.in.only.one.role <- merge(
  subset(ar.main.role.counts, normalised.role == 1, select = c(id, author)),
  ar.main,
  by = c('id', 'author')
)
#aggregate(authors.in.only.one.role)
m <- ggplot(data = authors.in.only.one.role, mapping = aes(x = normalised.role))
m + geom_bar()
```
We can then build a smaller data frame with one row per author-paper pair and dummy variables for each of the main roles.
```{r}
# Disabled draft: walks ar.main one article id at a time. The per-role dummy
# variables are not built yet (the loop over main.roles below is empty).
# by(ar.main, ar.main$id, function(r) {
# authors <- sort(unique(r$author))
# as.list(subset(r, select=c(author,role)))
# #pa.row <- data.frame(id=r$id[1], author=authors)
# #for (main.role in main.roles) {
# #}
# });
```
```r
#doi,author,conceived,performed,analyzed,contributed,wrote
# by(result, 1:nrow(result), function(r) {
# author.notes <- sub('[.]$', '', r$author_notes)
# chunks <- unlist(str_split(author.notes, '[.]\\s?'))
# if (all(str_count(chunks, ':') == 1)) {
# good.chunks <- str_split(chunks, ':\\s?')
# roles <- chunks[seq(1,length(chunks),by=2)]
# all.initials <- chunks[seq(2,length(chunks),by=2)]
# initials <- str_split(all.initials, '\\s+')
# if (length(initials) != length(roles)) {
# print(chunks)
# print(roles)
# print(initials)
# }
# names(initials) <- roles
# initials
# } else {
# NULL
# }
# #valid <- roles %in% valid.roles
# })
```