build-citations.py
from lxml import etree as ElementTree
import htmlentitydefs
import csv
import operator
import re
import scholar
# import gzip
generateLog = True
parser = ElementTree.XMLParser(attribute_defaults=True, load_dtd=True)
# Papers must be at least 4 pages long to count.
pageCountThreshold = 4
# Match ordinary page numbers (as in 10-17).
pageCounterNormal = re.compile(r'(\d+)-(\d+)')
# Match page numbers in the form volume:page (as in 12:140-12:150).
pageCounterColon = re.compile(r'[0-9]+:([1-9][0-9]*)-[0-9]+:([1-9][0-9]*)')
querier = scholar.ScholarQuerier()
settings = scholar.ScholarSettings()
query = scholar.SearchScholarQuery()
def pagecount(input):
    pageCounterMatcher1 = pageCounterNormal.match(input)
    pageCounterMatcher2 = pageCounterColon.match(input)
    start = 0
    end = 0
    count = 0
    if pageCounterMatcher1 is not None:
        start = int(pageCounterMatcher1.group(1))
        end = int(pageCounterMatcher1.group(2))
        count = end - start + 1
    elif pageCounterMatcher2 is not None:
        start = int(pageCounterMatcher2.group(1))
        end = int(pageCounterMatcher2.group(2))
        count = end - start + 1
    return count
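# For illustration, the expected behavior of pagecount() on the two formats above:
#   pagecount("10-17")          -> 8    (matched by pageCounterNormal)
#   pagecount("12:140-12:150")  -> 11   (matched by pageCounterColon)
#   pagecount("vii-xii")        -> 0    (neither regex matches, so the count stays 0)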
areadict = {
    'proglang' : ['POPL', 'PLDI', 'OOPSLA'],
    'logic' : ['CAV', 'LICS'],
    'softeng' : ['ICSE', 'SIGSOFT FSE', 'ESEC/SIGSOFT FSE'],
    'opsys' : ['SOSP', 'OSDI', 'EuroSys'],
    'arch' : ['ISCA', 'MICRO', 'ASPLOS'],
    'theory' : ['STOC', 'FOCS'],
    'networks' : ['SIGCOMM', 'INFOCOM', 'NSDI'],
    'security' : ['IEEE Symposium on Security and Privacy', 'ACM Conference on Computer and Communications Security', 'USENIX Security Symposium', 'NDSS'],
    'mlmining' : ['NIPS', 'ICML', 'KDD'],
    'ai' : ['AAAI', 'IJCAI'],
    'database' : ['PODS', 'VLDB', 'PVLDB', 'SIGMOD Conference'],
    'graphics' : ['ACM Trans. Graph.', 'SIGGRAPH'],
    'metrics' : ['SIGMETRICS', 'IMC'],
    'web' : ['WWW', 'SIGIR'],
    'hci' : ['CHI', 'UbiComp', 'UIST'],
    'nlp' : ['EMNLP', 'ACL', 'NAACL'],
    'vision' : ['CVPR', 'ICCV'],
    'mobile' : ['MobiSys', 'MobiCom', 'SenSys'],
    'robotics' : ['ICRA', 'IROS', 'Robotics: Science and Systems']
}
# Build a dictionary mapping conferences to areas.
# e.g., confdict['CVPR'] = 'vision'.
confdict = {}
for k, v in areadict.items():
    for item in v:
        confdict[item] = k
# The list of all areas.
arealist = areadict.keys()
# Consider pubs in this range only.
startyear = 2000
endyear = 2016
outputfname = "citations.csv"
def parseDBLP(facultydict):
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    output = open(outputfname, mode='w')

    with open('dblp.xml', mode='r') as f:
    # with gzip.open('dblp.xml.gz') as f:
        oldnode = None

        for (event, node) in ElementTree.iterparse(f, events=['start', 'end']):
            # Clear the previously processed node to keep memory usage bounded.
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            foundArticle = False
            inRange = False
            authorsOnPaper = 0
            authorName = ""
            title = ""
            confname = ""
            year = -1

            if node.tag == 'inproceedings' or node.tag == 'article':

                # First, check if this is one of the conferences we are looking for.
                for child in node:
                    if child.tag == 'booktitle' or child.tag == 'journal':
                        if child.text in confdict:
                            foundArticle = True
                            confname = child.text
                        break

                if not foundArticle:
                    # Nope.
                    continue

                # It's a booktitle or journal, and it's one of our conferences.
                # Check that the date is in the specified range.
                for child in node:
                    if child.tag == 'year':  # and type(child.text) is str
                        year = int(child.text)
                        if (year >= startyear) and (year <= endyear):
                            inRange = True
                        break

                if not inRange:
                    # Out of range.
                    continue

                # Count the number of pages. It needs to exceed our threshold to be considered.
                pageCount = -1
                for child in node:
                    if child.tag == 'pages':
                        pageCount = pagecount(child.text)

                if (pageCount > 1) and (pageCount < pageCountThreshold):
                    # Only skip papers with a very small page count,
                    # but above 1. Why? DBLP has real papers with
                    # incorrect page counts - usually a truncated
                    # single page. -1 means no pages were found at all,
                    # i.e., some problem with journal entries in DBLP.
                    # print "Skipping article with "+str(pageCount)+" pages."
                    continue

                # If we got here, we have a winner.
                # Grab an author and the title.
                for child in node:
                    if child.tag == 'author':
                        authorName = child.text.strip()
                        break
                for child in node:
                    if child.tag == 'title':
                        title = child.text.strip()
                        break

                # Look the paper up on Google Scholar to fetch its citation count.
                query.set_author(authorName)
                query.set_phrase(title)
                query.set_scope(True)
                query.set_timeframe(year, year)
                query.set_num_page_results(1)
                querier.send_query(query)

                citations = 0
                if len(querier.articles) >= 1:
                    # Index 3 of attrs is assumed to be scholar.py's 'num_citations' attribute.
                    citations = int(querier.articles[0].attrs.items()[3][1][0])

                print title + "," + confname + "," + str(year) + "," + str(citations)
                # Write one CSV row per paper.
                output.write(title + "," + confname + "," + str(year) + "," + str(citations) + "\n")

    output.close()
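# For illustration, each row written to citations.csv has the form
#   title,confname,year,citations
# e.g. a hypothetical row might look like:
#   Some Paper Title,PLDI,2010,57
# Titles are written unquoted, so a comma inside a title adds an extra column.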
def csv2dict_str_str(fname):
    with open(fname, mode='r') as infile:
        reader = csv.reader(infile)
        # for rows in reader:
        #     print rows[0], "-->", rows[1]
        d = {unicode(rows[0].strip(), 'utf-8'): unicode(rows[1].strip(), 'utf-8') for rows in reader}
    return d
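# For illustration, faculty-affiliations.csv is assumed to be a plain
# two-column CSV mapping names to affiliations, e.g. a hypothetical row:
#   Jane Doe,Some University
# csv2dict_str_str('faculty-affiliations.csv') would then contain
#   {u'Jane Doe': u'Some University'}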
def sortdictionary(d):
    return sorted(d.iteritems(), key=operator.itemgetter(1), reverse=True)
facultydict = csv2dict_str_str('faculty-affiliations.csv')
parseDBLP(facultydict)