forked from emeryberger/CSrankings
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrim-dblp.py
106 lines (82 loc) · 3.37 KB
/
trim-dblp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from lxml import etree as ElementTree
import htmlentitydefs
import csv
import operator
import re
# Stand-alone lxml parser configured to load the document's DTD and to fill
# in attribute defaults declared there.
# NOTE(review): nothing in the visible code passes this object to a parse
# call -- confirm whether it is still needed.
parser = ElementTree.XMLParser(
    load_dtd=True,
    attribute_defaults=True,
)
# Papers must be at least 4 pages long to count.
pageCountThreshold = 4

# Match ordinary page ranges (as in 10-17); groups 1 and 2 are the first and
# last page.  Raw string literals are used so that regex escapes such as \d
# are never interpreted (or warned about) as string escape sequences.
pageCounterNormal = re.compile(r'(\d+)-(\d+)')

# Match page ranges in the form volume:page (as in 12:140-12:150); groups 1
# and 2 capture only the page parts (140 and 150), not the volume prefix.
pageCounterColon = re.compile(r'[0-9]+:([1-9][0-9]*)-[0-9]+:([1-9][0-9]*)')
def pagecount(input):
    """Return the number of pages described by a page-range string.

    The string is tried against ``pageCounterNormal`` ("10-17") first and
    against ``pageCounterColon`` ("12:140-12:150") second.  For whichever
    pattern matches at the start of the string, the result is
    ``last - first + 1`` computed from the two captured page numbers.

    Returns 0 when neither pattern matches.
    """
    # Order matters: the plain form takes precedence over the colon form.
    for pattern in (pageCounterNormal, pageCounterColon):
        hit = pattern.match(input)
        if hit is None:
            continue
        first_page = int(hit.group(1))
        last_page = int(hit.group(2))
        # Inclusive range, so both end points count as pages.
        return last_page - first_page + 1
    return 0
# Only publications whose year falls between these two bounds are considered.
startyear, endyear = 2000, 2016

# File that receives the trimmed XML.
outputfname = "dblp-reduced.xml"
def parseDBLP():
    """Write a reduced copy of ``dblp.xml`` to the file named ``outputfname``.

    The input is streamed with lxml's ``iterparse``.  An ``inproceedings`` or
    ``article`` element is copied to the output when both filters pass:

    * its first ``year`` child carrying text lies in
      ``[startyear, endyear]`` (inclusive);
    * its page count, as computed by ``pagecount`` from its ``pages`` child,
      is not strictly between 1 and ``pageCountThreshold``.

    The running number of kept entries is printed after each one is written.
    Returns nothing.
    """
    count = 0
    # Context managers make sure both files are closed even if parsing raises
    # part-way through (the output handle used to be closed manually and
    # leaked on error).
    with open(outputfname, 'w') as output:
        output.write("""<?xml version="1.0" encoding="ISO-8859-1"?>
<!DOCTYPE dblp SYSTEM "dblp.dtd">
<dblp>""")
        with open('dblp.xml', mode='r') as f:
            # with gzip.open('dblp.xml.gz') as f:
            # NOTE(review): lxml only guarantees that an element's children
            # are present on its 'end' event; listening to 'start' as well is
            # kept as in the original -- confirm whether 'end' alone is meant.
            for (_event, node) in ElementTree.iterparse(f, events=['start', 'end']):
                if node.tag not in ('inproceedings', 'article'):
                    continue

                # Check that dates are in the specified range.  Only the
                # first 'year' child with text is consulted.
                inRange = False
                for child in node:
                    if child.tag == 'year' and isinstance(child.text, str):
                        inRange = startyear <= int(child.text) <= endyear
                        break
                if not inRange:
                    # Out of range.
                    node.clear()
                    continue

                # Count the number of pages.  It needs to exceed our
                # threshold to be considered.  If several 'pages' children
                # exist, the last one wins.
                pageCount = -1
                for child in node:
                    if child.tag == 'pages' and isinstance(child.text, str):
                        pageCount = pagecount(child.text)

                if 1 < pageCount < pageCountThreshold:
                    # Only skip papers with a very small page count, but
                    # above 1.  Why?  DBLP has real papers with incorrect
                    # page counts - usually a truncated single page.  -1
                    # means no pages found at all => some problem with
                    # journal entries in DBLP.
                    node.clear()
                    continue

                # If we got here, we have a winner.
                count = count + 1
                print(str(count))
                # NOTE(review): written as-is to a text-mode file, which
                # matches the Python 2 style of this script -- confirm the
                # return type of tostring before running under Python 3.
                output.write(ElementTree.tostring(node, pretty_print=True))
                # Release the element's contents once it has been written.
                node.clear()
        output.write("</dblp>")
# Run the trimming pass as soon as this file is executed; there is no
# __main__ guard, so importing the module triggers it as well.
parseDBLP()