-
Notifications
You must be signed in to change notification settings - Fork 2
/
static-archive.py
executable file
·111 lines (90 loc) · 3.57 KB
/
static-archive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
import sys, os, errno, glob
from lxml import etree
# load properties from files
# from http://www.linuxtopia.org/online_books/programming_books/python_programming/python_ch34s04.html
def getprops(filename):
    """Load a Java-style properties file into a dict.

    Blank lines and lines whose first non-blank character is ``!`` or ``#``
    are ignored. On every other line the name runs up to the first ``:``,
    ``=`` or space; the value is the rest of the line with leading separator
    characters (``:``, ``=``, space) and trailing whitespace removed. A line
    that contains no separator yields an empty-string value.

    filename -- path of the properties file to read.

    Returns a dict mapping property names to string values. If the file
    cannot be opened or read, a message is printed and whatever was
    collected so far (normally an empty dict) is returned.
    """
    propDict = {}
    try:
        # Plain "r" rather than "rU": the "U" flag is deprecated and is
        # rejected by Python 3.11+. Each line is stripped below, so a stray
        # "\r" from CRLF line endings is removed either way.
        with open(filename, "r") as propFile:
            for propLine in propFile:
                propDef = propLine.strip()
                if not propDef or propDef[0] in ('!', '#'):
                    continue
                # Earliest separator position, falling back to end-of-line
                # when the line holds a bare name with no separator at all.
                candidates = [propDef.find(c) for c in ':= '] + [len(propDef)]
                found = min(pos for pos in candidates if pos != -1)
                name = propDef[:found].rstrip()
                value = propDef[found:].lstrip(":= ").rstrip()
                propDict[name] = value
        # No explicit close(): the with-statement already closed the file.
    except IOError:
        # filename already carries its extension, so report it verbatim.
        # The call form with one argument prints the same on Python 2 and 3.
        print(filename + " not found")
    return propDict
def ensure_dir(f):
    """Make sure the directory that will contain file path *f* exists.

    f -- path of a file about to be written; only its directory part is
         created, the file itself is not touched.

    Missing intermediate directories are created as needed. A path with no
    directory component (a bare file name) refers to the current directory,
    so nothing is created for it. Any OSError other than "already exists"
    is re-raised.
    """
    d = os.path.dirname(f)
    if not d:
        # os.makedirs("") raises, and there is nothing to create anyway.
        return
    try:
        os.makedirs(d)
    except OSError as e:
        # Try-then-handle instead of exists()-then-create: the directory may
        # appear between the check and the call. Only "already exists" is
        # benign; permission errors and the like must still surface.
        if e.errno != errno.EEXIST:
            raise
# load secrets (we just need the twitter id)
secrets = getprops("secrets.properties")
if 'TWITTER_ID' not in secrets:
    # Fail with a readable message instead of a bare KeyError traceback.
    sys.exit("TWITTER_ID is missing from secrets.properties")
# XSLT.strparam() passes the value to the stylesheet as a string parameter
# without hand-built quoting, so values containing an apostrophe still work.
twitterID = etree.XSLT.strparam(secrets['TWITTER_ID'])

# walk through xml archive directories representing months; generate html for each
# we check whether the transformation needs to be run: are there any dump files
# that are more recent than a pre-existing output file, or is there no output file,
# or is the xsl file more recent than the output file

# prepare the xsl transformer
xsltFile = "statuses2html.xsl"
xslt_doc = etree.parse(xsltFile)
transform = etree.XSLT(xslt_doc)
xsltFileDate = os.path.getmtime(xsltFile)

skipped = 0
transformed = 0

# Directory listings come back in arbitrary order; sorting makes the log and
# the order in which dump files are merged reproducible from run to run.
yearDirs = sorted(os.listdir("archive/xml"))
for yearDir in yearDirs:
    if not os.path.isdir("archive/xml/" + yearDir):
        continue
    monthDirs = sorted(os.listdir("archive/xml/" + yearDir))
    for monthDir in monthDirs:
        if not os.path.isdir("archive/xml/" + yearDir + "/" + monthDir):
            continue
        outputFile = "archive/html/" + yearDir + "/" + yearDir + "_" + monthDir + ".html"

        # get outputFile modification date (in seconds since epoch);
        # 0 stands for "no output file yet"
        try:
            outputFileDate = os.path.getmtime(outputFile)
        except OSError:
            outputFileDate = 0

        # there may be several dump files from multiple fetches
        dumpFiles = sorted(glob.glob("archive/xml/" + yearDir + "/" + monthDir + "/*.xml"))

        # transform when the output is missing, the stylesheet is newer than
        # the output, or any dump file is newer than the output
        needsTransform = (
            outputFileDate == 0
            or xsltFileDate > outputFileDate
            or any(os.path.getmtime(dumpFile) > outputFileDate for dumpFile in dumpFiles)
        )

        if not needsTransform:
            print("Skipping " + yearDir + "/" + monthDir)
            skipped += 1
            continue

        print("Transforming " + yearDir + "/" + monthDir)
        transformed += 1

        # assemble all the dump files into a single xml tree
        monthTree = etree.Element("month")
        # Reset for every month so that a month without dump files (or whose
        # dumps lack the attribute) neither raises NameError nor silently
        # reuses the previous month's timestamp.
        timestampFetch = ""
        for dumpFile in dumpFiles:
            dumpRoot = etree.parse(dumpFile).getroot()
            monthTree.append(dumpRoot)
            fetched = dumpRoot.get("timestamp_fetch")
            if fetched is not None:
                # the last dump file carrying the attribute wins
                timestampFetch = fetched

        # run the xsl transformation
        result_tree = transform(
            monthTree,
            month=etree.XSLT.strparam(yearDir + "-" + monthDir),
            twitterID=twitterID,
            timestampFetch=etree.XSLT.strparam(timestampFetch),
        )

        # write the html
        print("Saving " + outputFile)
        ensure_dir(outputFile)
        # tostring() with an explicit encoding returns bytes, so the file is
        # opened in binary mode; the with-statement closes it.
        with open(outputFile, "wb") as f:
            f.write(etree.tostring(result_tree, xml_declaration=True, encoding='utf-8', pretty_print=True))

print("Finished. Transformed: " + str(transformed) + " months; skipped: " + str(skipped) + " months; total: " + str(transformed + skipped) + " months.")