-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathcablegate.py
executable file
·111 lines (79 loc) · 2.66 KB
/
cablegate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/python3
import urllib
import os
import sys
import argparse
from lxml import etree
from lxml import html
import StringIO, webbrowser
import HTMLParser
import traceback
import random
import time, re, urllib, urllib2
#url = "http://www.moveflat.com/london-flat/flatshare-flatmate/london-areas/property/Box/"
#sock = urllib.urlopen(url)
#htmlSource = sock.read()
#sock.close()
# Command-line interface. Both options are mandatory; because of nargs=1
# each parsed value is a one-element list, so the rest of the script reads
# them as args.directory[0] and args.year[0].
parser = argparse.ArgumentParser(description='Parse Cablegate Files')
parser.add_argument(
    '-d',
    dest='directory',
    nargs=1,
    required=True,
    help="Cablegate Directory"
)
parser.add_argument(
    '-l',
    dest='year',
    nargs=1,
    required=True,
    help="Year YYYY Format"
)
args = parser.parse_args()
# Collect the path of every file stored under <directory>/cable/<year>.
cable_files = []
cable_root = os.path.join(args.directory[0], "cable", args.year[0])
if not os.path.isdir(cable_root):
    # os.walk() ignores directory-listing errors unless an onerror callback
    # is given, so a wrong path would otherwise just yield an empty list.
    sys.stderr.write("Cable directory not found: %s\n" % cable_root)
    sys.exit(1)
try:
    for dirname, dirnames, filenames in os.walk(cable_root):
        cable_files.extend(
            os.path.join(dirname, filename) for filename in filenames
        )
except Exception:
    print("An Error Occurred! ")
    traceback.print_exc()
    # Exit with a non-zero status so callers can tell the run failed.
    sys.exit(1)
# Substrings that mark a paragraph as classification/subject boilerplate.
_BOILERPLATE_MARKERS = ("Classified By:", "CLASSIFIED BY:", "SUBJECT:")


def _is_body_line(line):
    """Return True when *line* should be kept as cable body text.

    Rejected are lines of at most one character, dashed separator lines,
    lines carrying the surrounding <pre> / </code> markup, and lines that
    upper() leaves unchanged (i.e. lines without any lowercase letter).
    """
    if len(line) <= 1:
        return False
    if "--------" in line or "</code>" in line or "<pre>" in line:
        return False
    return line.upper() != line


def _is_quotable(paragraph):
    """Return True for paragraphs longer than 100 characters that are not
    all upper case and contain none of the boilerplate markers."""
    if len(paragraph) <= 100 or paragraph.upper() == paragraph:
        return False
    return not any(marker in paragraph for marker in _BOILERPLATE_MARKERS)


def _iter_paragraphs(cable_markup, unescape):
    """Yield the quotable paragraphs found in one serialized cable body.

    cable_markup -- the <code> element serialized by lxml's html.tostring().
    unescape     -- callable that turns HTML entities back into characters.

    The markup is cut at every "<a" (presumably the per-paragraph anchors --
    verify against the cable HTML), the anchor itself is dropped, and the
    remaining text is split on blank lines. The kept lines of each piece are
    concatenated without a separator.
    """
    for chunk in cable_markup.split("<a"):
        # Keep only what follows the anchor's closing tag. A chunk without
        # "</a>" (e.g. the text before the first anchor) is kept whole; the
        # previous find()+4 slicing silently dropped its first 3 characters.
        before, closing, after = chunk.partition("</a>")
        text = after if closing else before
        # str.strip("END SUMMARY") strips a *set of characters*, not the
        # phrase, so remove the marker explicitly. Spaces at either end are
        # still trimmed, which the old call did as a side effect.
        text = text.replace("END SUMMARY", "").strip(" ")
        text = unescape(text)
        for piece in text.split("\n\n"):
            paragraph = "".join(
                line for line in piece.split("\n") if _is_body_line(line)
            )
            if _is_quotable(paragraph):
                yield paragraph


# Parse every collected cable file and gather its quotable paragraphs.
final_items = []
_unescape = HTMLParser.HTMLParser().unescape  # one parser for the whole run
for fname in cable_files:
    try:
        # Read-only access is enough, and the with-block closes the handle
        # even when parsing fails.
        with open(fname, 'r') as f:
            source = f.read()
        result = html.fromstring(source)
        # Files without an element of class "cable" are skipped, as before
        # (the original indexed [0] into this result and relied on the
        # IndexError).
        if not result.find_class("cable"):
            raise ValueError("no element with class 'cable'")
        # The second matching <code> element holds the cable text.
        # TODO (from the original author): use a relative XPath here.
        cable = result.findall(".body/div/div/code")[1]
        final_items.extend(_iter_paragraphs(html.tostring(cable), _unescape))
    except Exception as err:
        # Best effort: one unreadable or malformed file must not abort the
        # run, but say which file was skipped instead of hiding it. %r keeps
        # the report itself from failing on non-ASCII text.
        sys.stderr.write("Skipping %r: %r\n" % (fname, err))
# Pick one random paragraph, then one random sentence from it, decorate the
# sentence and hand it to the HTTP listener on 127.0.0.1:8020.
random.seed()
if not final_items:
    # random.randint(0, -1) would raise an opaque ValueError here.
    sys.exit("No usable cable paragraphs were found; nothing to send.")
premsg = random.choice(final_items)

# Naive sentence split on "."; fragments of 10 characters or fewer are
# dropped. final_items is deliberately rebound to the sentence list, as in
# the original script.
final_items = [pitem for pitem in premsg.split(".") if len(pitem) > 10]
if not final_items:
    sys.exit("The selected paragraph contains no usable sentence.")

# Truncate to 140 characters before adding the fixed prefix and suffix.
premsg = "<^(,__,)~~" + random.choice(final_items)[:140] + "..."
if not isinstance(premsg, str):
    # On Python 2 the unescaped text may be a unicode object; encode it so
    # that print and urllib.quote receive a plain byte string.
    premsg = premsg.encode("utf-8")
print(premsg)
message = urllib.quote(premsg)
# Only the request matters; close the response so the socket is released.
urllib2.urlopen("http://127.0.0.1:8020/%s" % message).close()
#print "FINAL ITEMS"
#for item in final_items:
# print "***"
# print item