extract-rss.py
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# For usage and details, see http://www.gnu.org/licenses/gpl-3.0.txt
# AUTHOR:
#
# matteo DOT redaelli AT gmail DOT com
# http://www.redaelli.org/matteo
#
#
# USAGE:
#
# spark-submit --master yarn-client --driver-class-path /path/to/spark/assembly/target/scala-2.10/spark-assembly-1.3.0-hadoop2.5.2.jar extract-rss.py --source "/user/r/staging/twitter/searches/tyre/2014/12/*.gz" --target /tmp/tests/15
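#
# A hypothetical alternative invocation (paths and dates here are illustrative,
# not from the original; note that the script appends <year>/<month>/<day>.gz
# to --source itself, so --source is the base directory):
#
# spark-submit --master yarn-client extract-rss.py --source /user/r/staging/twitter/searches/tyre --target /tmp/tests/15 --year 2014 --month 12 --day 01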
import re
import argparse
import xml.etree.cElementTree as ET
from datetime import datetime, timedelta
from pyspark import SparkContext

def clean_string(string):
    # Replace tabs, newlines and quotes with spaces so fields stay TSV-safe.
    try:
        return string.replace("\t", " ").replace("\n", " ").replace("'", " ").replace('"', " ")
    except AttributeError:  # e.g. when string is None
        return string

def html_to_text(html):
    # Strip HTML tags with a simple non-greedy regex.
    try:
        return re.sub(r"<.*?>", "", html)
    except TypeError:  # e.g. when html is None
        return html

def rss_string_to_xml_object(line):
    # Parse one serialized XML fragment (an RSS item) into an Element.
    line = line.encode('utf8', 'replace')
    tree = ET.ElementTree(ET.fromstring(line))
    return tree.getroot()

def safe_root_find(root, field):
    # Return the text of a child element, or "" if the field is missing.
    try:
        return root.find(field).text
    except AttributeError:  # find() returned None
        return ""

def rss_string_to_list(line):
    root = rss_string_to_xml_object(line)
    title = clean_string(safe_root_find(root, 'title'))
    description = clean_string(html_to_text(safe_root_find(root, 'description')))
    # RFC 822 dates look like "Mon, 01 Dec 2014 10:00:00 +0000"; chars 5-15 are "01 Dec 2014".
    pubDate = safe_root_find(root, 'pubDate')[5:16]
    ## carbuzz has no pubDate field... :-(
    if pubDate == "":
        pubDate = "%s-%s-%s" % (args.year, args.month, args.day)
    else:
        pubDate = datetime.strptime(pubDate, '%d %b %Y').strftime("%Y-%m-%d")
    source = safe_root_find(root, 'rss_source')
    link = safe_root_find(root, 'link')
    language = safe_root_find(root, 'rss_language')
    category = safe_root_find(root, 'rss_category')
    return (language, source, category, pubDate, title, link, description)

def count_items_and_save(rdd, field_name, field_count, target_path, min_occurs=1):
    # Count occurrences of one tuple field, keep values seen at least min_occurs
    # times, sort by count descending and write a single tab-separated file.
    rdd.map(lambda t: (t[field_count], 1))\
       .reduceByKey(lambda x, y: x + y)\
       .filter(lambda x: x[1] >= min_occurs)\
       .map(lambda x: (x[1], x[0])).sortByKey(False)\
       .map(lambda x: '\t'.join(unicode(i) for i in x)).repartition(1)\
       .saveAsTextFile("%s/%s" % (target_path, field_name))
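
# A sketch with illustrative data: for field_name='language' and field_count=0,
# an RDD of [('en', ...), ('en', ...), ('it', ...)] yields the lines
#   2<TAB>en
#   1<TAB>it
# written under <target_path>/language/ as a single part file.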

if __name__ == "__main__":
    ## parsing command line parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", help="source path")
    parser.add_argument("--target", help="target path")
    parser.add_argument("--year", help="year")
    parser.add_argument("--month", help="month")
    parser.add_argument("--day", help="day")
    args = parser.parse_args()
    ## if --day is missing, process yesterday's file
    if args.day is None:
        yesterday = datetime.now() - timedelta(days=1)
        args.year = yesterday.strftime('%Y')
        args.month = yesterday.strftime('%m')
        args.day = yesterday.strftime('%d')
    ## connecting to hdfs data
    source_files = "%s/%s/%s/%s.gz" % (args.source, args.year, args.month, args.day)
    target_path = args.target
    sc = SparkContext(appName="extract-rss.py")
    rdd = sc.textFile(source_files).distinct().map(rss_string_to_list)
    ## extract stats
    # tuple layout: (language, source, category, pubDate, title, link, description)
    count_items_and_save(rdd, 'language', 0, target_path)
    count_items_and_save(rdd, 'source', 1, target_path)
    count_items_and_save(rdd, 'category', 2, target_path)
    count_items_and_save(rdd, 'pubDate', 3, target_path)
    ## extract texts
    rdd.map(lambda x: '\t'.join(unicode(i).replace("\n", " ") for i in x))\
       .repartition(1)\
       .saveAsTextFile("%s/%s" % (target_path, "news"))
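
# Output layout under --target (a sketch; part-file names assume Spark defaults):
# one directory per stat (language/, source/, category/, pubDate/) plus news/
# containing the full tab-separated records, each saved as a single part file.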