-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathnikki_process_users_votes.py
86 lines (68 loc) · 2.39 KB
/
nikki_process_users_votes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
CMSC 12300 / CAPP 30123
Task: Pre-processing data
Main author: Sanittawan (Nikki)
"""
import csv
from bs4 import BeautifulSoup
from lxml import etree
# Column order of the users CSV. Each entry is also the key that
# process_single_line looks up in a parsed row's attribute dict.
# NOTE(review): the names are all lowercase, presumably because the lxml
# HTML parser lowercases attribute names -- confirm against a sample row.
HEADERS_USERS = [
    'id',
    'reputation',
    'creationdate',
    'displayname',
    'lastaccessdate',
    'websiteurl',
    'location',
    'aboutme',
    'views',
    'upvotes',
    'downvotes',
    'profileimageurl',
    'accountid',
]

# Column order of the votes CSV (same role as HEADERS_USERS).
HEADERS_VOTES = [
    'id',
    'postid',
    'votetypeid',
    'creationdate',
]
def write_row_to_csv(row, out_filename):
    """
    Append a single row to a CSV file

    The file is opened in append mode on every call, so it is created if it
    does not exist and existing content is never truncated.

    Inputs:
        row: (list) of a single row
        out_filename: (string) name of the output file

    Returns: nothing
    """
    # Pin the encoding: the locale default is platform dependent and may be
    # unable to encode non-ASCII field values.
    # newline='' lets the csv module control line endings itself.
    with open(out_filename, "a", newline='', encoding="utf-8") as output:
        csv.writer(output, dialect='excel').writerow(row)
def process_single_line(line, headers=None):
    """
    Extract the attribute values of the <row> element found in one line

    Inputs:
        line: (string) of a single line in XML file
        headers: (list) of the output CSV file, i.e. the attribute names to
            extract, in column order. If None, every attribute value of
            the row is returned in the order the parser reports them.

    Returns: (list) of a processed single line, one value per header;
        attributes that are absent from the row become ''

    Raises: ValueError if no <row> element is found in the line
    """
    soup = BeautifulSoup(line, "lxml")
    row_tag = soup.row
    if row_tag is None:
        # Without this check the failure is an opaque AttributeError on None.
        raise ValueError(
            "no <row> element found in line: {!r}".format(line))

    attributes = row_tag.attrs
    if headers is None:
        return list(attributes.values())
    return [attributes.get(col, '') for col in headers]
def process_xml_by_line(filepath, out_filename, headers=None, max_lines=502):
    """
    Read an XML file line by line and append the processed rows to a CSV

    The first two lines of the input are skipped, the header row is
    written, and then up to max_lines further lines are read. Each line
    containing a <row element is echoed to stdout, processed and appended
    to the output file. Reading stops early at end of file; lines without
    a <row element (e.g. blank lines or a closing tag) are skipped.

    Inputs:
        filepath: (string) path to the input file
        out_filename: (string) name of the output file
        headers: (list) of the headers of the output file; written as the
            first CSV row and used to select attributes from each line
        max_lines: (int) maximum number of lines to read after the two
            skipped lines (defaults to 502)

    Returns: nothing. Directly manipulate the output file
    """
    # UTF-8 is the XML default encoding; do not depend on the locale.
    # NOTE(review): assumes the dump is UTF-8 encoded -- confirm.
    with open(filepath, encoding="utf-8") as f:
        # Presumably the XML declaration and the opening root tag.
        f.readline()  # skip row 0
        f.readline()  # skip row 1
        write_row_to_csv(headers, out_filename)
        for i in range(max_lines):
            line = f.readline()
            if not line:
                # readline() returns '' only at end of file.
                break
            if "<row" not in line:
                continue
            # Echo the line that is about to be processed.
            print("Line {}: {}".format(i, line.strip()))
            line_list = process_single_line(line, headers)
            write_row_to_csv(line_list, out_filename)
if __name__ == '__main__':
    # One entry per file to convert: (input XML, output CSV, CSV columns).
    # Users are processed first, then votes.
    jobs = (
        ("./Users.xml", "sample_users.csv", HEADERS_USERS),
        ("./Votes.xml", "sample_votes.csv", HEADERS_VOTES),
    )
    for xml_path, csv_path, columns in jobs:
        process_xml_by_line(xml_path, csv_path, columns)