-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcombine_histograms.py
executable file
·85 lines (73 loc) · 3.12 KB
/
combine_histograms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
"""
CSV Histogram Combiner
Daniel M. Zimmerman, September 2024
Usage:
$ ./combine_histograms.py <directory> <type>
Takes all the files in <directory> whose names end in <type>.csv, which
are assumed to be tab-separated histograms with frequencies (the first
column) and terms (the second column), and generates a single <type>.csv
file combining all terms and frequencies. This script does _not_ attempt
to aggregate "similar" terms or do any other clever optimizations when
combining the histograms; if Python thinks two terms are equivalent,
their frequencies are aggregated, and if it doesn't, they aren't.
"""
import sys
import csv
import os
class MalformedFile(Exception):
    """Signal that a histogram row does not have exactly two columns."""


def process_csv_file(file_path, histogram):
    """Read a histogram file and add its counts to ``histogram``.

    The file is parsed as tab-delimited text with the frequency in the
    first column and the term in the second. ``histogram`` (a dict mapping
    each term to its total frequency) is updated in place.

    The file's counts are merged only after the whole file has been read
    successfully, so a file that is reported as an error contributes
    nothing to ``histogram``. Errors (unreadable file, wrong number of
    columns, non-integer frequency) are printed rather than raised, so one
    bad file does not abort the caller's run.
    """
    # Tally this file on its own first: merging row by row would leave the
    # counts from the rows preceding a bad one in the combined histogram.
    file_counts = {}
    try:
        with open(file_path, newline='') as csvfile:
            reader = csv.DictReader(
                csvfile,
                fieldnames=['freq', 'term'],
                restkey='rest',
                delimiter='\t',
            )
            for row in reader:
                # DictReader fills a missing column with None and collects
                # surplus columns under 'rest'; either one means the row is
                # not a (frequency, term) pair.
                if row['freq'] is None or row['term'] is None or 'rest' in row:
                    raise MalformedFile
                term = row['term']
                file_counts[term] = file_counts.get(term, 0) + int(row['freq'])
    except MalformedFile:
        print(f"Error processing {file_path}: incorrect number of columns")
    except Exception as e:
        # Deliberately broad: any failure on one file is reported and the
        # file is skipped, instead of stopping the whole combination.
        print(f"Error processing {file_path}: {e}")
    else:
        for term, freq in file_counts.items():
            histogram[term] = histogram.get(term, 0) + freq
def generate_output_file(histogram, histogram_type):
    """Write the histogram to the file ``<histogram_type>.csv``.

    One line is written per term: the frequency, a tab, then the term.
    Lines are ordered by descending frequency; terms with the same
    frequency are ordered ascending by term.

    The file is opened in append mode and this function does not check
    whether it already exists; the script's entry point performs that
    check before any input is processed.
    """
    # Negating the count puts larger counts first while still breaking
    # ties in ascending term order.
    ordered = sorted(histogram.items(), key=lambda item: (-item[1], item[0]))
    # The context manager closes the file even if a write fails partway.
    with open(f"{histogram_type}.csv", "a") as f:
        for phrase, count in ordered:
            # %d formats exactly as the obsolete %u it replaces.
            print("%d\t%s" % (count, phrase), file=f)
if __name__ == "__main__":
    # Exactly two arguments are expected: the input directory and the type.
    if len(sys.argv) != 3:
        print("Usage: ./combine_histograms.py <directory> <type>")
        sys.exit(1)

    _, input_path, histogram_type = sys.argv

    # Refuse to touch an output file that is already there.
    if os.path.isfile(f'{histogram_type}.csv'):
        print(f'{histogram_type}.csv already exists, aborting.')
        sys.exit(1)

    # The input must be a directory.
    if not os.path.isdir(input_path):
        print(f"'{input_path}' is not a directory.")
        sys.exit(1)

    # Fold every matching file in the directory into one histogram.
    combined_histogram = {}
    files_processed = 0
    for file_name in os.listdir(input_path):
        file_path = os.path.join(input_path, file_name)
        # Skip anything that is not a regular file ending in <type>.csv.
        if not os.path.isfile(file_path):
            continue
        if not file_name.endswith(f'{histogram_type}.csv'):
            continue
        files_processed += 1
        process_csv_file(file_path, combined_histogram)

    print(f'{files_processed} files processed')
    generate_output_file(combined_histogram, histogram_type)