-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinno.py
164 lines (143 loc) · 7.2 KB
/
inno.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import sys
import datetime
import gzip
def parseline(line):
    """Split one ';'-separated raw record into its fields.

    The record layout read here is ``timestamp;project;author;pkg1;pkg2;...``
    (timestamp is the FIRST field of the line), while the returned tuple puts
    the project first.

    Args:
        line: one record, as ``str`` or ``bytes``.  Bytes are accepted because
            a gzip file opened in binary mode yields bytes under Python 3,
            where ``bytes.split(';')`` would raise ``TypeError``.

    Returns:
        ``(project, timestamp, author, packages)`` where ``packages`` is the
        (possibly empty) list of every field after the third.

    Raises:
        IndexError: if the record has fewer than three fields.
    """
    if isinstance(line, bytes):
        # NOTE(review): UTF-8 is an assumption about the data encoding;
        # undecodable bytes are replaced rather than aborting the whole run.
        line = line.decode('utf-8', errors='replace')
    tokens = line.strip().split(';')
    timestamp, project, author = tokens[0], tokens[1], tokens[2]
    return project, timestamp, author, tokens[3:]
###############################################################################
def write_project_packages_mem_table(table, path='data/ppkgs.mem'):
    """Persist the project -> packages table so a later run can resume from it.

    The raw data is processed range by range (to bound memory use), so the
    most up-to-date set of packages seen per project has to be memorized on
    disk between runs.

    File format: one project per line, ``project;pkg1;pkg2;...``, lines
    separated by a newline with no trailing newline.  A project with no
    packages is written as the bare project name: emitting ``project;`` would
    be read back as a package set containing the empty string.

    Args:
        table: mapping of project name to an iterable (set) of package names.
        path: output file; defaults to the location the script has always used.
    """
    with open(path, 'w') as f:
        separator = ''
        # Stream entry by entry instead of joining the whole table into one
        # string, which would hold a second copy of the table in memory.
        for project in table:
            f.write(separator + ';'.join([project] + list(table[project])))
            separator = '\n'
    print('Written project packages mem table')
def read_project_packages_mem_table(path='data/ppkgs.mem'):
    """Load the project -> packages table written by a previous run.

    Counterpart to write_project_packages_mem_table; the result is the
    baseline of already-seen packages against which new innovations are
    detected.

    Each line is ``project;pkg1;pkg2;...``.  Lines without any ';' (including
    blank lines) are ignored as invalid.  Empty package tokens are dropped, so
    a line such as ``project;`` yields an empty set instead of a set holding
    the empty string, which would otherwise be paired with every real package.

    Args:
        path: input file; defaults to the location the script has always used.

    Returns:
        dict mapping project name to the set of its package names.
    """
    table = {}
    with open(path, 'r') as f:
        for line in f:
            tokens = line.strip().split(';')
            if len(tokens) <= 1:  # ignore invalid lines
                continue
            project = tokens[0]
            table[project] = {package for package in tokens[1:] if package}
    print('Read project packages mem table with ' + str(len(table)) + ' project entries')
    return table
###############################################################################
def write_innovations_mem_table(table, path='data/innos.mem'):
    """Persist the innovations table produced by the current run.

    File format: one innovation per line,
    ``pkgA;pkgB;project;timestamp;author;count``, lines separated by a newline
    with no trailing newline.

    Args:
        table: mapping of a ``(pkgA, pkgB)`` pair to a
            ``(project, timestamp, author, count)`` tuple.
        path: output file; defaults to the location the script has always used.
    """
    with open(path, 'w') as f:
        separator = ''
        # Stream entry by entry rather than joining every row into one string
        # first; the table can be large.
        for (pkg_a, pkg_b), (project, timestamp, author, count) in table.items():
            row = ';'.join([pkg_a, pkg_b, project, timestamp, author, str(count)])
            f.write(separator + row)
            separator = '\n'
    print('Written innovations mem table')
def read_innovations_mem_table(path='data/innos.mem'):
    """Load the innovations recorded by previous runs.

    Counterpart to write_innovations_mem_table; the result is the baseline
    that the current run updates.

    Each line is ``pkgA;pkgB;project;timestamp;author;count``.  Lines with
    fewer than six fields (including blank lines) are ignored as invalid.  If
    the same pair occurs more than once, the last occurrence wins.

    Args:
        path: input file; defaults to the location the script has always used.

    Returns:
        dict mapping ``(pkgA, pkgB)`` to ``(project, timestamp, author, count)``
        with ``count`` converted to ``int``.

    Raises:
        ValueError: if the count field of an otherwise valid line is not an
            integer.
    """
    table = {}
    with open(path, 'r') as f:
        for line in f:
            tokens = line.strip().split(';')
            if len(tokens) < 6:  # ignore invalid lines
                continue
            pair = (tokens[0], tokens[1])
            table[pair] = (tokens[2], tokens[3], tokens[4], int(tokens[5]))
    print('Read innovations mem table with ' + str(len(table)) + ' innovation entries')
    return table
###################################################################################################
if __name__ == '__main__':
    # Legacy mode `projects`: the implementation is kept below for reference
    # but is commented out, so this mode currently just exits successfully.
    if len(sys.argv) >= 2 and sys.argv[1] == 'projects':
        # projects = {}
        # for line in sys.stdin:
        #     project, _t, _a, _p = parseline(line)
        #     if project not in projects:
        #         projects[project] = 1
        #     else:
        #         projects[project] = projects[project] + 1
        # topkeys = sorted(projects.keys(), key = lambda p: projects[p], reverse = True)
        # for project in topkeys:
        #     print(project)
        sys.exit(0)

    ###############################################################################
    # Legacy mode `in-project-innos <project>`: likewise commented out, so this
    # mode currently just exits successfully.
    if len(sys.argv) >= 3 and sys.argv[1] == 'in-project-innos':
        # project = sys.argv[2]
        # current_packages = set()
        # innovations = {}
        # for line in sys.stdin:
        #     _p, timestamp, author, packages = parseline(line)
        #     for new_package in packages: # consider new package with every current package
        #         if new_package in current_packages:
        #             continue # ignore package already in current packages
        #         for current_package in current_packages:
        #             pair = tuple(sorted([new_package, current_package]))
        #             # it is not possible for pair to be already in innovations
        #             innovations[pair] = (project, timestamp, author)
        #         current_packages.add(new_package)
        # for pair in innovations:
        #     pkgA, pkgB = pair
        #     print(';'.join([pkgA, pkgB, project, timestamp, author, '1']))
        sys.exit(0)

    ###############################################################################
    # How-To
    # run `python inno.py <low_line_index> <high_line_index> &`
    # requires and updates `data/innos.mem` and `data/ppkgs.mem`
    # `data/ppkgs.mem` should contain the complete most recent set
    # `data/innos.mem` should be either the complete most recent set or empty
    # if empty, the resulting table must be migrated into the complete most recent set
    # see `data/copy/store/migrate.py`

    # Validate the requested range before loading the (potentially large) mem
    # tables, so a missing argument fails immediately.
    if len(sys.argv) <= 2:
        print('Specify data process range! (lower inclusive, upper exclusive)')
        sys.exit(-1)
    read_data_lower = int(sys.argv[1])
    read_data_upper = int(sys.argv[2])

    # project -> seen packages set
    project_packages_map = read_project_packages_mem_table()
    # package pair -> (earliest seen) project, timestamp, author, occurrence count
    innovations = read_innovations_mem_table()
    print('[debug] {' + str(datetime.datetime.now()) + '} Done read mem tables. Start read from stdin.')
    print('Read data to process from line >= ' + str(read_data_lower) + ' to < ' + str(read_data_upper))

    # 1-based line numbers of records that are skipped because they are too
    # long.  Built once as a frozenset so the per-line membership test is O(1).
    too_long_lines = frozenset([
        3043426, 3050807, 3050808, 3067991, 3067992, 3127315, 4616934, 5225409,
        16876583, 18322303, 18322824, 18322934, 18323512, 18324553, 18326294, 18326757,
        18327082, 18327143, 18327707, 18327849, 18327932, 20699005, 20699163, 20699599,
        20699729, 20832248, 25887024, 38223498,
    ])

    line_count = 0
    # Text mode ('rt'): in Python 3 plain 'r' opens a gzip file in binary mode
    # and yields bytes, which cannot be split on the str separator ';'.
    # NOTE(review): this decodes with the platform default encoding -- confirm
    # it matches the data file.
    with gzip.open('/da0_data/play/JSthruMaps/tPaPkgRJS.s', 'rt') as data_f:
        for line in data_f:
            line_count += 1  # line numbers are 1-based
            if line_count < read_data_lower:
                continue  # not yet start process data in range
            if line_count >= read_data_upper:
                print('[debug] {' + str(datetime.datetime.now()) + '} Done entire specified data process range.')
                break  # ignore more input once the limit is reached
            if line_count in too_long_lines:
                continue  # line too long

            project, timestamp, author, packages = parseline(line)
            if project not in project_packages_map:
                project_packages_map[project] = set()  # initialize
            seen_packages = project_packages_map[project]

            for new_package in packages:  # consider new package with every existing package
                if new_package in seen_packages:
                    continue  # package is already in current packages
                for current_package in seen_packages:
                    pair = tuple(sorted([new_package, current_package]))  # unordered pair
                    if pair in innovations:
                        # innovation exists: keep the first sighting, only bump the count
                        first_project, first_timestamp, first_author, seen_count = innovations[pair]
                        innovations[pair] = (first_project, first_timestamp, first_author, seen_count + 1)
                    else:
                        innovations[pair] = (project, timestamp, author, 1)
                # after new package innovations are done, put new package into current packages
                # (added only now so the set is never mutated while it is being iterated)
                seen_packages.add(new_package)

            if line_count % 10000 == 9999:
                print('[debug] {' + str(datetime.datetime.now()) + '} Done processing stdin line ' + str(line_count) + '.')

    # line_count is the exclusive upper bound if the loop stopped at the range
    # limit, or the last line read (inclusive) if the file ended first.
    print('Processed data upto (in/ex) ' + str(line_count))
    print('[debug] {' + str(datetime.datetime.now()) + '} Done innovations from stdin. Start write new mem tables.')
    write_project_packages_mem_table(project_packages_map)
    write_innovations_mem_table(innovations)