analyze.py
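"""Aggregate benchmark submission JSONs into a price-adjusted platform
leaderboard and export it as a tab-separated file, using DuckDB."""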
import json
import glob
import duckdb
con = duckdb.connect("gx-results.duckdb")
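# Input tables: per-size timeouts, algorithm metadata (including whether an
# algorithm requires edge weights), dataset metadata, and the ordering of
# size categories used for ranking.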
con.sql("""CREATE OR REPLACE TABLE timeouts AS SELECT size, timeout_minutes, timeout_minutes * 60 AS timeout_seconds FROM read_csv_auto('input-data/timeouts.csv', header = true);""")
con.sql("""CREATE OR REPLACE TABLE algorithms AS FROM read_csv_auto('input-data/algorithms.csv', header = true);""")
con.sql("""CREATE OR REPLACE TABLE datasets AS FROM read_csv_auto('input-data/datasets.csv', header = true);""")
con.sql("""CREATE OR REPLACE TABLE size_ordering AS FROM read_csv_auto('input-data/size_ordering.csv');""")
con.sql("""
CREATE OR REPLACE TABLE results_raw (
platform VARCHAR,
environment_name VARCHAR,
pricing DOUBLE,
job VARCHAR,
algorithm VARCHAR,
dataset VARCHAR,
run VARCHAR,
success BOOLEAN,
load_time FLOAT,
makespan FLOAT,
processing_time FLOAT
);
""")
# Load each submission JSON and insert its runs into results_raw.
for path in glob.glob("submissions/**/*.json", recursive=True):
    with open(path) as f:
        j = json.load(f)
    platform = j["system"]["platform"]["name"]
    pricing = j["system"]["pricing"]
    environment_name = j["system"]["environment"]["name"]
    ## GraphBLAS Intel Xeon Gold 6342
    if platform == 'GraphBLAS Intel Xeon Gold 6342':
        pricing = 15354.81
    ## ecs.c8i.24xlarge and ecs.c8a.48xlarge
    quantity = j["system"]["environment"]["machines"][0]["quantity"]
    if environment_name == 'ecs.c8i.24xlarge':
        pricing = 355789.91 / 7.0
        if quantity != '':
            pricing *= int(quantity)
        else:
            raise Exception("quantity is null")
    if environment_name == 'ecs.c8a.48xlarge':
        pricing = 643038.34 / 7.0
        if quantity != '':
            pricing *= int(quantity)
        else:
            raise Exception("quantity is null")
    ## empty environment
    if environment_name == '':
        environment_name = "bare metal, dedicated server"
    ## combine environments for GeaCompute submission
    if environment_name == 'ecs.c8i.24xlarge' or environment_name == 'ecs.c8a.48xlarge':
        environment_name = 'ecs.c8i.24xlarge / ecs.c8a.48xlarge'
    if platform == "GraphBLAS":
        platform = "GraphBLAS Intel Xeon Platinum 8369"
    for job_id, job in j["result"]["jobs"].items():
        algorithm = job["algorithm"]
        dataset = job["dataset"]
        for run_id in job["runs"]:
            run = j["result"]["runs"][run_id]
            # Use a parameterized INSERT so quotes in names cannot break the SQL
            # and missing values become NULLs instead of invalid literals.
            con.execute(
                "INSERT INTO results_raw VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);",
                [platform, environment_name, pricing, job_id, algorithm, dataset,
                 run_id, run["success"], run["load_time"], run["makespan"],
                 run["processing_time"]],
            )
con.sql("""
CREATE OR REPLACE TABLE results_raw AS
SELECT *
FROM results_raw
JOIN datasets ON datasets.name = results_raw.dataset
;
""")
con.sql("""
CREATE OR REPLACE TABLE results AS
SELECT * EXCLUDE name
FROM results_raw
WHERE success = true
AND size IN ('S', 'M', 'L', 'XL', '2XL', '3XL')
;
""")
# average the runs of each platform/algorithm/dataset combination and count them
con.sql("""
CREATE OR REPLACE TABLE results AS
SELECT
platform,
environment_name,
pricing,
algorithm,
dataset,
size,
avg(makespan) AS makespan,
avg(processing_time) AS processing_time,
count(*) AS runs
FROM results
GROUP BY ALL
;
""")
# add price-adjusted columns
con.sql("""
CREATE OR REPLACE TABLE results AS
SELECT
*,
1 / makespan / pricing AS makespan_throughput_per_dollar,
1 / processing_time / pricing AS processing_throughput_per_dollar,
FROM results
;
""")
# create list of unique platform names
con.sql("""
CREATE OR REPLACE TABLE platforms AS
SELECT DISTINCT
platform AS platform,
environment_name AS environment_name
FROM results
;
""")
# create the required workload for each size category
con.sql("""
CREATE OR REPLACE TABLE workload AS
SELECT
datasets.name AS dataset,
datasets.size AS size,
algorithms.name AS algorithm,
platforms.platform AS platform,
platforms.environment_name AS environment_name
FROM datasets
CROSS JOIN algorithms
CROSS JOIN platforms
WHERE (NOT algorithms.requires_weights OR datasets.weighted = true)
;
""")
# expand to the full workload × platform grid: platforms that lack a result for a
# given algorithm/dataset combination get NULL times here (and are filtered out
# below); the run count for missing entries defaults to 3
con.sql("""
CREATE OR REPLACE TABLE results_full AS
SELECT
workload.size AS size,
workload.platform AS platform,
results.environment_name AS environment_name,
results.pricing AS pricing,
workload.dataset AS dataset,
workload.algorithm AS algorithm,
coalesce(results.makespan, NULL) AS makespan,
coalesce(results.processing_time, NULL) AS processing_time,
coalesce(makespan_throughput_per_dollar, NULL) AS makespan_throughput_per_dollar,
coalesce(processing_throughput_per_dollar, NULL) AS processing_throughput_per_dollar,
coalesce(runs, 3) AS runs
FROM workload
LEFT JOIN results
ON workload.dataset = results.dataset
AND workload.algorithm = results.algorithm
AND workload.platform = results.platform
AND workload.environment_name = results.environment_name
JOIN timeouts
ON timeouts.size = workload.size
JOIN platforms
ON platforms.platform = workload.platform
AND platforms.environment_name = workload.environment_name
;
""")
con.sql("""
CREATE OR REPLACE TABLE platforms_not_run_all_datasets AS
SELECT
size,
platform,
count(platform) as number_of_missing
FROM results_full
WHERE
makespan is NULL
AND processing_time is NULL
AND makespan_throughput_per_dollar is NULL
AND processing_throughput_per_dollar is NULL
GROUP BY ALL
ORDER BY size ASC
;
""")
con.sql("""
CREATE OR REPLACE TABLE results_filtered AS
SELECT
results_full.*
FROM results_full
WHERE
NOT EXISTS(
SELECT 1 FROM platforms_not_run_all_datasets
WHERE
results_full.size = platforms_not_run_all_datasets.size
AND results_full.platform = platforms_not_run_all_datasets.platform
)
GROUP BY ALL
ORDER BY results_full.size ASC
;
""")
con.sql("""
CREATE OR REPLACE TABLE results_platform_algorithm AS
SELECT
size,
platform,
max(environment_name) AS environment_name,
max(pricing) AS pricing,
algorithm,
avg(makespan) AS mean_makespan,
avg(processing_time) AS mean_processing_time,
1/avg(1/makespan_throughput_per_dollar) AS makespan_throughput_per_dollar,
1/avg(1/processing_throughput_per_dollar) AS processing_throughput_per_dollar,
min(runs) AS min_runs_per_workload_item
FROM results_filtered
GROUP BY ALL
ORDER BY size ASC, algorithm ASC, mean_processing_time ASC
;
""")
con.sql("""
CREATE OR REPLACE TABLE results_platform AS
SELECT
size,
platform,
environment_name,
avg(pricing) AS pricing, -- NULL prices should be consumed by max()
avg(mean_makespan) AS mean_makespan,
avg(mean_processing_time) AS mean_processing_time,
1/avg(1/makespan_throughput_per_dollar) AS makespan_throughput_per_dollar,
1/avg(1/processing_throughput_per_dollar) AS processing_throughput_per_dollar,
min(min_runs_per_workload_item) AS min_runs_per_workload_item
FROM results_platform_algorithm
GROUP BY ALL
ORDER BY size ASC, mean_processing_time ASC
;
""")
con.sql("""
CREATE OR REPLACE TABLE results_platform_price_adjusted AS
SELECT
rank,
size,
platform,
environment_name,
pricing,
mean_makespan,
mean_processing_time,
makespan_throughput_per_dollar,
processing_throughput_per_dollar,
min(min_runs_per_workload_item) AS min_runs_per_workload_item
FROM results_platform
JOIN size_ordering
ON size_ordering.name = results_platform.size
GROUP BY ALL
ORDER BY rank ASC, processing_throughput_per_dollar DESC
;
""")
con.sql("""
COPY (
SELECT
row_number() OVER (PARTITION BY rank ORDER BY rank) AS "Position",
size AS "Size",
platform AS "Platform",
environment_name AS "Environment name",
pricing AS "Pricing (USD)",
mean_makespan AS "Mean makespan (s)",
mean_processing_time AS "Mean processing time (s)",
1000000*makespan_throughput_per_dollar AS "Makespan throughput per dollar",
1000000*processing_throughput_per_dollar AS "Processing throughput per dollar",
min_runs_per_workload_item AS "Minimum runs per workload item"
FROM results_platform_price_adjusted
ORDER BY rank)
TO 'results_platform_price_adjusted.csv' (SEPARATOR '\t', QUOTE '');
""")