# compress.py
import math
import numpy as np
import pandas as pd
import logging
import time
import torch
import argparse
from datetime import datetime

from deep_squeeze.autoencoder import AutoEncoder
from deep_squeeze.preprocessing import ds_preprocessing
from deep_squeeze.train_loop import train
from deep_squeeze.materialization import materialize, materialize_with_post_binning, \
    materialize_with_bin_difference
from deep_squeeze.disk_storing import store_on_disk, calculate_compression_ratio
from deep_squeeze.experiment import repeat_n_times, display_compression_results, run_full_experiments, \
    run_scaling_experiment, baseline_compression_ratios
from deep_squeeze.bayesian_optimizer import minimize_comp_ratio

logging.basicConfig(level=logging.INFO, format='%(levelname)s | %(asctime)s | %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S')

compression_repeats = 1
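# Note: the repeat_n_times decorator runs the pipeline n times and aggregates the
# returned ratios; the unpacking at the bottom of the file suggests it returns a
# (mean, std)-style tuple rather than a single float.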
@repeat_n_times(n=compression_repeats) # To produce a consistent result we repeat the experiment n times
def compression_pipeline(params):
    """
    The full compression pipeline, performing the following steps:

    1. Preprocess the input table (scaling and quantization)
    2. Initialize the autoencoder model
    3. Train the autoencoder
    4. Materialize the results by retrieving the codes and their respective failures
    5. Store the codes, the failures, the model and the scaler on disk
    6. Compute the final compression ratio we achieved, which is our main evaluation metric

    Args:
        params: A dictionary of hyperparameters (see main below for an example)
    """
    start_time = time.time()

    # Check if a CUDA-enabled GPU exists
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.debug(f"Running with: {device}")

    # Read and preprocess the data
    logging.debug("Reading and preprocessing data...")
    raw_table = np.array(pd.read_csv(params['data_path'], header=None))
    quantized, scaler = ds_preprocessing(raw_table, params['error_threshold'], min_val=0, max_val=1)
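    # `quantized` is the table scaled into [0, 1] and quantized according to the error
    # threshold; `scaler` is kept so that decompression can later invert the scaling.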
    params['features'] = quantized.shape[1]  # We need to store the feature count for decompression
    logging.debug("Done\n")

    # Create the model and send it to the GPU (if one exists)
    logging.debug("Creating model...")
    ae = AutoEncoder(quantized.shape[1], params['code_size'], params['width_multiplier'], params['ae_depth'])
    ae.to(device)
    logging.debug("Done\n")

    # If the dataset is too big we first train on a sample of it;
    # the final compressed file is still produced from the whole table, not just the sample
    sample_data_size = int(min([params['sample_max_size'], len(quantized)]))
    sample_data_inds = np.random.choice(len(quantized), sample_data_size, replace=False)
    sample_data = quantized[sample_data_inds, :]

    # Train the autoencoder
    logging.debug("Training...")
    model, loss = train(ae, device, sample_data, epochs=params['epochs'],
                        batch_size=sample_data.shape[0] // params['batch_size'], lr=params['lr'])
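    # Note that params['batch_size'] is used as a divisor above, so it effectively sets the
    # number of batches drawn from the sample; the value passed to train() is the rows per batch.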
    logging.debug(f"Training finished. Final loss: {float(loss):.3f}")

    # Set the model to eval mode
    model.eval()

    # Materialization step
    if params['binning_strategy'] == "POST_BINNING":
        codes, failures = materialize_with_post_binning(model, quantized, device, params['error_threshold'])
    elif params['binning_strategy'] == "BIN_DIFFERENCE":
        codes, failures = materialize_with_bin_difference(model, quantized, device, params['error_threshold'])
    elif params['binning_strategy'] == "NONE":
        codes, failures = materialize(model, quantized, device)
    else:
        raise ValueError("Available binning strategies: \"NONE\", "
                         "\"POST_BINNING\", \"BIN_DIFFERENCE\"")

    # Store the final file on disk
    comp_path = store_on_disk(params['compression_path'], model, codes, failures, scaler, params)
    total_time = time.time() - start_time

    # Log the final compression ratio DeepSqueeze achieved
    comp_ratio, comp_size, orig_size = calculate_compression_ratio(params['data_path'], comp_path)
    logging.debug(
        f"Compression ratio: {(comp_ratio * 100):.2f}% ({comp_size * 1e-6:.2f}MB / {orig_size * 1e-6:.2f}MB) | "
        f"Time: {total_time:.2f}s")

    return comp_ratio

if __name__ == '__main__':
    params = {
        "epochs": 1,
        "ae_depth": 2,  # Value in the paper: 2
        "width_multiplier": 2,  # Value in the paper: 2
        "batch_size": [1_000, 2_000],  # Tuned through Bayesian optimization
        "lr": 1e-4,
        "code_size": [1, 3],  # Tuned through Bayesian optimization
        "binning_strategy": "POST_BINNING",  # One of "NONE", "POST_BINNING", "BIN_DIFFERENCE"
        "sample_max_size": 2e5
    }
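    # The list-valued entries above are the (low, high) search bounds handed to the
    # Bayesian optimizer below; the scalar entries stay fixed throughout the search.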
    # Parse the input arguments: input file, output file and error threshold
    parser = argparse.ArgumentParser(description='Give the input, output and error threshold.')
    parser.add_argument('-i', '--input', type=str, help='path to the input table', required=True)
    parser.add_argument('-o', '--output', type=str, help='path to the compressed file', required=True)
    parser.add_argument('-e', '--error', type=float, help='percentage [0, 100] of error allowed', required=True)
    args = parser.parse_args()

    params['data_path'] = args.input
    params['compression_path'] = args.output
    params['error_threshold'] = args.error / 100  # Transform the percentage [0, 100] to the [0, 1] range
    # Get the starting date and time for logging
    today = datetime.now().strftime("%d_%m_%Y__%HH_%MM_%SS")

    # __________ Bayesian optimization run __________
    logging.info("Starting Bayesian optimization, fine-tuning code size and batch size\n")
    best_params = minimize_comp_ratio(compression_pipeline, params)['params']

    # __________ Best parameters run __________
    logging.info("Creating the final compressed file with the best parameters found so far")
    for par, val in best_params.items():
        params[par] = int(val)  # Overwrite the search ranges with the best values found
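    # For the final run, lift the sample cap so the autoencoder trains on the full table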
    params['sample_max_size'] = math.inf
    comp_ratio, _ = compression_pipeline(params)
    logging.info(f"Finished. Final compression ratio: {(comp_ratio * 100):.2f}%")
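
# Example invocation (hypothetical paths and values, shown only for illustration):
#   python compress.py -i datasets/table.csv -o compressed/table_dsq -e 0.5
# This would compress datasets/table.csv with a 0.5% per-value error budget, first
# tuning code_size and batch_size via Bayesian optimization and then writing the
# final artifact (codes, failures, model and scaler) to compressed/table_dsq.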