This repository has been archived by the owner on Apr 14, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpredict.py
95 lines (78 loc) · 4.83 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import yaml
from glob import glob
import hydra
from hydra.utils import to_absolute_path
from omegaconf import DictConfig
import mlflow
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.models import load_model
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.config.experimental.set_virtual_device_configuration(physical_devices[0],
[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10*1024)])
@hydra.main(config_path='configs', config_name='predict')
def main(cfg: DictConfig) -> None:
mlflow.set_tracking_uri(f'file://{to_absolute_path(cfg["path_to_mlflow"])}')
# setup gpu
physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices[cfg["gpu_id"]], True)
tf.config.set_logical_device_configuration(
physical_devices[cfg["gpu_id"]],
[tf.config.LogicalDeviceConfiguration(memory_limit=cfg["memory_limit"]*1024)])
logical_gpus = tf.config.list_logical_devices('GPU')
print(len(physical_devices), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
print('\n-> Loading model\n')
if cfg["checkpoint"] is not None:
path_to_model = to_absolute_path(f'{cfg["path_to_mlflow"]}/{cfg["experiment_id"]}/{cfg["run_id"]}/artifacts/checkpoints/{cfg["checkpoint"]}')
else:
path_to_model = to_absolute_path(f'{cfg["path_to_mlflow"]}/{cfg["experiment_id"]}/{cfg["run_id"]}/artifacts/model/')
with mlflow.start_run(experiment_id=cfg["experiment_id"], run_id=cfg["run_id"]) as active_run:
trained_with_custom_schedule = 'CustomSchedule' in active_run.data.params["opt_learning_rate"]
if trained_with_custom_schedule: # have to pass schedule signature as custom_objects
model = load_model(path_to_model, {'CustomSchedule': lambda **unused_kwargs: None})
else:
model = load_model(path_to_model)
if cfg["n_files"] == -1: # take all the files
paths = glob(to_absolute_path(f'{cfg["path_to_dataset"]}/{cfg["dataset_name"]}/{cfg["dataset_type"]}/*/{cfg["tau_type"]}'))
else: # take only first n_files
paths = glob(to_absolute_path(f'{cfg["path_to_dataset"]}/{cfg["dataset_name"]}/{cfg["dataset_type"]}/*/{cfg["tau_type"]}'))[:cfg["n_files"]]
for p in paths:
file_name = p.split('/')[-2]
dataset = tf.data.experimental.load(p)
dataset = dataset.batch(cfg["batch_size"])
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
# load cfg used to produce dataset and retrieve column names
with open(to_absolute_path(f'{p}/cfg.yaml'), 'r') as f:
dataset_cfg = yaml.safe_load(f)
label_column_names = dataset_cfg["label_columns"]
add_column_names = dataset_cfg['data_cfg']['input_data'][cfg["dataset_type"]]["add_columns"]
print(f'\n-> Predicting {file_name}')
predictions, labels, add_columns = [], [], []
for (*X, y, add_data) in dataset:
predictions.append(model.predict(X))
labels.append(y)
add_columns.append(add_data)
predictions = tf.concat(predictions, axis=0).numpy()
labels = tf.concat(labels, axis=0).numpy()
add_columns = tf.concat(add_columns, axis=0).numpy()
# log to mlflow and delete intermediate file
with mlflow.start_run(experiment_id=cfg["experiment_id"], run_id=cfg["run_id"]) as active_run:
# extract mapping between model nodes and corresponding label names
model_node_to_name = {k: v for k,v in active_run.data.params.items() if k.startswith('model_node_')}
model_node_to_name = {int(k.split("model_node_")[-1]): v for k,v in model_node_to_name.items()}
model_node_names = [model_node_to_name[k] for k in sorted(model_node_to_name)]
predictions = pd.DataFrame(data=predictions, columns=[f'pred_{tau_type}' for tau_type in model_node_names])
labels = pd.DataFrame(data=labels, columns=label_column_names, dtype=np.int64)
add_columns = pd.DataFrame(data=add_columns, columns=add_column_names)
print(f' Saving to hdf5\n')
predictions.to_hdf(f'{cfg["output_filename"]}.h5', key='predictions', mode='w', format='fixed', complevel=1, complib='zlib')
labels.to_hdf(f'{cfg["output_filename"]}.h5', key='labels', mode='r+', format='fixed', complevel=1, complib='zlib')
add_columns.to_hdf(f'{cfg["output_filename"]}.h5', key='add_columns', mode='r+', format='fixed', complevel=1, complib='zlib')
mlflow.log_artifact(f'{cfg["output_filename"]}.h5', f'predictions/{cfg["dataset_name"]}/{file_name}/{cfg["tau_type"]}')
os.remove(f'{cfg["output_filename"]}.h5')
if __name__ == '__main__':
main()