utils.py
import torch
import torch.distributed as dist
import numpy as np


def evaluate(model, loss_func, data_loader):
    """Compute the mean loss and accuracy of `model` over `data_loader`."""
    # device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = 'cpu'
    with torch.no_grad():
        val_accuracy = []
        val_loss = []
        model.eval()
        for data, target in data_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            _, pred = torch.max(output, dim=1)
            val_accuracy.append(torch.sum(pred == target).item() / len(pred))
            loss = loss_func(output, target)
            val_loss.append(loss.item())
    return np.mean(val_loss), np.mean(val_accuracy)

def collect_metrics(epoch, history, model, loss_func, opt, data_loader,
                    verbose=True):
    """Evaluate the local model, average the metrics over all workers, and
    record them in `history` on rank 0."""
    eval_loss, eval_acc = evaluate(model, loss_func, data_loader)
    # Average the locally computed metrics across workers.
    eval_loss = torch.tensor(eval_loss)
    dist.all_reduce(eval_loss, op=dist.ReduceOp.SUM)
    eval_loss /= dist.get_world_size()
    eval_acc = torch.tensor(eval_acc)
    dist.all_reduce(eval_acc, op=dist.ReduceOp.SUM)
    eval_acc /= dist.get_world_size()
    # Total bytes sent by the consensus optimizer, summed over workers.
    data_transferred = torch.tensor([opt.data_transferred])
    dist.all_reduce(data_transferred, op=dist.ReduceOp.SUM)
    if dist.get_rank() == 0:
        if verbose:
            print(
                'Epoch {} ::\tLoss = {},\tAccuracy = {},\tTransferred {}MB'
                .format(epoch, eval_loss.item(), eval_acc.item(),
                        data_transferred.item() / (2 ** 20))
            )
        history['acc'].append(eval_acc.item())
        history['loss'].append(eval_loss.item())
        history['interconnect'].append(data_transferred.item())

def consensus_train(model, loss_func, opt, train_loader,
                    n_epochs=10, verbose=True):
    """Train `model` with the consensus optimizer `opt`, collecting averaged
    metrics after every epoch."""
    # device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = 'cpu'
    train_history = {
        'acc': [],
        'loss': [],
        'interconnect': []
    }
    # Metrics before any training step (epoch 0).
    collect_metrics(0, train_history, model, loss_func, opt,
                    train_loader, verbose=verbose)
    for epoch in range(n_epochs):
        model.train()
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)

            def local_objective():
                output = model(data)
                return loss_func(output, target)

            # Consensus optimization step: the optimizer evaluates the local
            # objective and exchanges information with the other workers.
            opt.step(local_objective)
        collect_metrics(epoch + 1, train_history, model, loss_func, opt,
                        train_loader, verbose=verbose)
    return train_history
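
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original utils.py). It assumes
# the repository provides a consensus optimizer exposing a `step(closure)`
# method and a `data_transferred` byte counter, which is what
# `collect_metrics` and `consensus_train` above rely on. `ToyConsensusSGD`
# below is only an illustrative stand-in that averages gradients with
# `dist.all_reduce`; the project's real optimizer lives elsewhere.
if __name__ == '__main__':
    import os
    from torch.utils.data import DataLoader, TensorDataset

    # Single-process setup so the sketch runs without a launcher; in practice
    # one process per worker would be started (e.g. via torchrun).
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=0, world_size=1)

    class ToyConsensusSGD(torch.optim.SGD):
        """Stand-in consensus optimizer: averages gradients across workers
        and counts the bytes exchanged."""

        def __init__(self, params, lr=0.1):
            super().__init__(params, lr=lr)
            self.data_transferred = 0  # bytes sent by this worker

        def step(self, closure):
            # Evaluate the local objective and backpropagate.
            loss = closure()
            self.zero_grad()
            loss.backward()
            # Average gradients over all workers and track communication.
            world_size = dist.get_world_size()
            for group in self.param_groups:
                for p in group['params']:
                    if p.grad is None:
                        continue
                    dist.all_reduce(p.grad, op=dist.ReduceOp.SUM)
                    p.grad /= world_size
                    self.data_transferred += p.grad.numel() * p.grad.element_size()
            return super().step()

    # Tiny synthetic classification problem.
    x = torch.randn(256, 20)
    y = torch.randint(0, 3, (256,))
    loader = DataLoader(TensorDataset(x, y), batch_size=32, shuffle=True)

    model = torch.nn.Linear(20, 3)
    loss_func = torch.nn.CrossEntropyLoss()
    opt = ToyConsensusSGD(model.parameters(), lr=0.1)

    history = consensus_train(model, loss_func, opt, loader,
                              n_epochs=2, verbose=True)
    dist.destroy_process_group()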