""" test cpp embedding:
# pass variable from cpp to python and return
# uses pybind11 with MPI
"""
import sys, time, logging, numpy as np, scipy as sp
import scipy.ndimage  # ensure sp.ndimage is loaded (not lazily imported on older scipy versions)
from mpi4py import MPI
import torch
import torch.nn as nn
"""
allocate basic helpers defined in cpp
"""
def get_topology(crank,cd_glob,cn_glob,cm_glob,cl_glob):
    global rank, d_glob, n_glob, m_glob, l_glob
    rank = crank      # cpp rank
    d_glob = cd_glob  # cpp global dimensions
    n_glob = cn_glob  # cpp topology information on n-axis
    m_glob = cm_glob  # cpp topology information on m-axis
    l_glob = cl_glob  # cpp topology information on l-axis
"""
fast interpolate input to any desired dimensions
ussing scipy's map_coordinates function
https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.map_coordinates.html
"""
def interpolate(field,prank,fac=1):
    # start a timer
    st = time.perf_counter()
    # map to any arbitrary dimensions
    ni,nj,nk = field.shape[:]  # here simply scaled by a common factor
    desired_x = ni*fac
    desired_y = nj*fac
    desired_z = nk*fac
    # create 3D list of coordinates
    des_dims = []
    for inp_l, out_l in zip(field.shape, (desired_x,desired_y,desired_z)):
        des_dims.append(np.linspace(0, inp_l-1, out_l))
    # create meshgrid from list
    coords = np.meshgrid(*des_dims, indexing='ij')
    # map the data to the new coordinates; order=0 keeps it fast (nearest-neighbour copy, no actual interpolation)
    res = sp.ndimage.map_coordinates(field, coords, order=0, mode='nearest', cval=0, prefilter=False)
    # end timer
    if prank==0:
        logging.info('interpolate time: {:.2f} s'.format(time.perf_counter()-st))
    return res
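"""
hedged usage sketch (not part of the cpp coupling): upscale a small random
field by a factor of 2 on a single rank; the field size here is made up
purely for illustration
"""
def _demo_interpolate():
    demo = np.random.rand(4, 4, 4)               # hypothetical 4x4x4 field
    demo_i = interpolate(demo, prank=0, fac=2)   # nearest-neighbour upscale
    assert demo_i.shape == (8, 8, 8)             # each axis doubled by fac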
"""
split the data in MPI rank(s) to #GPU(s)
e.g., 5 MPI ranks (prank) in 2 GPUs (grank) will then
1. grank will've data from 0:2 pranks
2. grank will've data from 2:5 pranks
last grank will've leftover prank
note: psize is the MPI world size
"""
def split_to_gpu(field,grank,psize,gsize):
    # start index, depending on grank
    n1 = grank * psize // gsize
    # end index, depending on grank
    n2 = n1 + psize // gsize
    # leftover pranks go to the last grank
    if grank==gsize-1:
        n2 += psize % gsize
    return field[n1:n2,:,:,:]
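"""
worked example for the split above (illustration only): with psize=5 ranks
and gsize=2 GPUs, grank 0 owns slices 0:2 and grank 1 owns slices 2:5,
i.e. 2 regular slices plus the 1 leftover
"""
def _demo_split_to_gpu():
    dummy = np.zeros((5, 2, 2, 2))                         # one slice per prank
    part0 = split_to_gpu(dummy, grank=0, psize=5, gsize=2)
    part1 = split_to_gpu(dummy, grank=1, psize=5, gsize=2)
    assert part0.shape[0] == 2 and part1.shape[0] == 3     # 2 + 3 = 5 pranks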
"""
example model for MNIST training - https://github.com/CSCfi/pytorch-ddp-examples/blob/master/mnist_ddp.py
"""
class CAE(nn.Module):
    def __init__(self, num_classes=10):
        super(CAE, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out
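"""
hedged shape check (assumes 28x28 single-channel MNIST-style input): the two
stride-2 poolings reduce 28x28 to 7x7 at 32 channels, which is why the final
linear layer expects 7*7*32 input features
"""
def _demo_cae_shapes():
    net = CAE()
    batch = torch.zeros(2, 1, 28, 28)    # dummy batch of two 28x28 images
    assert net(batch).shape == (2, 10)   # one logit per class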
"""
restore model from a saved file
"""
def model_restore(model,grank):
    res_name = 'checkpoint.pth.tar'
    # remap tensors saved from cuda:0 to this process's GPU
    loc = {'cuda:%d' % 0: 'cuda:%d' % grank}
    checkpoint = torch.load('./'+res_name, map_location=loc)
    # only the state_dict is needed
    model.load_state_dict(checkpoint['state_dict'])
    # switch to evaluation mode (fixes batch-norm statistics)
    model.eval()
    return model
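"""
hedged sketch of the save side that model_restore assumes: the file name and
the 'state_dict' key must match; any extra keys the original training run
stored (optimizer state, epoch, ...) are unknown here
"""
def _demo_model_save(model):
    torch.save({'state_dict': model.state_dict()}, './checkpoint.pth.tar')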
"""
main coupling routine called from cpp: interpolates the local field,
gathers it from all MPI ranks, splits it across the GPU(s) and runs it
through the restored model
"""
def torch_couple(field):
    # start timer
    st = time.perf_counter()
    # debug
    logging.basicConfig(format='%(levelname)s: %(message)s', stream=sys.stdout, level=logging.INFO)
    # define MPI
    comm = MPI.COMM_WORLD
    prank = comm.Get_rank()                   # CPU rank
    psize = comm.Get_size()                   # CPU world size
    grank = prank%torch.cuda.device_count()   # GPU rank
    gsize = torch.cuda.device_count()         # GPU world size
    assert rank == prank                      # check MPI ranks against cpp
    if prank==0:
        print('Torch:')
        logging.info(f'CPU ranks: {psize} / GPU ranks: {gsize}')
    # convert the incoming buffer to a torch tensor
    field = torch.asarray(field)
    # interpolate
    field_i = interpolate(field,prank,fac=2)
    # gather from all ranks
    field_global = comm.allgather(field_i)  # leads to dim: ranks,n,m,l
    field_global = torch.asarray(np.array(field_global))
    # selective GPU distribution
    device = 'cuda:'+str(grank)
    # split data across the GPU(s)
    inputs = split_to_gpu(field_global,grank,psize,gsize).to(device)
    # init model
    model = CAE().to(device)
    # restore model
    model = model_restore(model,grank)
    # apply the input to the model to get the desired output
    with torch.no_grad():
        # fold the l-axis into the batch: (ranks,n,m,l) -> (ranks*l, 1, n, m) single-channel images
        inputs = inputs.permute(0,3,1,2).reshape(inputs.size()[0]*inputs.size()[-1],1,*inputs.size()[1:3]).float()
        outputs = model(inputs).float()
    # compute the maximum value
    test = torch.max(outputs)
    # timer end
    if prank==0:
        logging.info('final time: {:.2f} s'.format(time.perf_counter()-st))
    # return a plain Python scalar back to cpp
    return test.item()
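"""
hedged stand-in for the cpp host (illustration only): via pybind11 the cpp
side would import this module, call get_topology with its rank/topology and
then hand a field to torch_couple; a real run needs MPI, a CUDA device and
the checkpoint file, so only the cheap demos are executed here
"""
if __name__ == '__main__':
    get_topology(0, (4, 4, 4), 1, 1, 1)  # hypothetical single-rank topology
    _demo_interpolate()
    _demo_split_to_gpu()
    _demo_cae_shapes()
    # torch_couple(np.random.rand(4, 4, 4))  # needs MPI + GPU + checkpoint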
# eof