Skip to content

Commit 4ffc56a

Browse files
Add: Wideresnet (wang-xinyu#4) (wang-xinyu#518)
* Adding WideResnet C++ (wang-xinyu#2) * initialize wideresnet50 * add: wideresnet50 c++ code * add: wide resnet python (wang-xinyu#3) * add: wide resnet python * fix: typo Co-authored-by: makaveli <[email protected]> Co-authored-by: makaveli <[email protected]>
1 parent 1f7672c commit 4ffc56a

File tree

4 files changed

+675
-5
lines changed

4 files changed

+675
-5
lines changed

resnet/CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,9 @@ add_executable(resnext50 ${PROJECT_SOURCE_DIR}/resnext50_32x4d.cpp)
2929
target_link_libraries(resnext50 nvinfer)
3030
target_link_libraries(resnext50 cudart)
3131

32+
add_executable(wideresnet50 ${PROJECT_SOURCE_DIR}/wideresnet50.cpp)
33+
target_link_libraries(wideresnet50 nvinfer)
34+
target_link_libraries(wideresnet50 cudart)
35+
3236
add_definitions(-O2 -pthread)
3337

resnet/README.md

+29-5
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,18 @@ ResNet-18 and ResNet-50 model from "Deep Residual Learning for Image Recognition
44

55
For the Pytorch implementation, you can refer to [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet)
66

7+
Wide Resnet-50 model from "Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf> . For the Pytorch implementation, you can refer to [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz)
8+
79
The following techniques are used in this resnet; nothing special, just residual connections and batchnorm.
810

911
- Batchnorm layer, implemented with scale layer.
1012

1113
## TensorRT C++ API
1214

1315
```
14-
// 1. generate resnet18.wts or resnet50.wts from [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet)
16+
// 1a. generate resnet18.wts or resnet50.wts from [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet)
17+
18+
// 1b. generate wide_resnet50.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz)
1519
1620
// 2. put resnet18.wts, resnet50.wts or wide_resnet50.wts into tensorrtx/resnet
1721
@@ -35,16 +39,29 @@ or
3539
sudo ./resnet50 -s // serialize model to plan file i.e. 'resnet50.engine'
3640
sudo ./resnet50 -d // deserialize plan file and run inference
3741
42+
or
43+
44+
sudo ./resnext50 -s // serialize model to plan file i.e. 'resnext50.engine'
45+
sudo ./resnext50 -d // deserialize plan file and run inference
46+
47+
or
48+
49+
sudo ./wideresnet50 -s   // serialize model to plan file i.e. 'wide_resnet50.engine'
50+
sudo ./wideresnet50 -d   // deserialize plan file and run inference
51+
3852
39-
// 4. see if the output is same as pytorchx/resnet
53+
// 4. check that the output is the same as
54+
- [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet) - for resnet18, resnet50, resnext50
55+
- [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz) - for wide_resnet50
4056
```
4157

4258
### TensorRT Python API
4359

4460
```
45-
# 1. generate resnet50.wts from [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet)
61+
# 1a. generate resnet50.wts from [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet)
62+
# 1b. generate wide_resnet50.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz)
4663
47-
# 2. put resnet50.wts into tensorrtx/resnet
64+
# 2. put resnet50.wts or wide_resnet50.wts into tensorrtx/resnet
4865
4966
# 3. install Python dependencies (tensorrt/pycuda/numpy)
5067
@@ -53,5 +70,12 @@ cd tensorrtx/resnet
5370
python resnet50.py -s // serialize model to plan file i.e. 'resnet50.engine'
5471
python resnet50.py -d // deserialize plan file and run inference
5572
56-
# 4. see if the output is same as pytorchx/resnet
73+
or
74+
75+
python wide_resnet50.py -s // serialize model to plan file i.e. 'wide_resnet50.engine'
76+
python wide_resnet50.py -d // deserialize plan file and run inference
77+
78+
# 4. check that the output is the same as
79+
- pytorchx/resnet - for resnet50
80+
- BlueMirrors/torchtrtz - for wide_resnet50
5781
```

resnet/wide_resnet50.py

+275
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
import os
2+
import sys
3+
import struct
4+
import argparse
5+
6+
import numpy as np
7+
import pycuda.autoinit
8+
import pycuda.driver as cuda
9+
import tensorrt as trt
10+
11+
# Batch size handed to the builder and used to size the host input buffer.
BATCH_SIZE = 1
# NOTE(review): not referenced anywhere in the visible code; looks like a
# leftover duplicate of BATCH_SIZE. Kept so the module-level name still exists.
BS = 1

# Spatial size of the network input; the input tensor is (3, INPUT_H, INPUT_W).
INPUT_H = 224
INPUT_W = 224
# Number of outputs of the final fully connected layer.
OUTPUT_SIZE = 1000

# Names given to the network's input and output tensors.
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"

# Epsilon added to the running variance when batch norm is folded into a scale layer.
EPS = 1e-5

# Weight file read when building, and plan file written by -s / read by -d.
WEIGHT_PATH = "./wide_resnet50.wts"
ENGINE_PATH = "./wide_resnet50.engine"

# Logger shared by the TensorRT builder and runtime.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
24+
25+
26+
def load_weights(file):
    """Parse a ``.wts`` text file into a ``name -> numpy array`` mapping.

    Expected layout (whitespace separated)::

        <number of entries>
        <name> <value count> <hex> <hex> ...

    where every ``<hex>`` is the big-endian hex encoding of one 32-bit float.
    Blank lines are ignored.

    Args:
        file: Path of the weight file.

    Returns:
        dict mapping each entry name to a float32 array. Each value becomes
        one row, so a non-empty entry has shape ``(value count, 1)``.

    Raises:
        FileNotFoundError: if ``file`` does not exist.
        ValueError: if the file is empty, or the entry count / a per-entry
            value count does not match the file contents.
    """
    print(f"Loading weights: {file}")

    # Raise explicitly instead of asserting: asserts are stripped by
    # ``python -O`` and would let a bad file through silently.
    if not os.path.exists(file):
        raise FileNotFoundError(f"Unable to load weight file: {file}")

    with open(file, "r") as f:
        # Blank lines (typically a trailing newline at end of file) carry no
        # data and must not count as weight entries.
        stripped = (line.strip() for line in f)
        lines = [line for line in stripped if line]

    if not lines:
        raise ValueError(f"Weight file is empty: {file}")

    count = int(lines[0])
    if count != len(lines) - 1:
        raise ValueError(
            f"Weight file declares {count} entries but contains {len(lines) - 1}")

    weight_map = {}
    for line in lines[1:]:
        # split() without a separator tolerates tabs and repeated spaces.
        name, cur_count, *hex_values = line.split()
        if int(cur_count) != len(hex_values):
            raise ValueError(
                f"Entry '{name}' declares {cur_count} values but has {len(hex_values)}")
        # hex string -> 4 bytes -> float. unpack() returns a 1-tuple per value,
        # which is what gives the array one row per value.
        weight_map[name] = np.array(
            [struct.unpack(">f", bytes.fromhex(h)) for h in hex_values],
            dtype=np.float32)

    return weight_map
48+
49+
50+
def addBatchNorm2d(network, weight_map, inputs, layer_name, eps):
    """Add a batch-norm layer to ``network``, expressed as a per-channel scale layer.

    The parameters are read from ``weight_map`` under ``layer_name`` plus
    ".weight", ".bias", ".running_mean" and ".running_var", and folded into

        scale = gamma / sqrt(var + eps)
        shift = -mean / sqrt(var + eps) * gamma + beta

    Args:
        network: Network definition the layer is added to.
        weight_map: Mapping from weight name to numpy array.
        inputs: Tensor fed into the layer.
        layer_name: Prefix of the batch-norm entries in ``weight_map``.
        eps: Value added to the running variance before the square root.

    Returns:
        The layer returned by ``network.add_scale``.
    """
    def param(suffix):
        return weight_map[layer_name + suffix]

    gamma = param(".weight")
    beta = param(".bias")
    running_mean = param(".running_mean")
    running_var = param(".running_var")

    # Progress trace: layer name and number of entries in the variance array.
    print(layer_name + " " + str(len(running_var)))

    std = np.sqrt(running_var + eps)
    scale = gamma / std
    shift = -running_mean / std * gamma + beta

    return network.add_scale(input=inputs,
                             mode=trt.ScaleMode.CHANNEL,
                             shift=shift,
                             scale=scale)
64+
65+
66+
def bottleneck(network, weight_map, input, in_channels, out_channels, stride, layer_name):
    """Add one residual bottleneck block to ``network`` and return its final ReLU layer.

    Main path: 1x1 conv -> BN -> ReLU -> 3x3 conv (strided, padding 1) -> BN ->
    ReLU -> 1x1 conv to ``2 * out_channels`` maps -> BN. The shortcut is
    ``input`` itself, or a strided 1x1 conv + BN ("downsample.0" /
    "downsample.1" weights) when ``stride != 1`` or
    ``in_channels != 2 * out_channels``. Both paths are summed and passed
    through a ReLU. No convolution has a bias.

    Args:
        network: Network definition the layers are added to.
        weight_map: Mapping from weight name to numpy array.
        input: Tensor entering the block.
        in_channels: Channel count of ``input``; only used to decide whether
            the shortcut needs a projection.
        out_channels: Output maps of the first two convolutions.
        stride: Stride of the 3x3 convolution and of the shortcut projection.
        layer_name: Weight-name prefix of the block, e.g. "layer1.0.".

    Returns:
        The final ReLU activation layer of the block.
    """
    no_bias = trt.Weights()
    block_out_channels = out_channels * 2

    def add_conv(source, num_maps, kernel_hw, weight_key):
        layer = network.add_convolution(input=source,
                                        num_output_maps=num_maps,
                                        kernel_shape=kernel_hw,
                                        kernel=weight_map[layer_name + weight_key],
                                        bias=no_bias)
        assert layer
        return layer

    def add_bn(layer, bn_key):
        norm = addBatchNorm2d(network, weight_map, layer.get_output(0),
                              layer_name + bn_key, EPS)
        assert norm
        return norm

    def add_relu(layer):
        act = network.add_activation(layer.get_output(0),
                                     type=trt.ActivationType.RELU)
        assert act
        return act

    # Main path, first 1x1 convolution.
    first_act = add_relu(add_bn(add_conv(input, out_channels, (1, 1), "conv1.weight"), "bn1"))

    # Main path, 3x3 convolution carrying the block's stride.
    spatial_conv = add_conv(first_act.get_output(0), out_channels, (3, 3), "conv2.weight")
    spatial_conv.stride = (stride, stride)
    spatial_conv.padding = (1, 1)
    second_act = add_relu(add_bn(spatial_conv, "bn2"))

    # Main path, final 1x1 convolution to the block's output width.
    last_conv = add_conv(second_act.get_output(0), block_out_channels, (1, 1), "conv3.weight")
    main_path = add_bn(last_conv, "bn3")

    if stride == 1 and in_channels == block_out_channels:
        # Shapes already match: identity shortcut.
        shortcut = input
    else:
        # Project the input so it can be summed with the main path.
        projection = add_conv(input, block_out_channels, (1, 1), "downsample.0.weight")
        projection.stride = (stride, stride)
        shortcut = add_bn(projection, "downsample.1").get_output(0)

    merged = network.add_elementwise(shortcut, main_path.get_output(0),
                                     trt.ElementWiseOperation.SUM)
    assert merged

    return add_relu(merged)
133+
134+
135+
def create_engine(maxBatchSize, builder, config, dt):
    """Define the Wide ResNet-50 network layer by layer and build an engine.

    Weights are loaded from ``WEIGHT_PATH``. The network is: 7x7/2 conv -> BN
    -> ReLU -> 3x3/2 max pool -> four stages of bottleneck blocks (3, 4, 6, 3
    blocks) -> 7x7 average pool -> fully connected layer with ``OUTPUT_SIZE``
    outputs, whose output tensor is named ``OUTPUT_BLOB_NAME``.

    Args:
        maxBatchSize: Value assigned to ``builder.max_batch_size``.
        builder: TensorRT builder used to create the network and the engine.
        config: Builder config passed to ``builder.build_engine``.
        dt: Data type of the input tensor.

    Returns:
        The engine returned by ``builder.build_engine``.
    """
    weight_map = load_weights(WEIGHT_PATH)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    # empty weights for bias
    emptywts = trt.Weights()

    # Stem: 7x7 stride-2 convolution, batch norm, ReLU, 3x3 stride-2 max pool.
    conv1 = network.add_convolution(input=data,
                                    num_output_maps=64,
                                    kernel_shape=(7, 7),
                                    kernel=weight_map["conv1.weight"],
                                    bias=emptywts)
    assert conv1
    conv1.stride = (2, 2)
    conv1.padding = (3, 3)

    bn1 = addBatchNorm2d(network, weight_map, conv1.get_output(0), "bn1", EPS)
    assert bn1

    relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU)
    assert relu1

    pool1 = network.add_pooling(input=relu1.get_output(0),
                                window_size=trt.DimsHW(3, 3),
                                type=trt.PoolingType.MAX)
    assert pool1
    pool1.stride = (2, 2)
    pool1.padding = (1, 1)

    # (weight prefix, input channels of the first block, bottleneck width,
    #  stride of the first block, number of blocks)
    # Each block outputs 2 * width channels, so every block after the first
    # in a stage receives 2 * width channels and uses stride 1.
    stages = (
        ("layer1", 64, 128, 1, 3),
        ("layer2", 256, 256, 2, 4),
        ("layer3", 512, 512, 2, 6),
        ("layer4", 1024, 1024, 2, 3),
    )
    x = pool1
    for prefix, stage_in_channels, width, stage_stride, num_blocks in stages:
        for block_idx in range(num_blocks):
            is_first = block_idx == 0
            x = bottleneck(network, weight_map, x.get_output(0),
                           stage_in_channels if is_first else 2 * width,
                           width,
                           stage_stride if is_first else 1,
                           f"{prefix}.{block_idx}.")

    pool2 = network.add_pooling(x.get_output(0),
                                window_size=trt.DimsHW(7, 7),
                                type=trt.PoolingType.AVERAGE)
    assert pool2
    pool2.stride = (1, 1)

    fc1 = network.add_fully_connected(input=pool2.get_output(0),
                                      num_outputs=OUTPUT_SIZE,
                                      kernel=weight_map['fc.weight'],
                                      bias=weight_map['fc.bias'])
    assert fc1

    fc1.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(fc1.get_output(0))

    # Build engine
    builder.max_batch_size = maxBatchSize
    # The engine is built from ``config``, so the workspace limit has to be
    # set on the config; setting it on the builder has no effect on
    # build_engine(network, config).
    config.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)
    print("build out")

    # Drop the build-time objects before handing the engine back.
    del network
    del weight_map

    return engine
211+
212+
213+
def APIToModel(maxBatchSize):
    """Build the engine with a float32 input and write its serialized plan to ``ENGINE_PATH``.

    Args:
        maxBatchSize: Maximum batch size forwarded to ``create_engine``.
    """
    trt_builder = trt.Builder(TRT_LOGGER)
    build_config = trt_builder.create_builder_config()

    built_engine = create_engine(maxBatchSize, trt_builder, build_config, trt.float32)
    assert built_engine

    with open(ENGINE_PATH, "wb") as plan_file:
        plan_file.write(built_engine.serialize())

    # Release the engine before the builder that created it.
    del built_engine
    del trt_builder
223+
224+
225+
def doInference(context, host_in, host_out, batchSize):
    """Run one asynchronous inference and copy the result into ``host_out``.

    Device buffers sized from ``host_in.nbytes`` and ``host_out.nbytes`` are
    allocated, the input is copied to the device, the context is executed and
    the output is copied back, all on one CUDA stream that is synchronized
    before returning.

    Args:
        context: Execution context; its engine must have exactly two bindings.
        host_in: Host array holding the input data.
        host_out: Host array that receives the output.
        batchSize: Batch size passed to ``context.execute_async``.
    """
    engine = context.engine
    assert engine.num_bindings == 2

    device_in = cuda.mem_alloc(host_in.nbytes)
    device_out = cuda.mem_alloc(host_out.nbytes)
    # NOTE(review): assumes binding 0 is the input and binding 1 the output;
    # confirm against the engine's binding order.
    bindings = [int(device_in), int(device_out)]
    stream = cuda.Stream()

    cuda.memcpy_htod_async(device_in, host_in, stream)
    # Forward the requested batch size; omitting it always ran a batch of 1
    # regardless of what the caller asked for.
    context.execute_async(batch_size=batchSize,
                          bindings=bindings,
                          stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_out, device_out, stream)
    stream.synchronize()
238+
239+
240+
if __name__ == '__main__':
    # Command line: -s builds and serializes the engine, -d deserializes it
    # and runs one inference on an all-ones input.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", action='store_true')
    parser.add_argument("-d", action='store_true')
    args = parser.parse_args()

    # Exactly one of -s / -d must be given.
    if not (args.s ^ args.d):
        print(
            "arguments not right!\n"
            "python wide_resnet50.py -s   # serialize model to plan file\n"
            "python wide_resnet50.py -d   # deserialize plan file and run inference"
        )
        # Non-zero status so shells and scripts can detect the usage error.
        sys.exit(1)

    if args.s:
        APIToModel(BATCH_SIZE)
    else:
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime

        with open(ENGINE_PATH, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        # Dummy input: every element set to 1.
        num_inputs = BATCH_SIZE * 3 * INPUT_H * INPUT_W
        data = np.ones(num_inputs, dtype=np.float32)
        host_in = cuda.pagelocked_empty(num_inputs, dtype=np.float32)
        np.copyto(host_in, data.ravel())
        # One OUTPUT_SIZE vector per batch item, so the buffer stays large
        # enough if BATCH_SIZE is raised above 1.
        host_out = cuda.pagelocked_empty(BATCH_SIZE * OUTPUT_SIZE, dtype=np.float32)

        doInference(context, host_in, host_out, BATCH_SIZE)

        print(f'Output: \n{host_out[:10]}\n{host_out[-10:]}')

0 commit comments

Comments
 (0)