forked from Xilinx/Vitis-Tutorials
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhost.cpp
executable file
·447 lines (367 loc) · 15.9 KB
/
host.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
/*
* Copyright 2021 Xilinx, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <malloc.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <chrono>
#include "xcl2.hpp"
#include "cmdlineparser.h"
#include "opencv2/opencv.hpp"
#include "coefficients.h"
#include "common.h"
using namespace sda;
using namespace sda::utils;
#define RESET "\033[0m"
#define RED "\033[31m"
#define GREEN "\033[32m"
static void IplImage2Raw(IplImage* img, uchar* y_buf, int stride_y, uchar* u_buf, int stride_u, uchar* v_buf, int stride_v)
{
// Assumes RGB or YUV 4:4:4
for (int y = 0; y < img->height; y++)
{
for (int x = 0; x < img->width; x++)
{
CvScalar cv_pix = cvGet2D(img, y, x);
y_buf[y*stride_y+x] = (uchar)cv_pix.val[0];
u_buf[y*stride_u+x] = (uchar)cv_pix.val[1];
v_buf[y*stride_v+x] = (uchar)cv_pix.val[2];
}
}
}
static void Raw2IplImage(uchar* y_buf, int stride_y, uchar* u_buf, int stride_u, uchar* v_buf, int stride_v, IplImage* img )
{
// Assumes RGB or YUV 4:4:4
for (int y = 0; y < img->height; y++)
{
for (int x = 0; x < img->width; x++)
{
CvScalar cv_pix;
cv_pix.val[0] = y_buf[y*stride_y+x];
cv_pix.val[1] = u_buf[y*stride_u+x];
cv_pix.val[2] = v_buf[y*stride_v+x];
cvSet2D(img, y, x, cv_pix);
}
}
}
static void writeRawImage(
unsigned width, unsigned height, unsigned stride, unsigned depth, unsigned nchannels,
uchar* y_buf, uchar* u_buf, uchar* v_buf, std::string filename)
{
IplImage *dst = cvCreateImage(cvSize(width, height), depth, nchannels);
// Convert processed image from Raw to cvImage
Raw2IplImage(y_buf, stride, u_buf, stride, v_buf, stride, dst);
// Conver to cvMat
cvConvert( dst, cvCreateMat(height, width, CV_32FC3 ) );
// Write to disk
cvSaveImage(filename.c_str(), dst);
}
// -------------------------------------------------------------------------------------------
// Class used to manage requests to the Filter2D kernel
// -------------------------------------------------------------------------------------------
class Filter2DRequest
{
cl::Kernel kernel;
cl::CommandQueue q;
cl::Buffer coef_buffer;
cl::Buffer src_buffer;
cl::Buffer dst_buffer;
std::vector<cl::Event> events;
public:
Filter2DRequest(cl::Context &context, cl::Program &program, cl::CommandQueue &queue)
{
cl_int err;
q = queue;
OCL_CHECK(err, kernel = cl::Kernel(program,"Filter2DKernel", &err));
// Allocate input and output buffers
OCL_CHECK(err, coef_buffer = cl::Buffer(context, CL_MEM_READ_ONLY, (FILTER_V_SIZE*FILTER_V_SIZE)*sizeof(char), nullptr, &err));
OCL_CHECK(err, src_buffer = cl::Buffer(context, CL_MEM_READ_ONLY, (1920*1080)*sizeof(char), nullptr, &err));
OCL_CHECK(err, dst_buffer = cl::Buffer(context, CL_MEM_WRITE_ONLY, (1920*1080)*sizeof(char), nullptr, &err));
// Set kernel arguments - this pins the buffers to specific global memory banks
OCL_CHECK(err, err = kernel.setArg(0, coef_buffer));
OCL_CHECK(err, err = kernel.setArg(6, src_buffer));
OCL_CHECK(err, err = kernel.setArg(7, dst_buffer));
// Make buffers resident in the device
// If done after setArg, then buffers are pinned and runtime knows in which bank they should be made resident,
// removing the need for using the vendor extensions to explicitely map to DDR.
OCL_CHECK(err, err = q.enqueueMigrateMemObjects({coef_buffer, src_buffer, dst_buffer}, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED));
// Make sure buffers are migrated before continuing
q.finish();
};
void Filter2D(
const char coeffs[FILTER_V_SIZE][FILTER_H_SIZE],
float factor,
short bias,
unsigned short width,
unsigned short height,
unsigned short stride,
unsigned char *src,
unsigned char *dst )
{
assert(width <= 1920);
assert(height <= 1080);
assert(stride%64 == 0);
cl_int err;
cl::Event in1_event;
cl::Event in2_event;
cl::Event run_event;
cl::Event out_event;
size_t offset = 0;
int nbytes = stride*height*sizeof(char);
// If a previous transaction is pending, wait until it completes
finish();
// Set kernel arguments - since buffers are reused, no need to set these args each time...
// OCL_CHECK(err, err = kernel.setArg(0, coef_buffer));
OCL_CHECK(err, err = kernel.setArg(1, factor));
OCL_CHECK(err, err = kernel.setArg(2, bias));
OCL_CHECK(err, err = kernel.setArg(3, width));
OCL_CHECK(err, err = kernel.setArg(4, height));
OCL_CHECK(err, err = kernel.setArg(5, stride));
// OCL_CHECK(err, err = kernel.setArg(6, src_buffer));
// OCL_CHECK(err, err = kernel.setArg(7, dst_buffer));
// Schedule the writing of the inputs from host to device
OCL_CHECK(err, err = q.enqueueWriteBuffer(coef_buffer, CL_FALSE, offset, (FILTER_V_SIZE*FILTER_V_SIZE)*sizeof(char), &coeffs[0][0], nullptr, &in1_event) );
OCL_CHECK(err, err = q.enqueueWriteBuffer(src_buffer, CL_FALSE, offset, nbytes, src, nullptr, &in2_event) );
events.push_back(in1_event);
events.push_back(in2_event);
// Schedule the execution of the kernel
OCL_CHECK(err, err = q.enqueueTask(kernel, &events, &run_event));
events.push_back(run_event);
// Schedule the reading of the outputs from device back to host
OCL_CHECK(err, err = q.enqueueReadBuffer(dst_buffer, CL_FALSE, offset, nbytes, dst, &events, &out_event) );
events.push_back(out_event);
}
void finish()
{
if (events.size()>0) {
events.back().wait();
events.clear();
if (getenv("XCL_EMULATION_MODE") != NULL) {
printf(" finished Filter2DRequest\n");
}
}
}
};
// -------------------------------------------------------------------------------------------
// Class used to manage multiple simultaneous request to the Filter2D kernel
// Implements SW pipelining and scales with multiple CUs
// -------------------------------------------------------------------------------------------
class Filter2DDispatcher
{
std::vector<Filter2DRequest> req;
int max;
int cnt;
public:
Filter2DDispatcher(cl::Context &context, cl::Program &program, cl::CommandQueue &queue, int nreqs)
{
cnt = 0;
max = nreqs;
for(int i=0; i<max; i++) {
req.push_back( Filter2DRequest(context, program, queue) );
}
};
int operator () (
const char coeffs[FILTER_V_SIZE][FILTER_H_SIZE],
float factor,
short bias,
unsigned short width,
unsigned short height,
unsigned short stride,
unsigned char *src,
unsigned char *dst )
{
cnt++;
req[cnt%max].Filter2D(coeffs, factor, bias, width, height, stride, src, dst);
return (cnt%max);
}
void finish(int id) {
if (id<max) {
req[id].finish();
}
}
void finish() {
for(int i=0; i<max; i++) {
req[i].finish();
}
}
};
int main(int argc, char** argv)
{
printf("----------------------------------------------------------------------------\n");
printf("\n");
printf("Xilinx 2D Filter Example Application\n");
printf("\n");
// ---------------------------------------------------------------------------------
// Parse command line
// ---------------------------------------------------------------------------------
CmdLineParser parser;
parser.addSwitch("--nruns", "-n", "Number of times to image is processed", "1");
parser.addSwitch("--fpga", "-x", "FPGA binary (xclbin) file to use");
parser.addSwitch("--input", "-i", "Input image file");
parser.addSwitch("--filter", "-f", "Filter type (0-6)", "0");
parser.addSwitch("--maxreqs", "-r", "Maximum number of outstanding requests", "3");
parser.addSwitch("--compare", "-c", "Compare FPGA and SW performance", "false", true);
//parse all command line options
parser.parse(argc, argv);
string fpgaBinary = parser.value("fpga");
string inputImage = parser.value("input");
int numRuns = parser.value_to_int("nruns");
unsigned filterType = parser.value_to_int("filter");
int maxReqs = parser.value_to_int("maxreqs");
bool comparePerf = parser.value_to_bool("compare");
if (fpgaBinary.size() == 0) {
printf("ERROR: FPGA binary file (.xclbin) must be specified with the -x command line switch\n");
return -1;
}
if (inputImage.size() == 0) {
printf("ERROR: input image file must be specified using -i command line switch\n");
return -1;
}
if (filterType>6) {
printf("ERROR: Supported filter type values are [0:6]\n");
return -1;
}
printf("FPGA binary : %s\n", fpgaBinary.c_str());
printf("Input image : %s\n", inputImage.c_str());
printf("Number of runs : %d\n", numRuns);
printf("Filter type : %d\n", filterType);
printf("Max requests : %d\n", maxReqs);
printf("Compare perf. : %d\n", comparePerf);
printf("\n");
// ---------------------------------------------------------------------------------
// Read input image and format inputs
// ---------------------------------------------------------------------------------
std::string srcFileName = inputImage;
// Read Input image
IplImage *src;
src = cvLoadImage(srcFileName.c_str()); //format is BGR
if(!src) {
printf("ERROR: Loading image %s failed\n", srcFileName.c_str());
return -1;
}
unsigned width = src->width;
unsigned height = src->height;
unsigned depth = src->depth;
unsigned nchannels = src->nChannels;
unsigned stride = ceil(width/64.0)*64;
unsigned nbytes = (stride*height);
// Input and output buffers (Y,U, V)
unsigned char *y_src = (unsigned char *)malloc(nbytes);
unsigned char *u_src = (unsigned char *)malloc(nbytes);
unsigned char *v_src = (unsigned char *)malloc(nbytes);
unsigned char *y_dst = (unsigned char *)malloc(nbytes);
unsigned char *u_dst = (unsigned char *)malloc(nbytes);
unsigned char *v_dst = (unsigned char *)malloc(nbytes);
// Convert CV Image to AXI video data
IplImage2Raw(src, y_src, stride, u_src, stride, v_src, stride);
// Release allocated memory
cvReleaseImage(&src);
// Retrieve filter factor and bias
float factor = filterFactors[filterType];
short bias = filterBiases[filterType];
// ---------------------------------------------------------------------------------
// Load XCLBIN file, create OpenCL context, device and program
// ---------------------------------------------------------------------------------
printf("Programming FPGA device\n");
cl_int err;
std::vector<cl::Device> devices = xcl::get_xil_devices();
devices.resize(1); // (arbitrarily) use the first Xilinx device that is found
OCL_CHECK(err, cl::Context context(devices[0], NULL, NULL, NULL, &err));
unsigned fileBufSize;
char* fileBuf = xcl::read_binary_file(fpgaBinary.c_str(), fileBufSize);
cl::Program::Binaries bins{{fileBuf, fileBufSize}};
OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err));
OCL_CHECK(err, cl::CommandQueue queue(context, devices[0], cl::QueueProperties::Profiling | cl::QueueProperties::OutOfOrder, &err));
// ---------------------------------------------------------------------------------
// Make requests to kernel(s)
// ---------------------------------------------------------------------------------
printf("Running FPGA accelerator on %d images\n", numRuns);
// Dispatcher of requests to the kernel
// 'maxReqs' controls the maximum number of outstanding requests to the kernel
// and equates to the depth of SW pipelining.
Filter2DDispatcher Filter2DKernel(context, program, queue, maxReqs);
auto fpga_begin = std::chrono::high_resolution_clock::now();
for(int n=0; n<numRuns; n++)
{
// Enqueue independent requests to Blur Y, U and V planes
// Requests will run sequentially if there is a single kernel
// Requests will run in parallel is there are two or more kernels
Filter2DKernel(filterCoeffs[filterType], factor, bias, width, height, stride, y_src, y_dst);
Filter2DKernel(filterCoeffs[filterType], factor, bias, width, height, stride, u_src, u_dst);
Filter2DKernel(filterCoeffs[filterType], factor, bias, width, height, stride, v_src, v_dst);
}
Filter2DKernel.finish();
auto fpga_end = std::chrono::high_resolution_clock::now();
// Write image to file
writeRawImage(width, height, stride, depth, nchannels, y_dst, u_dst, v_dst, inputImage.substr(0, inputImage.size()-4)+"_out.bmp");
// ---------------------------------------------------------------------------------
// Compute reference results and compare
// ---------------------------------------------------------------------------------
if (comparePerf) {
printf("Running Software version\n");
}
// Create output buffers for reference results
unsigned char *y_ref = (unsigned char *)malloc(nbytes);
unsigned char *u_ref = (unsigned char *)malloc(nbytes);
unsigned char *v_ref = (unsigned char *)malloc(nbytes);
unsigned int numRunsSW = comparePerf?numRuns:1;
auto cpu_begin = std::chrono::high_resolution_clock::now();
#pragma omp parallel for num_threads(3)
for(unsigned int n=0; n<numRunsSW; n++)
{
// Compute reference results
Filter2D(filterCoeffs[filterType], factor, bias, width, height, stride, y_src, y_ref);
Filter2D(filterCoeffs[filterType], factor, bias, width, height, stride, u_src, u_ref);
Filter2D(filterCoeffs[filterType], factor, bias, width, height, stride, v_src, v_ref);
}
auto cpu_end = std::chrono::high_resolution_clock::now();
// Write image to file
writeRawImage(width, height, stride, depth, nchannels, y_ref, u_ref, v_ref, inputImage.substr(0, inputImage.size()-4)+"_ref.bmp");
printf("Comparing results\n");
// Compare results
bool diff = false;
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
if ( y_dst[y*stride+x] != y_ref[y*stride+x] ) diff = true;
if ( u_dst[y*stride+x] != u_ref[y*stride+x] ) diff = true;
if ( v_dst[y*stride+x] != v_ref[y*stride+x] ) diff = true;
}
}
if(diff) {
printf("\n%sTest FAILED: Output has mismatches with reference%s\n", RED, RESET);
} else {
printf("\n%sTest PASSED: Output matches reference%s\n", GREEN, RESET);
}
// Report performance (if not running in emulation mode)
if (getenv("XCL_EMULATION_MODE") == NULL) {
std::chrono::duration<double> fpga_duration = fpga_end - fpga_begin;
std::chrono::duration<double> cpu_duration = cpu_end - cpu_begin;
float fpga_throughput = (double) numRuns*3*nbytes / fpga_duration.count() / (1024.0*1024.0);
float cpu_throughput = (double) numRuns*3*nbytes / cpu_duration.count() / (1024.0*1024.0);
printf("\n");
printf("FPGA Time : %10.4f s\n", fpga_duration.count());
printf("FPGA Throughput : %10.4f MB/s\n", fpga_throughput);
if (comparePerf) {
printf("CPU Time : %10.4f s\n", cpu_duration.count());
printf("CPU Throughput : %10.4f MB/s\n", cpu_throughput);
printf("FPGA Speedup : %10.4f x\n", cpu_duration.count() / fpga_duration.count());
}
}
printf("----------------------------------------------------------------------------\n");
return (diff?1:0);
}