/**
 * bmp_processing_better.cpp
 * Example using the parallel work-group writers; this gives a large performance improvement on GPUs over `bmp_processing.cpp`.
 */
#include <cassert>
#include <cstdio>
#include <iostream>

#include <sycl/sycl.hpp>
#include <sycl_fs.hpp>
#include <tools/sycl_queue_helpers.hpp>
#include <tools/usm_smart_ptr.hpp>
#include <tools/bmp_io.hpp>
#include <tools/scope_chrono.hpp>
class generating_kernel;
class processing_kernel;
using namespace usm_smart_ptr;
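/**
 * Per-image pixel allocation count: width * height rounded up past the next multiple of 4
 * (presumably so each work group's buffer region ends on a 4-pixel boundary; note that a full
 * 4 pixels are added even when width * height is already a multiple of 4).
 */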
size_t get_alloc_count(size_t width, size_t height) {
    return width * height + 4 - ((width * height) % 4);
}
/**
 * Each work group processes several pictures, one at a time, and only then do all of its work-items write the picture to the hard drive.
 * This lets us keep the number of parallel writers small, which reduces the number of channels needed, reduces memory usage and, more importantly,
 * avoids having to stop the GPU. Note that the memory allocation size is independent of the number of pictures to process, as is the
 * number of kernel submissions.
 */
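/*
 * For illustration (plain round-robin, as in the loops below): with 4 work groups and 10 files, group 0 handles
 * files 0, 4, 8, group 1 handles files 1, 5, 9, and so on. Only `work_groups` image buffers and channels are
 * ever needed, regardless of `file_count`.
 */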
size_t launch_image_generator(size_t file_count, sycl::queue &q, size_t work_groups, size_t work_items, const usm_shared_ptr<char, alloc::shared> &filenames, size_t filename_size, size_t width,
                              size_t height) {
    /* We initialise the file system API on the host */
    sycl::fs<uint8_t> fs(q, work_groups, bmp::get_buffer_size(width, height));
    /* Allocating the buffer for processing */
    usm_shared_ptr<pixel, alloc::device> gpu_image_buffer(get_alloc_count(width, height) * work_groups, q);
    q.submit([&, filenames = filenames.raw(), gpu_image_buffer = gpu_image_buffer.raw()](sycl::handler &cgh) {
        /* To create the parallel file accessor, we need to pass the sycl::handler in order to get access to local memory (shared within a work group) */
        auto image_writer = fs.get_access_work_group(cgh);
        sycl::stream os(1024, 256, cgh);
        cgh.parallel_for<generating_kernel>(sycl::nd_range<1>(work_items * work_groups, work_items), [=](sycl::nd_item<1> item) {
            const size_t work_group_id = item.get_group_linear_id();
            const size_t work_item_id = item.get_local_linear_id();
            const size_t channel_idx = work_group_id;
            pixel *work_group_image_buffer = gpu_image_buffer + channel_idx * get_alloc_count(width, height); // Each work group has its own memory region
            /* Iterating over the pictures to be processed by the current work group */
            for (size_t processed_file_id = work_group_id; processed_file_id < file_count; processed_file_id += work_groups) {
                const char *filename_ptr = filenames + filename_size * processed_file_id; // Getting the file name pointer
                /* Writing dummy data to the buffer with the work items, in a packed manner */
                for (size_t i = work_item_id; i < width * height; i += work_items) {
                    work_group_image_buffer[i] = yuv_2_rgb((50 * work_group_id + i % width) % 256, (50 * work_group_id + i / height) % 256, 150);
                }
                /* Writing the picture with all the work items in parallel. The first work item opens/closes the file, but
                 * the actual writing is done in parallel, using all the work items */
                if (!bmp::save_picture_work_group(item, channel_idx, image_writer, filename_ptr, width, height, work_group_image_buffer)) {
                    os << "Failure saving: " << filename_ptr << sycl::endl;
                    image_writer.abort_host();
                }
            } // Back to the for loop over the pictures
        }); // parallel_for
    }).wait();
    std::cout << "Generating pass done!" << std::endl;
    return 3 * width * height * file_count; // Data processed
}
/**
 * A small example of how to read the files back (fundamentally the same as the previous function)
 */
size_t launch_image_checker(size_t file_count, sycl::queue &q, size_t work_groups, size_t work_items, const usm_shared_ptr<char, alloc::shared> &filenames, size_t filename_size, size_t width,
                            size_t height) {
    /**
     * Replace with sycl::fs<uint8_t, true, true> to start using DMA
     */
    sycl::fs<uint8_t> fs(q, work_groups, bmp::get_buffer_size(width, height));
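    /* A commented-out sketch of the DMA variant mentioned above (assuming the two extra template
     * parameters enable DMA, as the comment suggests; not verified here):
     * sycl::fs<uint8_t, true, true> fs(q, work_groups, bmp::get_buffer_size(width, height));
     */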
    /* Allocating the buffer for processing */
    usm_shared_ptr<pixel, alloc::device> device_image_buffer(get_alloc_count(width, height) * work_groups, q);
    q.submit([&, filenames = filenames.raw(), device_image_buffer = device_image_buffer.raw()](sycl::handler &cgh) {
        /* To create the parallel file accessor, we need to pass the sycl::handler in order to get access to local memory (shared within a work group) */
        auto image_accessor = fs.get_access_work_group(cgh);
        sycl::stream os(1024, 256, cgh);
        cgh.parallel_for<processing_kernel>(sycl::nd_range<1>(work_items * work_groups, work_items), [=](sycl::nd_item<1> item) {
            const size_t work_group_id = item.get_group_linear_id();
            const size_t work_item_id = item.get_local_linear_id();
            const size_t channel_idx = work_group_id;
            pixel *work_group_image_buffer = device_image_buffer + channel_idx * get_alloc_count(width, height);
            /* Iterating over the pictures to be processed by the current work group */
            for (size_t processed_file_id = work_group_id; processed_file_id < file_count; processed_file_id += work_groups) {
                const char *filename_ptr = filenames + filename_size * processed_file_id;
                /* Parallel picture loading using the work group */
                bmp::load_picture_work_group(item, channel_idx, image_accessor, filename_ptr, width, height, work_group_image_buffer);
                for (size_t i = work_item_id; i < width * height; i += work_items) {
                    const pixel expected = yuv_2_rgb((50 * work_group_id + i % width) % 256, (50 * work_group_id + i / height) % 256, 150);
                    if (work_group_image_buffer[i] != expected) {
                        os << "Error, got: " << work_group_image_buffer[i] << " instead of: " << expected << sycl::endl;
                        image_accessor.abort_host();
                    }
                }
                /* Parallel picture saving using the work group */
                bmp::save_picture_work_group(item, channel_idx, image_accessor, filename_ptr, width, height, work_group_image_buffer);
            }
        });
    }).wait();
    std::cout << "Read, check and rewrite pass done!" << std::endl;
    return 2 * 3 * width * height * file_count; // Data processed
}
int main(int, char **) {
    sycl::queue q = try_get_queue(sycl::gpu_selector{});
    std::cout << "Running on: " << q.get_device().get_info<sycl::info::device::name>() << std::endl;
    /* Settings */
    size_t files_to_process = 50;
    const char format_string[] = "my_file%06d.bmp";
    size_t width = 1920;
    size_t height = 1080;
    /* Generating the file names */
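    /* std::snprintf with a null buffer returns the number of characters that would have been written,
     * so this gives the formatted filename length (constant here thanks to the zero-padded %06d);
     * the +1 adds room for the terminating '\0'. */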
    const size_t filename_size = (size_t) std::snprintf(nullptr, 0, format_string, (int) files_to_process) + 1;
    assert(filename_size > 0 && "Wrong format string");
    usm_shared_ptr<char, alloc::shared> filenames(filename_size * files_to_process, q);
    for (size_t i = 0; i < files_to_process; ++i) {
        std::snprintf(filenames.raw() + i * filename_size, filename_size, format_string, (int) i);
    }
    /* nd_range settings are independent of the number of files to process */
    size_t work_item_count = 64;      // work items per work group
    size_t work_group_count = 24 * 4; // work groups
    /* Benchmarking and testing the batch image processor. */
    {
        scope_chrono c("Processing");
        /* Generating the pictures */
        size_t io_bytes = launch_image_generator(files_to_process, q, work_group_count, work_item_count, filenames, filename_size, width, height);
        /* Reading them back, checking that everything is correct, and writing them out again */
        io_bytes += launch_image_checker(files_to_process, q, work_group_count, work_item_count, filenames, filename_size, width, height);
        io_bytes += launch_image_checker(files_to_process, q, work_group_count, work_item_count, filenames, filename_size, width, height);
        std::cout << "Processed " << (double) io_bytes / (1024. * 1024.) << " MiB, bandwidth: " << (double) io_bytes / (1024. * 1024.) / c.stop() << " MiB/s " << std::endl;
    }
}