// membwtest.mjs: memory-bandwidth ("membw") tests.
import { range } from "./util.mjs";
import { BasePrimitive } from "./primitive.mjs";
/**
 * Common base class for the memory-bandwidth ("membw") tests in this file.
 *
 * On construction it forwards `params` to BasePrimitive, then tags the
 * instance with `category = "membw"` and `trials = 10`. It also supplies the
 * `validate` callback shared by the subclasses.
 */
class BaseMembwTest extends BasePrimitive {
  /**
   * @param {object} params - Test parameters, forwarded to BasePrimitive.
   */
  constructor(params) {
    super(params); // writes parameters into this class
    this.category = "membw";
    // Presumably the number of repetitions per configuration; the value is
    // consumed outside this file.
    this.trials = 10;
  }

  /**
   * Checks that every output element equals the matching input element + 1,
   * which is what the copy kernels in this file compute.
   *
   * @param {ArrayLike<number>} memsrc - Input values given to the kernel.
   * @param {ArrayLike<number>} memdest - Output values read back from it.
   * @returns {string} "" when every element matches (including for empty
   *   input), otherwise a message describing the first mismatch.
   */
  validate = (memsrc, memdest) => {
    for (let i = 0; i < memsrc.length; i++) {
      // The kernels add in f32, so round the double-precision sum the same
      // way before comparing (assumes memdest holds f32 values, as the
      // array<f32> bindings in this file suggest).
      const expected = Math.fround(memsrc[i] + 1.0);
      if (expected !== memdest[i]) {
        return `Element ${i}: expected ${expected}, instead saw ${memdest[i]}.`;
      }
    }
    // Only report success after *all* elements have been inspected.
    return "";
  };
}
/**
 * Parameter sweep for the one-thread-per-element test. Every swept value is a
 * power of two; the exponent bounds are handed to range() from ./util.mjs
 * (whether the upper bound is inclusive is decided there).
 */
const MembwSimpleTestParams = (() => {
  const twoToThe = (exponent) => 2 ** exponent;
  return {
    workgroupSize: range(0, 7).map((e) => twoToThe(e)),
    memsrcSize: range(10, 25).map((e) => twoToThe(e)),
  };
})();
/**
 * Memory-bandwidth test in which each shader invocation handles exactly one
 * fp32 element: it reads memSrc[i] and writes memSrc[i] + 1.0 to memDest[i].
 */
export class MembwSimpleTestClass extends BaseMembwTest {
  /**
   * @param {object} params - Forwarded to the base class. After that call this
   *   constructor reads `memsrcSize` and `workgroupSize` from the instance.
   */
  constructor(params) {
    super(params);
    this.testname = "fp32-per-thread";
    this.description =
      "Copies input array to output array. One thread is assigned per 32b input element.";

    // Generated on demand, so the shader text always reflects the instance's
    // current workgroupSize. The element index is flattened from a
    // (possibly two-dimensional) dispatch: y * row width + x.
    this.kernel = () => {
      const wgSize = this.workgroupSize;
      return /* wgsl */ `
/* output */
@group(0) @binding(0) var<storage, read_write> memDest: array<f32>;
/* input */
@group(0) @binding(1) var<storage, read> memSrc: array<f32>;
@compute @workgroup_size(${wgSize}) fn memcpyKernel(
@builtin(global_invocation_id) id: vec3u,
@builtin(num_workgroups) nwg: vec3u,
@builtin(workgroup_id) wgid: vec3u) {
let i = id.y * nwg.x * ${wgSize} + id.x;
memDest[i] = memSrc[i] + 1.0;
}`;
    };

    const { memsrcSize, workgroupSize } = this;
    // Output has as many elements as the input.
    this.memdestSize = memsrcSize;
    // Elements read plus elements written, at 4 bytes per f32.
    this.bytesTransferred = 4 * (memsrcSize + this.memdestSize);
    // One invocation per element.
    this.workgroupCount = memsrcSize / workgroupSize;
  }

  /**
   * Plot descriptions for this test. Fields such as `bandwidth`,
   * `bandwidthCPU`, `time`, `cpuns` and `cpugpuDelta` are not set in this
   * file; presumably the benchmark harness adds them to each result row.
   *
   * NOTE(review): the x-axis label says "(B)", but `memsrcSize` looks like an
   * element count (bytesTransferred multiplies it by 4) - confirm the unit.
   */
  static plots = (() => {
    const caption =
      "Memory bandwidth test, 1 fp32 per thread (lines are workgroup size)";
    // Factories keep every plot entry backed by its own objects.
    const arraySizeAxis = () => ({
      field: "memsrcSize",
      label: "Copied array size (B)",
    });
    const byWorkgroupSize = () => ({ field: "workgroupSize" });

    return [
      {
        x: arraySizeAxis(),
        y: { field: "bandwidth", label: "Achieved bandwidth (GB/s)" },
        stroke: byWorkgroupSize(),
        caption,
      },
      {
        x: arraySizeAxis(),
        y: {
          field: "bandwidthCPU",
          label: "Achieved bandwidth (GB/s) [CPU measurement]",
        },
        stroke: byWorkgroupSize(),
        caption,
      },
      {
        x: { field: "time", label: "GPU time (ns)" },
        y: { field: "cpuns", label: "CPU time (ns)" },
        stroke: byWorkgroupSize(),
        caption,
      },
      {
        x: arraySizeAxis(),
        y: { field: "cpugpuDelta", label: "CPU - GPU time (ns)" },
        stroke: byWorkgroupSize(),
        caption,
      },
    ];
  })();
}
/**
 * Parameter sweep for the grid-stride-loop (GSL) test. Unlike the
 * one-thread-per-element test, the number of elements handled by each thread
 * is not fixed, so the workgroup count is swept independently of the array
 * size. All values are powers of two built from range() in ./util.mjs.
 *
 * Background:
 * https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
 */
const MembwGSLTestParams = (() => {
  const twoToThe = (exponent) => 2 ** exponent;
  return {
    workgroupSize: range(0, 7).map((e) => twoToThe(e)),
    memsrcSize: range(10, 25).map((e) => twoToThe(e)),
    workgroupCount: range(5, 10).map((e) => twoToThe(e)),
  };
})();
/**
 * Memory-bandwidth test built around a grid-stride loop: each invocation
 * starts at its global x index and advances by the total number of launched
 * invocations (nwg.x * workgroupSize) until it runs off the end of memSrc,
 * writing memSrc[i] + 1.0 to memDest[i] at every step.
 */
export class MembwGSLTestClass extends BaseMembwTest {
  /**
   * @param {object} params - Forwarded to the base class. After that call this
   *   constructor reads `memsrcSize` and `workgroupCount` from the instance;
   *   the kernel reads `workgroupSize` when it is generated.
   */
  constructor(params) {
    super(params);
    this.testname = "GSL fp32-per-thread";
    this.description =
      "Copies input array to output array using a grid-stride loop, so each thread may process multiple 32b elements.";
    // Generated on demand, so the shader text always reflects the instance's
    // current workgroupSize. Any argument passed by the caller is ignored.
    this.kernel = () => /* wgsl */ `
/* output */
@group(0) @binding(0) var<storage, read_write> memDest: array<f32>;
/* input */
@group(0) @binding(1) var<storage, read> memSrc: array<f32>;
@compute @workgroup_size(${this.workgroupSize}) fn memcpyKernel(
@builtin(global_invocation_id) id: vec3u,
@builtin(num_workgroups) nwg: vec3u, // == dispatch
@builtin(workgroup_id) wgid: vec3u) {
/* grid-stride loop: assume nwg.y == 1 */
for (var i = id.x;
i < arrayLength(&memSrc);
i += nwg.x * ${this.workgroupSize}) {
memDest[i] = memSrc[i] + 1.0;
}
}`;
    // Output has as many elements as the input.
    this.memdestSize = this.memsrcSize;
    // Elements read plus elements written, at 4 bytes per f32.
    this.bytesTransferred = (this.memsrcSize + this.memdestSize) * 4;
    // One-dimensional dispatch, matching the kernel's nwg.y == 1 assumption.
    this.dispatchGeometry = [this.workgroupCount];
  }

  /**
   * Plot descriptions for this test. The `bandwidth` field is not set in this
   * file; presumably the benchmark harness adds it to each result row.
   * The two plots swap which parameter is the facet (`fy`) and which is the
   * line colour (`stroke`); each caption names the stroke parameter.
   */
  static plots = [
    {
      x: { field: "memsrcSize", label: "Copied array size (B)" },
      y: { field: "bandwidth", label: "Achieved bandwidth (GB/s)" },
      fy: { field: "workgroupCount", label: "Workgroup Count" },
      stroke: { field: "workgroupSize", label: "Workgroup Size" },
      caption: "Memory bandwidth test GSL (lines are workgroup size)",
    },
    {
      x: { field: "memsrcSize", label: "Copied array size (B)" },
      y: { field: "bandwidth", label: "Achieved bandwidth (GB/s)" },
      fy: { field: "workgroupSize", label: "Workgroup Size" },
      stroke: { field: "workgroupCount", label: "Workgroup Count" },
      caption:
        "Memory bandwidth test GSL (lines are workgroup count). Looks like max throughput doesn't occur until ~512 threads/workgroup.",
    },
  ];
}
/**
 * Pseudo-test that only contributes a plot comparing the membw tests against
 * each other; it defines no kernel of its own.
 */
class MembwAdditionalPlots extends BaseMembwTest {
  testname = "additional-plots";

  static plots = [
    {
      /**
       * Row selector for the comparison plot: keeps every "membw" row, except
       * that rows of the GSL test are kept only when workgroupCount is 128.
       * Loose comparisons are intentional so that string-typed row values
       * still match.
       */
      filter: function (row) {
        if (row.category != "membw" /* this.category */) {
          return false;
        }
        if (row.testname != "GSL fp32-per-thread") {
          return true;
        }
        return row.workgroupCount == 128;
      },
      x: { field: "memsrcSize", label: "Copied array size (B)" },
      y: { field: "bandwidth", label: "Achieved bandwidth (GB/s)" },
      fy: { field: "workgroupSize", label: "Workgroup Size" },
      stroke: { field: "testname" },
      caption:
        "Memory bandwidth test (lines are test name, workgroupCount GSL == 128). Results should indicate that for ~large workgroup sizes, a GSL is at least as good as one thread per item.",
    },
  ];
}
/** Pairs a test class with the parameter sweep it is exported with. */
const membwSuite = (testClass, params) => ({ class: testClass, params });

/** One-thread-per-element copy test and its parameter sweep. */
export const MembwSimpleTestSuite = membwSuite(
  MembwSimpleTestClass,
  MembwSimpleTestParams
);

/** Grid-stride-loop copy test and its parameter sweep. */
export const MembwGSLTestSuite = membwSuite(
  MembwGSLTestClass,
  MembwGSLTestParams
);

/** Plot-only suite: it has no parameters to sweep. */
export const MembwAdditionalPlotsSuite = membwSuite(MembwAdditionalPlots, {});