-
Notifications
You must be signed in to change notification settings - Fork 0
/
utpx.cpp
435 lines (400 loc) · 19.9 KB
/
utpx.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
#include <algorithm>
#include <cstring>
#include <thread>
#include "intercept_kernel.h"
#include "intercept_memory.h"
#include "utpx.h"
// #define LOG
namespace utpx {
// #ifdef LOG
//
// void vlog(const char *fmt, va_list args1) {
// std::va_list args2;
// va_copy(args2, args1);
// std::vector<char> buf(1 + std::vsnprintf(nullptr, 0, fmt, args1));
// va_end(args1);
// std::vsnprintf(buf.data(), buf.size(), fmt, args2);
// va_end(args2);
// auto epochMs = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
// std::fprintf(stderr, "[UTPX][+%ld,t=%d] %s\n", epochMs.count(), gettid(), buf.data());
// }
// #else
// void vlog(const char *fmt, va_list args1) {}
// #endif
// void log(const char *fmt, ...) {
//
// std::va_list args;
// va_start(args, fmt);
// vlog(fmt, args);
// va_end(args);
// }
//
// void fatal(const char *fmt, ...) {
// std::va_list args;
// va_start(args, fmt);
// vlog(fmt, args);
// va_end(args);
// std::abort();
// }
enum class Mode : uint32_t { Advise, Device, Mirror };
std::atomic<Mode> mode = Mode::Mirror;
static _hipMalloc originalHipMalloc;
static _hipMemcpy originalHipMemcpy;
static _hipGetDevice originalHipGetDevice;
static _hipMemAdvise originalHipMemAdvise;
static _hipMemPrefetchAsync originalHipMemPrefetchAsync;
struct MirroredAllocation {
void *devicePtr;
size_t size;
void create() {
log("[MEM] Creating mirrored allocation of of %ld bytes on device", size);
if (auto result = originalHipMalloc(&devicePtr, size); result != hipSuccess) {
fatal("\t\tUnable to create mirrored allocation: hipMalloc(%p, %ld) failed with %d", //
&devicePtr, size, result);
}
if (!devicePtr) fatal("\t\tUnable to create mirrored allocation: hipMalloc produced NULL");
}
void mirror(void *hostPtr) {
if (auto result = originalHipMemcpy(devicePtr, hostPtr, size, hipMemcpyHostToDevice); result != hipSuccess) {
fatal("\t\tUnable to copy to mirrored allocation: hipMemcpy(%p <- %p, %ld) failed with %d", //
devicePtr, hostPtr, size, result);
}
}
};
static std::shared_mutex allocationsLock{};
static std::unordered_map<uintptr_t, MirroredAllocation> allocations;
// see rocvirtual.cpp VirtualGPU::submitKernelInternal
// VirtualGPU::processMemObjects
static auto findHostAllocations(uintptr_t maybePointer) {
return std::find_if(allocations.begin(), allocations.end(),
[&](auto &kv) { return maybePointer >= kv.first && maybePointer < kv.first + kv.second.size; });
}
static void *findHostAllocationsAndCreateMirrored(uintptr_t maybePointer, int device, hipStream_t stream) {
if (mode == Mode::Device) return nullptr;
for (auto &[hostPtr, alloc] : allocations) {
if (maybePointer >= hostPtr && maybePointer < hostPtr + alloc.size) {
size_t offset = maybePointer - hostPtr;
log("\t\tLocated host ptr: %p (offset=%ld) from (0x%lx+%ld)", reinterpret_cast<void *>(maybePointer), offset, hostPtr, alloc.size);
switch (mode) {
case Mode::Device: break;
case Mode::Advise:
if (auto result = originalHipMemPrefetchAsync(reinterpret_cast<void *>(maybePointer), alloc.size, device, stream);
result != hipSuccess)
log("WARN: hipMemPrefetchAsync failed with %d", result);
return nullptr;
case Mode::Mirror:
if (alloc.devicePtr) {
log("\t\t-> Existing mirrored allocation exists: %p", alloc.devicePtr);
} else {
log("\t\t-> No mirrored allocation, creating...");
kernel::suspendInterception(); // hipMemcpy may launch more kernels, so we suspend interception for now
alloc.create();
alloc.mirror(reinterpret_cast<void *>(hostPtr));
fault::registerPage(reinterpret_cast<void *>(hostPtr), alloc.size);
kernel::resumeInterception();
}
return &alloc.devicePtr;
}
}
}
return nullptr;
}
void kernel::interceptKernelLaunch(const void *fn, const HSACOKernelMeta &meta, void **args, dim3, dim3, hipStream_t stream) {
log("\tAttempting to replace host allocations for %p, argCount=%ld, argSize=%ld", fn, meta.args.size(), meta.kernargSize);
std::unique_lock<std::shared_mutex> write(allocationsLock);
log("\tCurrent host allocations (%zu): ", allocations.size());
{
int p = 0;
size_t totalHost = 0;
size_t totalDevice = 0;
for (const auto &[hostPtr, alloc] : allocations) {
log("\t\t[%3d] host=(0x%lx+%ld) => device=%p", p, hostPtr, alloc.size, alloc.devicePtr);
p++;
totalHost += alloc.size;
totalDevice += alloc.devicePtr ? alloc.size : 0;
}
log("\tTotal host = %ld MB, device = %ld MB", totalHost / 1024 / 1024, totalDevice / 1024 / 1024);
}
int device = -1;
if (originalHipGetDevice(&device) != hipSuccess) fatal("Cannot resolve device for allocation");
for (size_t i = 0; i < meta.args.size(); i++) {
const HSACOKernelMeta::Arg &arg = meta.args[i];
if (arg.kind == HSACOKernelMeta::Arg::Kind::Hidden) continue;
if (arg.kind == HSACOKernelMeta::Arg::Kind::Unknown) {
fatal("\tUnknown arg! [%ld] (%ld + %ld) ptr=%p", i, arg.offset, arg.size, args[i]);
}
log("\tChecking argument [%ld] (%ld + %ld) ptr=%p", i, arg.offset, arg.size, args[i]);
if (arg.size < sizeof(void *)) continue;
if (arg.size == sizeof(void *)) { // same size as a pointer, check if it is one
auto target = reinterpret_cast<void **>(args[i]); // we're looking for a void* in our arg list
if (!target) return;
auto deref = reinterpret_cast<uintptr_t>(*target);
if (auto that = findHostAllocationsAndCreateMirrored(deref, device, stream); that) {
log("\t\t-> Rewritten pointer argument with mirrored: old=%p, new=%p", args[i], that);
args[i] = that;
}
} else { // type larger than a pointer, it may be a struct containing pointers
auto minIncrement = meta.packed(i) ? 1 : 2; // check every byte if packed, two byte alignment otherwise for (TODO maybe 8 byte align?)
char *argData = reinterpret_cast<char *>(args[i]);
if (!argData) continue;
for (size_t byteOffset = 0; byteOffset < arg.size; byteOffset += minIncrement) {
auto maybePointer = *reinterpret_cast<uintptr_t *>(argData + byteOffset);
if (auto that = findHostAllocationsAndCreateMirrored(maybePointer, device, stream); that) {
log("\t\t-> Rewritten pointer argument at struct offset %ld with mirrored: old=%p, new=%p", byteOffset, argData + byteOffset,
that);
std::memcpy(argData + byteOffset, that, sizeof(void *));
}
}
}
}
log("\t----");
}
void fault::handleUserspaceFault(void *faultAddr, void *allocAddr, size_t allocLength) {
std::shared_lock<std::shared_mutex> read(allocationsLock);
if (auto it = allocations.find(reinterpret_cast<uintptr_t>(allocAddr)); it != allocations.end()) {
log("[KERNEL] \t\tfound device ptr in fault handler host=%p, device=%p+%ld, fault is %p (offset=%lu)", //
allocAddr, it->second.devicePtr, it->second.size, faultAddr,
reinterpret_cast<uintptr_t>(faultAddr) - reinterpret_cast<uintptr_t>(allocAddr));
if (auto result = originalHipMemcpy(allocAddr, it->second.devicePtr, allocLength, hipMemcpyDeviceToHost); result != hipSuccess) {
log("[KERNEL] hipMemcpy writeback failed: %d", result);
}
fault::unregisterPage(allocAddr);
} else {
log("[KERNEL] \t\t!found device ptr in fault handler %p+%ld", allocAddr, allocLength);
}
//
// fault::accessRegisteredPages([&](const auto ®isteredPages) {
// log("[KERNEL] \tCurrent allocations: %d", registeredPages.size());
// int p = 0;
// for (const auto &[ptr, size] : registeredPages) {
// log("[KERNEL] \t\t[%d] %p + %ld => %x", p, ptr, size, reinterpret_cast<uintptr_t>(ptr));
// p++;
// }
// });
}
extern "C" [[maybe_unused]] void __attribute__((constructor)) preload_main() {
fault::initialiseUserspacePagefaultHandling();
originalHipMemPrefetchAsync = dlSymbol<_hipMemPrefetchAsync>("hipMemPrefetchAsync", HipLibrarySO);
originalHipGetDevice = dlSymbol<_hipGetDevice>("hipGetDevice", HipLibrarySO);
originalHipMemAdvise = dlSymbol<_hipMemAdvise>("hipMemAdvise", HipLibrarySO);
originalHipMalloc = dlSymbol<_hipMalloc>("hipMalloc", HipLibrarySO);
originalHipMemcpy = dlSymbol<_hipMemcpy>("hipMemcpy", HipLibrarySO);
static const char *UTPX_MODE = "UTPX_MODE";
if (auto modePtr = std::getenv(UTPX_MODE); modePtr) {
std::string rawMode(modePtr);
if (rawMode == "DEVICE") mode = Mode::Device;
else if (rawMode == "MIRROR")
mode = Mode::Mirror;
else if (rawMode == "ADVISE")
mode = Mode::Advise;
else
fatal("Unknown %s mode, terminating...", UTPX_MODE);
}
switch (mode) {
case Mode::Advise: log("Using Advise mode"); break;
case Mode::Device: log("Using Device mode"); break;
case Mode::Mirror: log("Using Mirror mode"); break;
}
}
extern "C" [[maybe_unused]] void __attribute__((destructor)) preload_exit() { fault::terminateUserspacePagefaultHandling(); }
extern "C" [[maybe_unused]] hipError_t hipMallocManaged(void **ptr, size_t size, unsigned int flags) {
auto original = dlSymbol<_hipMallocManaged>("hipMallocManaged", HipLibrarySO);
auto emplaceAlloc = [&](hipError_t result) {
if (result == hipSuccess) {
std::unique_lock<std::shared_mutex> write(allocationsLock);
allocations.emplace(reinterpret_cast<uintptr_t>(*ptr), MirroredAllocation{.devicePtr = nullptr, .size = size});
}
return result;
};
switch (mode) {
case Mode::Advise: {
auto result = emplaceAlloc(original(ptr, size, flags));
if (result == hipSuccess) {
int device = -1;
if (originalHipGetDevice(&device) != hipSuccess) fatal("Cannot resolve device for allocation");
if (originalHipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device) != hipSuccess) {
log("WARN: cannot set memAdvise flag: hipMemAdviseSetCoarseGrain");
}
if (originalHipMemAdvise(*ptr, size, hipMemAdviseSetPreferredLocation, device) != hipSuccess) {
log("WARN: cannot set memAdvise flag: hipMemAdviseSetPreferredLocation");
}
if (originalHipMemAdvise(*ptr, size, hipMemAdviseSetAccessedBy, device) != hipSuccess) {
log("WARN: cannot set memAdvise flag: hipMemAdviseSetAccessedBy");
}
}
return result;
}
case Mode::Device: return emplaceAlloc(originalHipMalloc(ptr, size));
case Mode::Mirror: {
if (size < fault::hostPageSize()) {
log("[MEM] Allocation (%zu) less than page size (%zu), skipping", size, fault::hostPageSize());
return original(ptr, size, flags);
}
// auto r = original(ptr, size + 4096 , flags);
*ptr = aligned_alloc(fault::hostPageSize(),
size + fault::hostPageSize()); // XXX burn extra page worth of memory so that we don't lock the wrong thing
if (!*ptr) return hipErrorOutOfMemory;
log("[MEM] Intercepting hipMallocManaged(%p, %ld, %x)", (void *)ptr, size, flags);
log("[MEM] -> %p ", *ptr);
return emplaceAlloc(hipSuccess);
}
}
return hipErrorInvalidValue;
}
// For roc-stdpar, deallocation calls __hipstdpar_hidden_free if the pointer, as queried with hipPointerGetAttributes, is not managed.
// This is a problem because our interposed hipMallocManaged returns a non-managed pointer, so roc-stdpar attempts to do a normal free on a
// pointer from hipMalloc. We intercept all hipPointerGetAttributes calls to work around this.
// thread_local bool __hipstdpar_dealloc_active = false;
extern "C" [[maybe_unused]] hipError_t hipMemcpy(void *dst, const void *src, size_t size, hipMemcpyKind kind) {
auto original = dlSymbol<_hipMemcpy>("hipMemcpy", HipLibrarySO);
switch (mode) {
case Mode::Advise: return original(dst, src, size, kind);
case Mode::Device: return original(dst, src, size, hipMemcpyDefault);
case Mode::Mirror:
switch (kind) {
case hipMemcpyHostToHost: // fallthrough
case hipMemcpyDeviceToDevice: return original(dst, src, size, kind);
case hipMemcpyDefault: // fallthrough
case hipMemcpyHostToDevice: // fallthrough
case hipMemcpyDeviceToHost: {
const auto kindName = [](hipMemcpyKind kind) {
switch (kind) {
case hipMemcpyHostToHost: return "MemcpyHostToHost";
case hipMemcpyDeviceToDevice: return "MemcpyDeviceToDevice";
case hipMemcpyDefault: return "MemcpyDefault";
case hipMemcpyHostToDevice: return "MemcpyHostToDevice";
case hipMemcpyDeviceToHost: return "MemcpyDeviceToHost";
default: return "Unknown";
}
};
auto srcIt = allocations.find(reinterpret_cast<uintptr_t>(src));
auto dstIt = allocations.find(reinterpret_cast<uintptr_t>(dst));
if (srcIt != allocations.end() && dstIt != allocations.end()) {
log("Intercepting hipMemcpy(%p, %p, %zu, %s) , dst=[host=%p;device=%p], src=[host=%p;device=%p]", //
dst, src, size, kindName(kind), //
reinterpret_cast<void *>(dstIt->first), reinterpret_cast<void *>(dstIt->second.devicePtr),
reinterpret_cast<void *>(srcIt->first), reinterpret_cast<void *>(srcIt->second.devicePtr));
auto result = original(dstIt->second.devicePtr, srcIt->second.devicePtr, size, kind);
fault::registerPage(reinterpret_cast<void *>(dstIt->first), dstIt->second.size);
return result;
} else if (srcIt != allocations.end()) { // the source ptr is mirrored, and dest is not:
log("Intercepting hipMemcpy(%p, %p, %zu, %s) , dst=%p, src=[host=%p;device=%p]", //
dst, src, size, kindName(kind), dst, reinterpret_cast<void *>(srcIt->first),
reinterpret_cast<void *>(srcIt->second.devicePtr));
// just copy to the dest (host/device) ptr, we use the device pointer as the source as it's always up-to-date
return original(dst, srcIt->second.devicePtr, size, kind);
} else if (dstIt != allocations.end()) { // dest ptr is mirrored, and the source is not:
log("Intercepting hipMemcpy(%p, %p, %zu, %s) , dst=[host=%p;device=%p], src=%p", //
dst, src, size, kindName(kind), reinterpret_cast<void *>(dstIt->first), reinterpret_cast<void *>(dstIt->second.devicePtr),
src);
// just copy to the device ptr and register the host page if not already registered, synchronisation happens on next page fault
if (!dstIt->second.devicePtr) dstIt->second.create();
auto result = original(dstIt->second.devicePtr, src, size, kind);
fault::registerPage(reinterpret_cast<void *>(dstIt->first), dstIt->second.size);
return result;
} else {
return original(dst, src, size, kind);
}
}
default: return original(dst, src, size, kind);
}
}
return hipErrorInvalidValue;
}
extern "C" [[maybe_unused]] hipError_t hipMemset(void *ptr, int value, size_t size) {
auto original = dlSymbol<_hipMemset>("hipMemset", HipLibrarySO);
switch (mode) {
case Mode::Advise: // fallthrough
case Mode::Device: return original(ptr, value, size);
case Mode::Mirror:
// XXX ptr may be an offset from base we need to do a ranged search
std::shared_lock<std::shared_mutex> read(allocationsLock);
if (auto it = findHostAllocations(reinterpret_cast<uintptr_t>(ptr)); it != allocations.end()) {
log("Intercepting hipMemset(%p, %d, %ld), existing host allocation found", ptr, value, size);
size_t offsetFromBase = reinterpret_cast<uintptr_t>(ptr) - it->first;
if (offsetFromBase != 0) fatal("IMPL: hipMemset with offset\n");
std::memset(ptr, value, size); // memset the host using the already offset ptr from the arg
if (!it->second.devicePtr) it->second.create(); // XXX devicePtr is nullptr if memset is called before any dependent kernel
if (auto result = original(it->second.devicePtr, value, size); result != hipSuccess) {
fatal("hipMemset(%p, %d, %ld) failed to memset mirrored allocation: %d", it->second.devicePtr, value, size, result);
}
return hipSuccess;
} else {
return original(ptr, value, size);
}
}
return hipErrorInvalidValue;
}
extern "C" [[maybe_unused]] hipError_t hipFree(void *ptr) {
auto original = dlSymbol<_hipFree>("hipFree", HipLibrarySO);
switch (mode) {
case Mode::Advise: // fallthrough
case Mode::Device: return original(ptr);
case Mode::Mirror:
if (!ptr)
return original(nullptr); // XXX still delegate to HIP because hipFree(nullptr) can be used as an implicit hipDeviceSynchronize or
// initialisation of the HIP runtime
std::unique_lock<std::shared_mutex> write(allocationsLock);
if (auto it = allocations.find(reinterpret_cast<uintptr_t>(ptr)); it != allocations.end()) {
log("Intercepting hipFree(%p), existing host allocation found", ptr);
if (auto page = fault::lookupRegisteredPage(ptr); page) {
fault::unregisterPage(page->first);
}
free(ptr);
if (auto result = original(it->second.devicePtr); result != hipSuccess) {
fatal("hipFree(%p) failed to release mirrored allocation: %d", it->second.devicePtr, result);
}
allocations.erase(it);
return hipSuccess;
} else {
return original(ptr);
}
}
return hipErrorInvalidValue;
}
extern "C" [[maybe_unused]] hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void *ptr) {
auto original = dlSymbol<_hipPointerGetAttributes>("hipPointerGetAttributes", HipLibrarySO);
switch (mode) {
case Mode::Advise: return original(attributes, ptr);
case Mode::Device: // fallthrough
case Mode::Mirror:
log("Replace hipPointerGetAttributes(%p, %p), isManaged=%d", attributes, ptr, attributes->isManaged);
auto result = original(attributes, ptr);
std::shared_lock<std::shared_mutex> read(allocationsLock);
if (auto it = findHostAllocations(reinterpret_cast<uintptr_t>(ptr)); it != allocations.end()) {
log(" -> Replace hipPointerGetAttributes(%p, %p), isManaged=%d", attributes, ptr, attributes->isManaged);
}
attributes->isManaged = true; // FIXME we should only do this if allocation is found really, but it crashes a few apps early
return result;
}
return hipErrorInvalidValue;
}
// The following are marked inline in hipstdpar_lib.hpp, so we just always shim hipPointerGetAttributes, seems to work OK for now.
// extern "C" void *__hipstdpar_realloc(void *p, std::size_t n)
// {
// __hipstdpar_dealloc_active = true;
// std::fprintf(stderr, "[malloc_switch] Intercepted __hipstdpar_realloc(%p, %ld)", p, n);
// auto result = dlSymbol<___hipstdpar_realloc>("hipstdpar_realloc")(p, n);
// __hipstdpar_dealloc_active = false;
// return result;
// }
// extern "C" void __hipstdpar_free(void *p)
// {
// __hipstdpar_dealloc_active = true;
// std::fprintf(stderr, "[malloc_switch] Intercepted __hipstdpar_free(%p)", p);
// dlSymbol<___hipstdpar_free>("hipstdpar_free")(p);
// __hipstdpar_dealloc_active = false;
// }
// extern "C" void __hipstdpar_operator_delete_aligned_sized(void *p, std::size_t n, std::size_t a) noexcept
// {
// __hipstdpar_dealloc_active = true;
// std::fprintf(stderr, "[malloc_switch] Intercepted __hipstdpar_operator_delete_aligned_sized(%p, %ld, %ld)", p, n, a);
// dlSymbol<___hipstdpar_operator_delete_aligned_sized>("__hipstdpar_operator_delete_aligned_sized")(p, n, a);
// __hipstdpar_dealloc_active = false;
// }
// namespace utpx
// extern "C" hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void *ptr) {
//
//
// }
} // namespace utpx