diff --git a/66_PropertyPools/CMakeLists.txt b/66_PropertyPools/CMakeLists.txt new file mode 100644 index 000000000..bc1624875 --- /dev/null +++ b/66_PropertyPools/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/66_PropertyPools/app_resources/common.hlsl b/66_PropertyPools/app_resources/common.hlsl new file mode 100644 index 000000000..456dc6740 --- /dev/null +++ b/66_PropertyPools/app_resources/common.hlsl @@ -0,0 +1,20 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +// Unfortunately not every piece of C++14 metaprogramming syntax is available in HLSL 202x, see: +// https://github.com/microsoft/DirectXShaderCompiler/issues/5751#issuecomment-1800847954 +typedef nbl::hlsl::float32_t3 input_t; +typedef nbl::hlsl::float32_t output_t; + +NBL_CONSTEXPR_STATIC_INLINE 
uint32_t MaxPossibleElementCount = 1 << 20; + +struct PushConstantData +{ + uint64_t inputAddress; + uint64_t outputAddress; + uint32_t dataElementCount; +}; + +NBL_CONSTEXPR uint32_t WorkgroupSize = 256; + +// Yes, we do have our own re-creation of C++'s STL in HLSL2021! (shader.comp.hlsl relies on nbl::hlsl::numeric_limits from this header) +#include "nbl/builtin/hlsl/limits.hlsl" \ No newline at end of file diff --git a/66_PropertyPools/app_resources/shader.comp.hlsl b/66_PropertyPools/app_resources/shader.comp.hlsl new file mode 100644 index 000000000..4aeef0e0f --- /dev/null +++ b/66_PropertyPools/app_resources/shader.comp.hlsl @@ -0,0 +1,33 @@ +#include "common.hlsl" + +// just a small test that this header can be included +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +[[vk::push_constant]] PushConstantData pushConstants; + +// does absolutely nothing, a later example will show how it gets used +template +void dummyTraitTest() {} + +[numthreads(WorkgroupSize,1,1)] +void main(uint32_t3 ID : SV_DispatchThreadID) +{ + dummyTraitTest(); + if (ID.x>=pushConstants.dataElementCount) + return; + + const input_t self = vk::RawBufferLoad(pushConstants.inputAddress+sizeof(input_t)*ID.x); + + nbl::hlsl::Xoroshiro64StarStar rng = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(pushConstants.dataElementCount,ID.x)^0xdeadbeefu); + + float32_t acc = nbl::hlsl::numeric_limits::max; + const static uint32_t OthersToTest = 15; + [[unroll(OthersToTest)]] + for (uint32_t i=0; i(pushConstants.inputAddress+sizeof(input_t)*offset); + acc = min(length(other-self),acc); + } + vk::RawBufferStore(pushConstants.outputAddress+sizeof(float32_t)*ID.x,acc); +} \ No newline at end of file diff --git a/66_PropertyPools/config.json.template b/66_PropertyPools/config.json.template new file mode 100644 index 000000000..717d05d53 --- /dev/null +++ b/66_PropertyPools/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], 
+ "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/66_PropertyPools/main.cpp b/66_PropertyPools/main.cpp new file mode 100644 index 000000000..2e28ca527 --- /dev/null +++ b/66_PropertyPools/main.cpp @@ -0,0 +1,205 @@ +// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + + +#include "nbl/video/surface/CSurfaceVulkan.h" +#include "nbl/video/alloc/SubAllocatedDescriptorSet.h" + +#include "../common/BasicMultiQueueApplication.hpp" +#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" + +using namespace nbl; +using namespace core; +using namespace system; +using namespace ui; +using namespace asset; +using namespace video; + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" + +// In this application we exercise CPropertyPoolHandler::transferProperties: data is uploaded to a source buffer, copied to a destination buffer and verified on the CPU +class PropertyPoolsApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::MonoDeviceApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + + smart_refctd_ptr m_propertyPoolHandler; + smart_refctd_ptr m_scratchBuffer; + smart_refctd_ptr m_addressBuffer; + smart_refctd_ptr m_transferSrcBuffer; + smart_refctd_ptr m_transferDstBuffer; + std::vector m_data; + + // The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished. 
+ // It's a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools. + smart_refctd_ptr m_poolCache; + + // This example really lets the advantages of a timeline semaphore shine through! + smart_refctd_ptr m_timeline; + uint64_t m_iteration = 0; + constexpr static inline uint64_t MaxIterations = 200; + + static constexpr uint64_t TransfersAmount = 1024; + static constexpr uint64_t MaxValuesPerTransfer = 512; + + + public: + // Yay thanks to multiple inheritance we cannot forward ctors anymore + PropertyPoolsApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {} + + // we stuff all our work here because it's a "single shot" app + bool onAppInitialized(smart_refctd_ptr&& system) override + { + using nbl::video::IGPUDescriptorSetLayout; + + // Remember to call the base class initialization! The first base gets a copy of `system` so the pointer is still valid when it is moved into the second base. + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); + + auto createBuffer = [&](uint64_t size, core::bitflag flags, const char* name, bool hostVisible) + { + video::IGPUBuffer::SCreationParams creationParams; + creationParams.size = ((size + 3) / 4) * 4; // Align the size up to a multiple of 4 bytes + creationParams.usage = flags + | asset::IBuffer::EUF_STORAGE_BUFFER_BIT + | asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT + | asset::IBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF; + + auto buffer = m_device->createBuffer(std::move(creationParams)); + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buffer->getMemoryReqs(); + if (hostVisible) + reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + m_device->allocate(reqs, buffer.get(), 
nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + buffer->setObjectDebugName(name); + + return buffer; + }; + + m_scratchBuffer = createBuffer(sizeof(nbl::hlsl::property_pools::TransferRequest) * TransfersAmount, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_scratchBuffer", true); + m_addressBuffer = createBuffer(sizeof(uint32_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_addressBuffer", false); + m_transferSrcBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_TRANSFER_DST_BIT), "m_transferSrcBuffer", false); + m_transferDstBuffer = createBuffer(sizeof(uint16_t) * TransfersAmount * MaxValuesPerTransfer, core::bitflag(asset::IBuffer::EUF_NONE), "m_transferDstBuffer", true); + + for (uint16_t i = 0; i < uint16_t((uint32_t(1) << 16) - 1); i++) + m_data.push_back(i); // m_data ends up holding the 65535 values 0..65534 + + // We'll allow subsequent iterations to overlap each other on the GPU, the only limiting factors are + // the amount of memory in the streaming buffers and the number of commandpools we can use simultaneously. 
+ constexpr auto MaxConcurrency = 64; + + // Since this time we don't throw the Command Pools away and we'll reset them instead, we don't create the pools with the transient flag + m_poolCache = ICommandPoolCache::create(core::smart_refctd_ptr(m_device),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::NONE,MaxConcurrency); + + // In contrast to fences, we just need one semaphore to rule all dispatches + m_timeline = m_device->createSemaphore(m_iteration); + + return true; + } + + // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script) + bool keepRunning() override { return m_iterationacquirePool(); + } while (poolIx==ICommandPoolCache::invalid_index); + + smart_refctd_ptr cmdbuf; + { + m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger)); + // let's record, it's still a one time submit because we re-record the command buffer each time + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + // COMMAND RECORDING: only whole dwords of m_data are uploaded (size rounded down, which matches transferRequest.elementCount below) and the last chunk is clamped, so updateBuffer never reads past the end of m_data + uint32_t dataSize = ((sizeof(uint16_t) * m_data.size()) / 4) * 4; + uint32_t maxUpload = 65536; + for (uint32_t offset = 0; offset < dataSize; offset += maxUpload) + { + cmdbuf->updateBuffer({ offset, (dataSize - offset) < maxUpload ? (dataSize - offset) : maxUpload, core::smart_refctd_ptr(m_transferSrcBuffer) }, &m_data[offset / sizeof(uint16_t)]); + } + CPropertyPoolHandler::TransferRequest transferRequest; + transferRequest.memblock = asset::SBufferRange { 0, sizeof(uint16_t) * m_data.size(), core::smart_refctd_ptr(m_transferSrcBuffer) }; + transferRequest.elementSize = 1; + transferRequest.elementCount = (m_data.size() * sizeof(uint16_t)) / sizeof(uint32_t); + transferRequest.buffer = asset::SBufferBinding { 0, core::smart_refctd_ptr(m_transferDstBuffer) }; + transferRequest.srcAddressesOffset = IPropertyPool::invalid; + transferRequest.dstAddressesOffset = IPropertyPool::invalid; 
+ + m_propertyPoolHandler->transferProperties(cmdbuf.get(), + asset::SBufferBinding{0, core::smart_refctd_ptr(m_scratchBuffer)}, + asset::SBufferBinding{0, core::smart_refctd_ptr(m_addressBuffer)}, + &transferRequest, &transferRequest + 1, // [begin,end) pointer range holding exactly one request + m_logger.get(), 0, m_data.size() + ); + + auto result = cmdbuf->end(); + assert(result); + } + + + const auto savedIterNum = m_iteration++; // after this, m_iteration is the value the submit below signals on the timeline + { + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = + { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = + { + .semaphore = m_timeline.get(), + .value = m_iteration, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + // Generally speaking we don't need to wait on any semaphore because in this example every dispatch gets its own clean piece of memory to use + // from the point of view of the GPU. Implicit domain operations between Host and Device happen upon a submit and a semaphore/fence signal operation, + // this ensures we can touch the input and get accurate values from the output memory using the CPU before and after respectively, each submit becoming PENDING. 
+ // If we actually cared about this submit seeing the memory accesses of a previous dispatch we could add a semaphore wait + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {}, + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signalInfo,1} + }; + + queue->startCapture(); + auto statusCode = queue->submit({ &submitInfo,1 }); + queue->endCapture(); + assert(statusCode == IQueue::RESULT::SUCCESS); + } + + { + ISemaphore::SWaitInfo infos[1] = {{.semaphore=m_timeline.get(),.value=m_iteration}}; + m_device->blockForSemaphores(infos); + + // Readback + // (we'll read back the destination buffer and check that copy went through as expected) + auto mem = m_transferDstBuffer->getBoundMemory(); // memory backing the buffer the transfer wrote into + void* ptr = mem.memory->map({ mem.offset, mem.memory->getAllocationSize() }); + + for (uint32_t i = 0; i < 1024; /*m_data.size();*/ i++) + { + uint16_t actual = reinterpret_cast(ptr)[i]; + uint16_t expected = m_data[i]; + std::printf("%i, ", actual); + assert(expected == actual); + } + std::printf("\n"); + bool success = mem.memory->unmap(); + assert(success); + } + } +}; + +NBL_MAIN_FUNC(PropertyPoolsApp) \ No newline at end of file diff --git a/66_PropertyPools/pipeline.groovy b/66_PropertyPools/pipeline.groovy new file mode 100644 index 000000000..1a7b043a4 --- /dev/null +++ b/66_PropertyPools/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CPropertyPoolsBuilder extends IBuilder +{ + public CPropertyPoolsBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def 
nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CPropertyPoolsBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a20a33a9..09a73bfe0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES) #add_subdirectory(61_UI EXCLUDE_FROM_ALL) add_subdirectory(62_CAD EXCLUDE_FROM_ALL) add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL) + add_subdirectory(66_PropertyPools EXCLUDE_FROM_ALL) add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42 endif() \ No newline at end of file