From 3abea9dfab5ae515f76a429945ed94196285fc08 Mon Sep 17 00:00:00 2001
From: qiaojbao
Date: Tue, 4 Jun 2024 11:17:08 +0800
Subject: [PATCH] Update xgl from commit aa6b5d35

Update Khronos Vulkan Headers to 1.3.285

[Proton][DX12] Fix Halo Infinite crash after the loading screen
Add enableMergedEncode settings for RT
Add new tuning profiles for Enshrouded.
Add option to disable ReZ for depth-only pipelines
Add RP image transitions
Clean up chrono
Cleanup Linux compiler warnings
Fix bvh_batch_layer warnings
Fix CTS failure in dEQP-VK.binding_model.inline_uniform_blocks.copy_*
Fix minor BVH batching issues
Fix mutable descriptor memory issue
Fix some bugs in recent swapchain refactor
Fix steam screenshot washed out colors with X4 Foundations
Implement BatchBvhBuilds
Implement VK_NV_device_generated_commands
Increment RGP Instrumentation API version to 4
Remove a duplicated gpurt setting
Replace IFH raytracing with RT Toss Points
Set AppendExeNameToPipelineDump to TRUE by default
Set GpurtOptions to compiler
Support vertex offset mode
Update PAL Version in XGL 878
Use maxPrimCount during indirect TLAS build
VK_NV_Device_Generated_Commands: optimize preprocess buffer
---
 cmake/XglCompileDefinitions.cmake | 6 +
 cmake/XglVersions.cmake | 2 +-
 icd/CMakeLists.txt | 1 +
 icd/Loader/LunarG/Lnx/amd-icd.json | 4 +-
 icd/api/app_profile.cpp | 121 ++-
 icd/api/app_shader_optimizer.cpp | 10 +
 icd/api/appopt/bvh_batch_layer.cpp | 818 ++++++++++++++++++
 icd/api/appopt/bvh_batch_layer.h | 160 ++++
 .../generic/StrangeBrigade/profile.json | 3 +
 .../gfxIp11_0/Navi31/Enshrouded/profile.json | 56 ++
 .../gfxIp11_0/Navi32/Enshrouded/profile.json | 42 +
 .../generic/StrangeBrigade/profile.json | 3 +
 icd/api/compiler_solution_llpc.cpp | 9 +-
 icd/api/debug_printf.cpp | 10 +-
 icd/api/include/app_profile.h | 8 +-
 icd/api/include/app_shader_optimizer.h | 1 +
 icd/api/include/compiler_solution.h | 2 +-
 .../khronos/sdk-1.3/vulkan/vulkan_core.h | 55 +-
 .../khronos/sdk-1.3/vulkan/vulkan_metal.h | 12 +-
 icd/api/include/khronos/vulkan.h | 3 +
 icd/api/include/vk_cmdbuffer.h | 14 +-
 icd/api/include/vk_conv.h | 12 +-
 icd/api/include/vk_device.h | 5 +-
 icd/api/include/vk_extensions.h | 2 +
 icd/api/include/vk_image.h | 6 +-
 icd/api/include/vk_indirect_commands_layout.h | 20 +-
 icd/api/include/vk_physical_device.h | 10 +-
 icd/api/include/vk_queue.h | 3 +-
 icd/api/include/vk_swapchain.h | 10 +-
 icd/api/pipeline_binary_cache.cpp | 15 +-
 icd/api/pipeline_compiler.cpp | 49 +-
 icd/api/raytrace/ray_tracing_device.cpp | 72 +-
 icd/api/raytrace/ray_tracing_device.h | 14 +
 .../raytrace/vk_acceleration_structure.cpp | 7 +-
 icd/api/raytrace/vk_acceleration_structure.h | 1 +
 icd/api/raytrace/vk_ray_tracing_pipeline.cpp | 3 +-
 icd/api/renderpass/renderpass_builder.cpp | 126 +--
 icd/api/sqtt/sqtt_rgp_annotations.h | 2 +-
 icd/api/strings/extensions.txt | 2 +
 icd/api/vk_buffer_view.cpp | 6 +-
 icd/api/vk_cmdbuffer.cpp | 455 +++++++---
 icd/api/vk_cmdbuffer_transfer.cpp | 19 +-
 icd/api/vk_compute_pipeline.cpp | 7 +-
 icd/api/vk_conv.cpp | 4 +
 icd/api/vk_descriptor_buffer.cpp | 2 +-
 icd/api/vk_descriptor_pool.cpp | 21 +-
 icd/api/vk_descriptor_set.cpp | 9 +-
 icd/api/vk_descriptor_set_layout.cpp | 10 +-
 icd/api/vk_descriptor_update_template.cpp | 6 +-
 icd/api/vk_device.cpp | 19 +-
 icd/api/vk_graphics_pipeline.cpp | 8 +-
 icd/api/vk_graphics_pipeline_library.cpp | 5 +-
 icd/api/vk_image.cpp | 116 +--
 icd/api/vk_indirect_commands_layout.cpp | 157 +++-
 icd/api/vk_memory.cpp | 1 +
 icd/api/vk_physical_device.cpp | 24 +-
 icd/api/vk_pipeline_layout.cpp | 12 +-
 icd/api/vk_query.cpp | 5 +-
 icd/api/vk_queue.cpp | 35 +-
 icd/api/vk_swapchain.cpp | 55 +-
 icd/api/vk_utils.cpp | 2 +-
 icd/res/ver.h | 4 +-
 icd/settings/settings.cpp | 26 +-
 icd/settings/settings_xgl.json | 183 +++-
 64 files changed, 2412 insertions(+), 478 deletions(-)
 create mode 100644 icd/api/appopt/bvh_batch_layer.cpp
 create mode 100644 icd/api/appopt/bvh_batch_layer.h
 create mode 100644 icd/api/appopt/shader_profiles/llpc/gfxIp10_3/generic/StrangeBrigade/profile.json
 create mode 100644 icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Enshrouded/profile.json
 create mode 100644 icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi32/Enshrouded/profile.json
 create mode 100644 icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/StrangeBrigade/profile.json

diff --git a/cmake/XglCompileDefinitions.cmake b/cmake/XglCompileDefinitions.cmake
index 8ddc2268..7a345008 100644
--- a/cmake/XglCompileDefinitions.cmake
+++ b/cmake/XglCompileDefinitions.cmake
@@ -119,6 +119,9 @@ macro(xgl_set_compile_definitions)
     endif()
 #endif
 
+#if VKI_RAY_TRACING
+#endif
+
     if (XGL_ENABLE_GCOV)
         target_compile_definitions(xgl PRIVATE ICD_ENABLE_GCOV)
     endif()
@@ -132,6 +135,9 @@ macro(xgl_set_compile_definitions)
 #if VKI_RAY_TRACING
 #endif
 
+#if VKI_RAY_TRACING
+#endif
+
 #if VKI_RAY_TRACING
 #endif
 
diff --git a/cmake/XglVersions.cmake b/cmake/XglVersions.cmake
index 02029eeb..4f65ae21 100644
--- a/cmake/XglVersions.cmake
+++ b/cmake/XglVersions.cmake
@@ -28,7 +28,7 @@ include_guard()
 # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface
 # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions.
 # It must be updated on each PAL promotion after handling all of the interface changes described in palLib.h.
-set(ICD_PAL_CLIENT_MAJOR_VERSION "867")
+set(ICD_PAL_CLIENT_MAJOR_VERSION "878")
 
 # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1.
 # It describes the interface version of the gpuopen shared module (part of PAL) that the ICD supports.
diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index 2da6c892..3eefab43 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -169,6 +169,7 @@ if (VKI_RAY_TRACING) api/raytrace/vk_ray_tracing_pipeline.cpp api/raytrace/ray_tracing_device.cpp api/vk_deferred_operation.cpp + api/appopt/bvh_batch_layer.cpp ) endif() #endif diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json index c6aed269..f7817f69 100644 --- a/icd/Loader/LunarG/Lnx/amd-icd.json +++ b/icd/Loader/LunarG/Lnx/amd-icd.json @@ -2,13 +2,13 @@ "file_format_version": "1.0.0", "ICD": { "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.280" + "api_version": "1.3.285" }, "layer": { "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@", "type": "GLOBAL", "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.280", + "api_version": "1.3.285", "implementation_version": "1", "description": "AMD switchable graphics layer", "functions": { diff --git a/icd/api/app_profile.cpp b/icd/api/app_profile.cpp index cf466325..c3013a5f 100644 --- a/icd/api/app_profile.cpp +++ b/icd/api/app_profile.cpp @@ -100,6 +100,12 @@ struct AppProfilePattern AppProfilePatternEntry entries[16]; }; +// define PatternEnd + +constexpr AppProfilePatternEntry PatternEnd = {}; + +// Section START AppProfilePatternEntry for all Games + constexpr AppProfilePatternEntry AppNameDoom = { PatternAppNameLower, @@ -600,6 +606,24 @@ constexpr AppProfilePatternEntry AppNameX4Foundations "x4" }; +constexpr AppProfilePatternEntry AppNameHaloInfiniteLauncher +{ + PatternAppNameLower, + "haloinfinite.exe" +}; + +constexpr AppProfilePatternEntry AppNameTf2Win64 +{ + PatternAppNameLower, + "tf_win64.exe" +}; + +constexpr AppProfilePatternEntry AppNameTf2Linux64 +{ + PatternAppNameLower, + "tf_linux64" +}; + constexpr AppProfilePatternEntry AppNameX4Engine { PatternEngineNameLower, @@ -732,9 +756,36 @@ constexpr AppProfilePatternEntry AppEngineQuanticDream "quantic dream engine" }; -constexpr AppProfilePatternEntry PatternEnd = {}; +constexpr AppProfilePatternEntry AppNameEnshrouded = +{ + PatternAppNameLower, + "enshrouded" +}; + +constexpr AppProfilePatternEntry AppEngineHolistic = +{ + PatternEngineNameLower, + "holistic" +}; + +constexpr AppProfilePatternEntry AppNameWindowKill = +{ + PatternAppNameLower, + "windowkill" +}; + +constexpr AppProfilePatternEntry AppEngineGodot = +{ + PatternEngineNameLower, + "godot engine" +}; + +// Section END of AppProfilePatternEntry for all games // This is a table of patterns. The first matching pattern in this table will be returned. +// Note: If an app gets detected by both app name and engine name, +// whatever comes first in this array will be the chosen app profile in ScanApplicationProfile(). +// This should get fixed so not as to get bitten by the order here! 
AppProfilePattern AppPatternTable[] = { { @@ -800,14 +851,6 @@ AppProfilePattern AppPatternTable[] = } }, - { - AppProfile::IdTechEngine, - { - AppEngineIdTech, - PatternEnd - } - }, - { AppProfile::Dota2, { @@ -1375,6 +1418,32 @@ AppProfilePattern AppPatternTable[] = } }, + { + AppProfile::DxvkHaloInfiniteLauncher, + { + AppNameHaloInfiniteLauncher, + AppEngineDXVK, + PatternEnd + } + }, + + { + AppProfile::DxvkTf2, + { + AppNameTf2Win64, + AppEngineDXVK, + PatternEnd + } + }, + + { + AppProfile::DxvkTf2, + { + AppNameTf2Linux64, + AppEngineDXVK, + PatternEnd + } + }, { AppProfile::MetalGearSolid5, { @@ -1466,6 +1535,23 @@ AppProfilePattern AppPatternTable[] = } }, + { + AppProfile::Enshrouded, + { + AppNameEnshrouded, + AppEngineHolistic, + PatternEnd + } + }, + + { + AppProfile::HolisticEngine, + { + AppEngineHolistic, + PatternEnd + } + }, + { AppProfile::Zink, { @@ -1496,6 +1582,23 @@ AppProfilePattern AppPatternTable[] = AppEngineDXVK, PatternEnd } + }, + + { + AppProfile::IdTechEngine, + { + AppEngineIdTech, + PatternEnd + } + }, + + { + AppProfile::WindowKill, + { + AppNameWindowKill, + AppEngineGodot, + PatternEnd + } } }; diff --git a/icd/api/app_shader_optimizer.cpp b/icd/api/app_shader_optimizer.cpp index 475b230f..94fbe86b 100644 --- a/icd/api/app_shader_optimizer.cpp +++ b/icd/api/app_shader_optimizer.cpp @@ -1154,12 +1154,22 @@ void ShaderOptimizer::BuildAppProfile() if ((m_settings.pipelineProfileIgnoresAppProfile == false) && (pMemory != nullptr)) { memset(pMemory, 0, newSize); + BuildAppProfileGeneric(); { BuildAppProfileLlpc(); } } } +// ===================================================================================================================== +void ShaderOptimizer::BuildAppProfileGeneric() +{ + const AppProfile appProfile = m_pDevice->GetAppProfile(); + const Pal::GpuType gpuType = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().gpuType; + + uint32 i = 0; +} + // ===================================================================================================================== void ShaderOptimizer::BuildAppProfileLlpc() { diff --git a/icd/api/appopt/bvh_batch_layer.cpp b/icd/api/appopt/bvh_batch_layer.cpp new file mode 100644 index 00000000..9fc0d5d7 --- /dev/null +++ b/icd/api/appopt/bvh_batch_layer.cpp @@ -0,0 +1,818 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file bvh_batch_layer.cpp +* @brief Implementation of bvh batch layer. +*********************************************************************************************************************** +*/ + +#if VKI_RAY_TRACING + +#include + +#include "bvh_batch_layer.h" +#include "vk_cmdbuffer.h" +#include "raytrace/ray_tracing_device.h" +#include "palVectorImpl.h" + +namespace vk +{ + +// ===================================================================================================================== +BvhBatchLayer::BvhBatchLayer( + Device* pDevice) + : + m_pInstance(pDevice->VkInstance()), + m_emptyStateCount(0), + m_pEmptyStateStack() +{ +} + +// ===================================================================================================================== +BvhBatchLayer::~BvhBatchLayer() +{ + for (uint32_t stateIdx = 0; stateIdx < m_emptyStateCount; ++stateIdx) + { + m_pEmptyStateStack[stateIdx]->DestroyState(); + } +} + +// ===================================================================================================================== +VkResult BvhBatchLayer::Init( + Device* pDevice) +{ + VkResult result = VK_SUCCESS; + + if (pDevice->GetRuntimeSettings().batchBvhBuilds == BatchBvhModeImplicitAndLog) + { + const char* pRootDir = pDevice->PalDevice(DefaultDeviceIndex)->GetDebugFilePath(); + + if (pRootDir != nullptr) + { + char absPath[1024] = {}; + Util::Snprintf(absPath, sizeof(absPath), "%s/%s", pRootDir, "BvhBatchLog.txt"); + + if (result == VK_SUCCESS) + { + result = PalToVkResult(m_logFile.Open(absPath, Util::FileAccessMode::FileAccessAppend)); + } + + if (result == VK_SUCCESS) + { + result = PalToVkResult(m_logFile.Printf("|--------------BEGIN RUN--------------\n")); + } + } + else + { + // AMD_DEBUG_DIR must be set for logging + result = VK_ERROR_UNKNOWN; + } + } + + return result; +} + +// ===================================================================================================================== +VkResult BvhBatchLayer::CreateLayer( + Device* pDevice, + BvhBatchLayer** ppLayer) +{ + VkResult result = VK_SUCCESS; + BvhBatchLayer* pLayer = nullptr; + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + + if ((settings.batchBvhBuilds == BatchBvhModeImplicit) || (settings.batchBvhBuilds == BatchBvhModeImplicitAndLog)) + { + void* pMem = pDevice->VkInstance()->AllocMem(sizeof(BvhBatchLayer), VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (pMem != nullptr) + { + pLayer = VK_PLACEMENT_NEW(pMem) BvhBatchLayer(pDevice); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + + if (result == VK_SUCCESS) + { + result = pLayer->Init(pDevice); + } + + if (result == VK_SUCCESS) + { + *ppLayer = pLayer; + } + + return result; +} + +// ===================================================================================================================== +void BvhBatchLayer::DestroyLayer() +{ + m_logFile.Printf("|--------------END RUN--------------\n"); + m_logFile.Close(); + + Instance* pInstance = VkInstance(); + Util::Destructor(this); + 
pInstance->FreeMem(this); +} + +// ===================================================================================================================== +void BvhBatchLayer::VLog( + const char* pFormat, + va_list argList) +{ + VK_ASSERT(LoggingEnabled()); + + Util::MutexAuto lock(&m_mutex); + + Util::Result printResult = m_logFile.VPrintf(pFormat, argList); + VK_ASSERT(printResult == Util::Result::Success); +} + +// ===================================================================================================================== +BvhBatchState* BvhBatchLayer::CreateState( + CmdBuffer* pCmdBuffer) +{ + // Try to reuse a previously freed state + BvhBatchState* pState = PopEmptyState(); + + if (pState != nullptr) + { + pState->Log("Reusing a stashed BvhBatchState.\n"); + } + else + { + // Allocate a new state if no previously freed states were available + void* pMem = m_pInstance->AllocMem(sizeof(BvhBatchState), VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + pState = (pMem != nullptr) ? (VK_PLACEMENT_NEW(pMem) BvhBatchState(this)) : nullptr; + } + + // Link this state to the given cmd buffer + pCmdBuffer->SetBvhBatchState(pState); + + VK_ASSERT(pState != nullptr); + return pState; +} + +// ===================================================================================================================== +bool BvhBatchLayer::PushEmptyState( + BvhBatchState* pState) +{ + bool success = false; + + Util::MutexAuto lock(&m_mutex); + + if (m_emptyStateCount < VK_ARRAY_SIZE(m_pEmptyStateStack)) + { + m_pEmptyStateStack[m_emptyStateCount] = pState; + m_emptyStateCount++; + + success = true; + } + + return success; +} + +// ===================================================================================================================== +BvhBatchState* BvhBatchLayer::PopEmptyState() +{ + BvhBatchState* pState = nullptr; + + Util::MutexAuto lock(&m_mutex); + + if (m_emptyStateCount > 0) + { + m_emptyStateCount--; + pState = m_pEmptyStateStack[m_emptyStateCount]; + } + + return pState; +} + +// ===================================================================================================================== +BvhBatchState::BvhBatchState( + BvhBatchLayer* pLayer) + : + m_type(BvhBatchType::Undefined), + m_pCmdBuffer(nullptr), + m_pLayer(pLayer), + m_geomInfos(pLayer->VkInstance()->Allocator()), + m_rangeInfosOrMaxPrimCounts(pLayer->VkInstance()->Allocator()), + m_indirectVirtAddrs(pLayer->VkInstance()->Allocator()), + m_indirectStrides(pLayer->VkInstance()->Allocator()), + m_infoCount(0), + m_allocations(pLayer->VkInstance()->Allocator()) +{ + Log("Allocating a new BvhBatchState.\n"); +} + +// ===================================================================================================================== +BvhBatchState::~BvhBatchState() +{ +} + +// ===================================================================================================================== +void BvhBatchState::Log( + const char* pFormat, + ...) 
+{ + if (m_pLayer->LoggingEnabled()) + { + char prependedStr[21] = {}; + Util::Snprintf(prependedStr, sizeof(prependedStr), "|-- 0x%" PRIx64 " - ", this); + + va_list argList = {}; + m_pLayer->VLog(prependedStr, argList); + + va_start(argList, pFormat); + m_pLayer->VLog(pFormat, argList); + va_end(argList); + } +} + +// ===================================================================================================================== +void BvhBatchState::DestroyState() +{ + Log("Freeing a BvhBatchState.\n"); + Util::Destructor(this); + m_pLayer->VkInstance()->FreeMem(this); +} + +// ===================================================================================================================== +void BvhBatchState::Reset() +{ + for (auto pMem : m_allocations) + { + m_pLayer->VkInstance()->FreeMem(pMem); + } + + m_type = BvhBatchType::Undefined; + m_allocations.Clear(); + m_geomInfos.Clear(); + m_rangeInfosOrMaxPrimCounts.Clear(); + m_indirectVirtAddrs.Clear(); + m_indirectStrides.Clear(); + m_infoCount = 0; + + // Unlink this state from the cmd buffer + m_pCmdBuffer->SetBvhBatchState(nullptr); + m_pCmdBuffer = nullptr; + + // Try to stash this now empty state to be reused later + if (m_pLayer->PushEmptyState(this)) + { + Log("Stashing a BvhBatchState during reset.\n"); + } + else + { + DestroyState(); + } +} + +// ===================================================================================================================== +template +bool BvhBatchState::EnqueueBvhBuild( + CmdBuffer* pCmdBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts) +{ + static_assert(batchType != BvhBatchType::Undefined, "Invalid batch type provided to EnqueueBvhBuild via template."); + + // Ensure the batch type in the state matches + if ((m_type != batchType) && (m_type != BvhBatchType::Undefined)) + { + Flush(); + } + + // Determine how much memory the hard copy needs + size_t memSize = GetHardCopyMemSize(infoCount, pInfos); + + // Allocate memory for the hard copy + void* pMem = m_pLayer->VkInstance()->AllocMem(memSize, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + // Hard copy given data + if (pMem != nullptr) + { + if (m_infoCount == 0) + { + m_pCmdBuffer = pCmdBuffer; + } + else if (m_pCmdBuffer != pCmdBuffer) + { + // CmdBuffer pointer shouldn't change when pending infos are present + VK_NEVER_CALLED(); + Flush(); + } + + Log("Enqueueing %u BVH build infos (batchType - %u).\n", infoCount, batchType); + HardCopyBuildInfos( + infoCount, + pInfos, + ppBuildRangeInfos, + pIndirectDeviceAddresses, + pIndirectStrides, + ppMaxPrimitiveCounts, + pMem, + memSize); + } + else + { + // Failed to allocate memory + VK_NEVER_CALLED(); + } + + return (pMem != nullptr); +} + +// ===================================================================================================================== +void BvhBatchState::Flush() +{ + if (m_infoCount > 0) + { + BvhBatchLayer* pLayer = m_pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + + VK_ASSERT(m_type != BvhBatchType::Undefined); + + if (m_type == BvhBatchType::Direct) + { + Log("Flushing a direct build batch (infoCount - %u).\n", m_infoCount); + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdBuildAccelerationStructuresKHR)( + reinterpret_cast(ApiCmdBuffer::FromObject(m_pCmdBuffer)), + m_infoCount, + m_geomInfos.Data(), + 
reinterpret_cast(m_rangeInfosOrMaxPrimCounts.Data())); + } + else + { + Log("Flushing an indirect build batch (infoCount - %u).\n", m_infoCount); + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdBuildAccelerationStructuresIndirectKHR)( + reinterpret_cast(ApiCmdBuffer::FromObject(m_pCmdBuffer)), + m_infoCount, + m_geomInfos.Data(), + m_indirectVirtAddrs.Data(), + m_indirectStrides.Data(), + reinterpret_cast(m_rangeInfosOrMaxPrimCounts.Data())); + } + + Reset(); + } +} + +// ===================================================================================================================== +void BvhBatchState::TryFlush( + VkFlags64 srcStageMask) +{ + constexpr VkFlags64 TargetSrcStages = + VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR | + VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; + + if ((srcStageMask & TargetSrcStages) != 0) + { + Log("Flushing via barrier or event (srcStageMask - %llu).\n", srcStageMask); + Flush(); + } +} + +// ===================================================================================================================== +void BvhBatchState::TryFlush( + uint32_t depInfoCount, + const VkDependencyInfo* pDependencyInfos) +{ + VkFlags64 globalSrcMask = 0u; + + for (uint32_t i = 0; i < depInfoCount; ++i) + { + const auto& dependencyInfo = pDependencyInfos[i]; + + for (uint32_t j = 0; j < dependencyInfo.memoryBarrierCount; j++) + { + globalSrcMask |= dependencyInfo.pMemoryBarriers[j].srcStageMask; + } + for (uint32_t j = 0; j < dependencyInfo.bufferMemoryBarrierCount; j++) + { + globalSrcMask |= dependencyInfo.pBufferMemoryBarriers[j].srcStageMask; + } + for (uint32_t j = 0; j < dependencyInfo.imageMemoryBarrierCount; j++) + { + globalSrcMask |= dependencyInfo.pImageMemoryBarriers[j].srcStageMask; + } + } + + TryFlush(globalSrcMask); +} + +// ===================================================================================================================== +template +size_t BvhBatchState::GetHardCopyMemSize( + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos) +{ + // Calculate total geometry structs and ptrs across all infos + size_t totalGeomCount = 0; + size_t totalGeomPtrCount = 0; + for (uint32_t infoIdx = 0; infoIdx < infoCount; ++infoIdx) + { + totalGeomCount += pInfos[infoIdx].geometryCount; + + if (pInfos[infoIdx].ppGeometries != nullptr) + { + totalGeomPtrCount += pInfos[infoIdx].geometryCount; + } + } + + // Memory size for pGeometries and ppGeometies + size_t memSize = + (totalGeomCount * sizeof(VkAccelerationStructureGeometryKHR)) + + (totalGeomPtrCount * sizeof(void*)); + + // Memory size for ppBuildRangeInfos or ppMaxPrimitiveCounts + if (batchType == BvhBatchType::Direct) + { + memSize += (totalGeomCount * sizeof(VkAccelerationStructureBuildRangeInfoKHR)); + } + else + { + memSize += (totalGeomCount * sizeof(uint32_t*)); + } + + // Report the memory size required + return memSize; +} + +// ===================================================================================================================== +template +void BvhBatchState::HardCopyBuildInfos( + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts, + void* pMem, + size_t memSize) +{ + m_allocations.PushBack(pMem); + + for (uint32_t infoIdx = 0; infoIdx < infoCount; ++infoIdx) + { + 
VkAccelerationStructureBuildGeometryInfoKHR geomInfoDst = pInfos[infoIdx]; + + // Per spec, pNext must be NULL + VK_ASSERT(geomInfoDst.pNext == nullptr); + + const size_t geometrySize = geomInfoDst.geometryCount * sizeof(VkAccelerationStructureGeometryKHR); + const size_t geometryPtrSize = geomInfoDst.geometryCount * sizeof(void*); + + if (geomInfoDst.ppGeometries != nullptr) + { + // Array of Goemetry pointers + VkAccelerationStructureGeometryKHR** ppGeometries = + static_cast(pMem); + + // Geometry descs follow the pointers + VkAccelerationStructureGeometryKHR* pGeometries = + static_cast(Util::VoidPtrInc(pMem, geometryPtrSize)); + + // Copy each geometry info and its new pointer into the internal allocation + for (uint32 i = 0; i < geomInfoDst.geometryCount; i++) + { + pGeometries[i] = *geomInfoDst.ppGeometries[i]; + ppGeometries[i] = &pGeometries[i]; + } + + // Apply the local copy + geomInfoDst.ppGeometries = + static_cast(pMem); + + // Increment the data pointer for the following copy + pMem = Util::VoidPtrInc(pMem, geometrySize + geometryPtrSize); + } + else + { + // Copy original geometry info into the internal allocation + memcpy(pMem, geomInfoDst.pGeometries, geometrySize); + + // Apply the local copy + geomInfoDst.pGeometries = + static_cast(pMem); + + // Increment the data pointer for the following copy + pMem = Util::VoidPtrInc(pMem, geometrySize); + } + + m_type = batchType; + m_geomInfos.PushBack(geomInfoDst); + m_infoCount++; + + if (batchType == BvhBatchType::Direct) + { + // Copy BuildRangeInfos into internal allocation + const size_t rangeInfoSize = geomInfoDst.geometryCount * sizeof(VkAccelerationStructureBuildRangeInfoKHR); + memcpy(pMem, ppBuildRangeInfos[infoIdx], rangeInfoSize); + + m_rangeInfosOrMaxPrimCounts.PushBack(pMem); + + // Increment the data pointer for the following copy + pMem = Util::VoidPtrInc(pMem, rangeInfoSize); + } + else + { + // Copy MaxPrimitiveCounts into internal allocation + const size_t maxPrimCountsSize = geomInfoDst.geometryCount * sizeof(uint32_t); + memcpy(pMem, ppMaxPrimitiveCounts[infoIdx], maxPrimCountsSize); + + m_rangeInfosOrMaxPrimCounts.PushBack(pMem); + + // Increment the data pointer for the following copy + pMem = Util::VoidPtrInc(pMem, maxPrimCountsSize); + + m_indirectVirtAddrs.PushBack(pIndirectDeviceAddresses[infoIdx]); + m_indirectStrides.PushBack(pIndirectStrides[infoIdx]); + } + } + + // Ensure that we did not overallocate nor underallocate + VK_ASSERT((reinterpret_cast(pMem) - reinterpret_cast(m_allocations.Back())) == memSize); +} + +namespace entry +{ + +namespace bvhBatchLayer +{ + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdBuildAccelerationStructuresKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) +{ + bool queued = false; + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState == nullptr) + { + pState = pLayer->CreateState(pCmdBuffer); + } + + if (pState != nullptr) + { + queued = pState->EnqueueBvhBuild( + pCmdBuffer, + infoCount, + pInfos, + ppBuildRangeInfos, + nullptr, + nullptr, + nullptr); + + if (queued == false) + { + // State exists, but we were not able to enqueue. 
Flush any valid contents in the batch. + pState->Flush(); + } + } + + if (queued == false) + { + // We were not able to batch. Add directly to cmd buffer. + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdBuildAccelerationStructuresKHR)( + commandBuffer, + infoCount, + pInfos, + ppBuildRangeInfos); + } +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdBuildAccelerationStructuresIndirectKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts) +{ + bool queued = false; + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState == nullptr) + { + pState = pLayer->CreateState(pCmdBuffer); + } + + if (pState != nullptr) + { + queued = pState->EnqueueBvhBuild( + pCmdBuffer, + infoCount, + pInfos, + nullptr, + pIndirectDeviceAddresses, + pIndirectStrides, + ppMaxPrimitiveCounts); + + if (queued == false) + { + // State exists, but we were not able to enqueue. Flush any valid contents in the batch. + pState->Flush(); + } + } + + if (queued == false) + { + // We were not able to batch. Add directly to cmd buffer. + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdBuildAccelerationStructuresIndirectKHR)( + commandBuffer, + infoCount, + pInfos, + pIndirectDeviceAddresses, + pIndirectStrides, + ppMaxPrimitiveCounts); + } +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier( + VkCommandBuffer commandBuffer, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + VkDependencyFlags dependencyFlags, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->TryFlush(srcStageMask); + } + + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdPipelineBarrier)( + commandBuffer, + srcStageMask, + dstStageMask, + dependencyFlags, + memoryBarrierCount, + pMemoryBarriers, + bufferMemoryBarrierCount, + pBufferMemoryBarriers, + imageMemoryBarrierCount, + pImageMemoryBarriers); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier2( + VkCommandBuffer commandBuffer, + const VkDependencyInfoKHR* pDependencyInfo) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->TryFlush(1, pDependencyInfo); + } + + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdPipelineBarrier2)(commandBuffer, pDependencyInfo); +} + +// 
===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdWaitEvents( + VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent* pEvents, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->TryFlush(srcStageMask); + } + + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdWaitEvents)( + commandBuffer, + eventCount, + pEvents, + srcStageMask, + dstStageMask, + memoryBarrierCount, + pMemoryBarriers, + bufferMemoryBarrierCount, + pBufferMemoryBarriers, + imageMemoryBarrierCount, + pImageMemoryBarriers); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdWaitEvents2( + VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent* pEvents, + const VkDependencyInfoKHR* pDependencyInfos) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->TryFlush(eventCount, pDependencyInfos); + } + + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdWaitEvents2)(commandBuffer, eventCount, pEvents, pDependencyInfos); +} + +// ===================================================================================================================== +VKAPI_ATTR VkResult VKAPI_CALL vkEndCommandBuffer( + VkCommandBuffer commandBuffer) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->Log("Flushing via vkEndCommandBuffer\n"); + pState->Flush(); + } + + return BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkEndCommandBuffer)(commandBuffer); +} + +} // namespace bvhBatchLayer + +} // namespace entry + +// ===================================================================================================================== +void BvhBatchLayer::OverrideDispatchTable( + DispatchTable* pDispatchTable) +{ + // Save current device dispatch table to use as the next layer. 
+ m_nextLayer = *pDispatchTable; + + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdBuildAccelerationStructuresKHR); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdBuildAccelerationStructuresIndirectKHR); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdPipelineBarrier); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdPipelineBarrier2); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdWaitEvents); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdWaitEvents2); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkEndCommandBuffer); +} + +} // namespace vk + +#endif diff --git a/icd/api/appopt/bvh_batch_layer.h b/icd/api/appopt/bvh_batch_layer.h new file mode 100644 index 00000000..aa648302 --- /dev/null +++ b/icd/api/appopt/bvh_batch_layer.h @@ -0,0 +1,160 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file bvh_batch_layer.h +* @brief Declaration of bvh batch layer +*********************************************************************************************************************** +*/ + +#if VKI_RAY_TRACING +#ifndef __BVH_BATCH_LAYER_H +#define __BVH_BATCH_LAYER_H + +#pragma once +#include "opt_layer.h" +#include "vk_alloccb.h" +#include "vk_cmdbuffer.h" +#include "palVector.h" +#include "palMutex.h" +#include "palFile.h" + +namespace vk +{ + +enum class BvhBatchType : uint32 +{ + Undefined, + Direct, + Indirect +}; + +class BvhBatchLayer; + +class BvhBatchState +{ +public: + BvhBatchState(BvhBatchLayer* pLayer); + ~BvhBatchState(); + + void Log(const char* pFormat, ...); + + template + bool EnqueueBvhBuild( + CmdBuffer* pCmdBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts); + + void Reset(); + void Flush(); + void TryFlush(VkFlags64 srcStageMask); + void TryFlush(uint32_t depInfoCount, const VkDependencyInfo* pDependencyInfos); + void DestroyState(); + +private: + template + size_t GetHardCopyMemSize( + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos); + + template + void HardCopyBuildInfos( + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts, + void* pMem, + size_t memSize); + + typedef Util::Vector GeometryInfoList; + typedef Util::Vector VoidPtrList; + typedef Util::Vector VirtAddrList; + typedef Util::Vector StrideList; + + BvhBatchType m_type; + CmdBuffer* m_pCmdBuffer; + BvhBatchLayer* m_pLayer; + GeometryInfoList m_geomInfos; + VoidPtrList m_rangeInfosOrMaxPrimCounts; + VirtAddrList m_indirectVirtAddrs; + StrideList m_indirectStrides; + uint32_t m_infoCount; + VoidPtrList m_allocations; +}; + +class BvhBatchLayer final : public OptLayer +{ +public: + ~BvhBatchLayer(); + + static VkResult CreateLayer(Device* pDevice, BvhBatchLayer** ppLayer); + void DestroyLayer(); + + virtual void OverrideDispatchTable(DispatchTable* pDispatchTable) override; + + void VLog(const char* pFormat, va_list argList); + + BvhBatchState* CreateState(CmdBuffer* pCmdBuffer); + bool PushEmptyState(BvhBatchState* pState); + BvhBatchState* PopEmptyState(); + + Instance* VkInstance() { return m_pInstance; } + bool LoggingEnabled() { return m_logFile.IsOpen(); } + +private: + PAL_DISALLOW_COPY_AND_ASSIGN(BvhBatchLayer); + + BvhBatchLayer(Device* pDevice); + + VkResult Init(Device* pDevice); + + Instance* m_pInstance; + Util::Mutex m_mutex; + + uint32_t m_emptyStateCount; + BvhBatchState* m_pEmptyStateStack[16]; + + Util::File m_logFile; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#define BVH_BATCH_LAYER_OVERRIDE_ALIAS(entry_name, func_name) \ + pDispatchTable->OverrideEntryPoints()->entry_name = vk::entry::bvhBatchLayer::func_name; + +#define 
BVH_BATCH_LAYER_OVERRIDE_ENTRY(entry_name) BVH_BATCH_LAYER_OVERRIDE_ALIAS(entry_name, entry_name) + +#define BVH_BATCH_LAYER_CALL_NEXT_LAYER(entry_name) \ + pLayer->GetNextLayer()->GetEntryPoints().entry_name + +} // namespace vk + +#endif /* __BVH_BATCH_LAYER_H */ +#endif /* VKI_RAY_TRACING */ diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp10_3/generic/StrangeBrigade/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp10_3/generic/StrangeBrigade/profile.json new file mode 100644 index 00000000..c43cb320 --- /dev/null +++ b/icd/api/appopt/shader_profiles/llpc/gfxIp10_3/generic/StrangeBrigade/profile.json @@ -0,0 +1,3 @@ +{ + "entries": [] +} \ No newline at end of file diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Enshrouded/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Enshrouded/profile.json new file mode 100644 index 00000000..d584db1f --- /dev/null +++ b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Enshrouded/profile.json @@ -0,0 +1,56 @@ +{ + "entries": [ + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0xcf8eb50df001f7cc ce615bacc3823464" + } + }, + "action": { + "cs": { + "threadGroupSwizzleMode": "_16x16" + } + } + }, + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0x4eb6c36d1b5fab73 110e3e5875ad5038" + } + }, + "action": { + "cs": { + "disableCodeSinking": true + } + } + }, + { + "pattern": { + "ps": { + "codeHash": "0x3c706d601cf4803e 107f065dcad03a0b" + } + }, + "action": { + "ps": { + "waveSize": 32 + } + } + }, + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0xaa8891d44ef6d284 ebf339f1b47fe1d1" + } + }, + "action": { + "cs": { + "wgpMode": 2, + "threadGroupSwizzleMode": "_16x16" + } + } + } + ] +} \ No newline at end of file diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi32/Enshrouded/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi32/Enshrouded/profile.json new file mode 100644 index 00000000..58c05d67 --- /dev/null +++ b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi32/Enshrouded/profile.json @@ -0,0 +1,42 @@ +{ + "entries": [ + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0x4eb6c36d1b5fab73 110e3e5875ad5038" + } + }, + "action": { + "cs": { + "disableCodeSinking": true + } + } + }, + { + "pattern": { + "ps": { + "codeHash": "0x3c706d601cf4803e 107f065dcad03a0b" + } + }, + "action": { + "ps": { + "waveSize": 32 + } + } + }, + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0xcf8eb50df001f7cc ce615bacc3823464" + } + }, + "action": { + "cs": { + "threadGroupSwizzleMode": "_16x16" + } + } + } + ] +} \ No newline at end of file diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/StrangeBrigade/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/StrangeBrigade/profile.json new file mode 100644 index 00000000..c43cb320 --- /dev/null +++ b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/StrangeBrigade/profile.json @@ -0,0 +1,3 @@ +{ + "entries": [] +} \ No newline at end of file diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index 15be0b29..8f886284 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -37,6 +37,8 @@ #include +using namespace std::chrono_literals; + namespace vk { @@ -846,7 +848,7 @@ void LlpcHelperThreadProvider::WaitForTasks() { while (m_pDeferredWorkload->completedInstances < m_pDeferredWorkload->totalInstances) { - m_pDeferredWorkload->event.Wait(Util::fseconds { 
1.0f }); + m_pDeferredWorkload->event.Wait(1s); } } @@ -1020,6 +1022,11 @@ VkResult CompilerSolutionLlpc::CreateLlpcCompiler( llpcOptions[numOptions++] = "-enable-pipeline-dump"; } + if (settings.enableImageMsaaLoadOpt) + { + llpcOptions[numOptions++] = "-mattr=-msaa-load-dst-sel-bug"; + } + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-pipeline-dump-dir=%s", settings.pipelineDumpDir); ++optionLength; llpcOptions[numOptions++] = pOptionBuffer; diff --git a/icd/api/debug_printf.cpp b/icd/api/debug_printf.cpp index 6fd013ac..866ce8fa 100644 --- a/icd/api/debug_printf.cpp +++ b/icd/api/debug_printf.cpp @@ -37,6 +37,7 @@ #include using namespace vk; +using namespace std::chrono_literals; //===================================================================================================================== DebugPrintf::DebugPrintf( @@ -120,13 +121,18 @@ void DebugPrintf::BindPipeline( Pal::BufferViewInfo srdInfo = {}; srdInfo.gpuAddr = m_printfMemory.GpuVirtAddr(deviceIdx); - srdInfo.range = m_printfMemory.Size(); + srdInfo.range = m_printfMemory.Size(); + pDevice->PalDevice(deviceIdx)->CreateUntypedBufferViewSrds(1, &srdInfo, pTable); + m_frame = 1; + const Pal::uint32* pEntry = reinterpret_cast(&tableVa); + pCmdBuffer->CmdSetUserData(static_cast(bindPoint), userDataOffset, 1, pEntry); m_parsedFormatStrings.Reset(); + for (auto it = pPipeline->GetFormatStrings()->Begin(); it.Get() != nullptr; it.Next()) { bool found = true; @@ -214,7 +220,7 @@ Pal::Result DebugPrintf::PostQueueProcess( while (true) { palResult = pDevice->PalDevice(DefaultDeviceIndex)->WaitForSemaphores( - 1, palSemaphores, waitValues, 0, std::chrono::nanoseconds {1000000llu}); + 1, palSemaphores, waitValues, 0, 1ms); decodeOffset = ProcessDebugPrintfBuffer(pDevice, deviceIdx, decodeOffset, &file); if ((PalToVkResult(palResult) <= 0) || (loopIndex++ > 1000)) diff --git a/icd/api/include/app_profile.h b/icd/api/include/app_profile.h index 0c2f8811..27496109 100644 --- a/icd/api/include/app_profile.h +++ b/icd/api/include/app_profile.h @@ -94,7 +94,6 @@ enum class AppProfile : uint32_t EvilGenius2, // Evil Genius 2 KnockoutCity, // Knockout City SkyGold, // Sky Gold by NetEase - IdTechEngine, // id Tech Engine (Default) Feral3DEngine, // Feral3D Engine (Default) StrangeEngine, // Strange Engine (Default) SedpEngine, // Serious Engine (Default) @@ -112,6 +111,9 @@ enum class AppProfile : uint32_t SniperElite5, // Sniper Elite 5 by Rebellion SeriousSamVrTheLastHope, // Serious Sam VR The Last Hope by Croteam BaldursGate3, // Baldur's Gate by Larian Studios + Enshrouded, // Enshrouded by Keen Games + HolisticEngine, // Holistic Engine by Keen Games + IdTechEngine, // id Tech Engine (Default) #if VKI_RAY_TRACING ControlDX12, // VKD3D Control Ultimate Edition RayTracingWeekends, // RayTracingInVulkan demo @@ -122,6 +124,9 @@ enum class AppProfile : uint32_t DxvkGodOfWar, // DXVK God of War ELEX2, // ELEX II X4Foundations, // X4: Foundations by Egosoft + DxvkHaloInfiniteLauncher,// DXVK Halo Infinite Launcher (Don't Confuse it with VKD3D + // Halo Infinite Game) + DxvkTf2, // DXVK Team Fortress 2 MetalGearSolid5, // Metal Gear Solid5 : The Phantom Pain MetalGearSolid5Online, // Metal Gear Solid5 : The Phantom Pain Online YamagiQuakeII, // Yamagi Quake II @@ -142,6 +147,7 @@ enum class AppProfile : uint32_t Enscape, // Enscape by Chaos Vkd3dEngine, // vkd3d-proton for steam games DXVK, // DXVK + WindowKill, // Windowkill by torcado }; struct ProfileSettings diff --git a/icd/api/include/app_shader_optimizer.h 
b/icd/api/include/app_shader_optimizer.h index f3a1fc7b..4ac850b5 100644 --- a/icd/api/include/app_shader_optimizer.h +++ b/icd/api/include/app_shader_optimizer.h @@ -201,6 +201,7 @@ class ShaderOptimizer void BuildTuningProfile(); void BuildAppProfile(); + void BuildAppProfileGeneric(); void BuildAppProfileLlpc(); diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h index 2ad93eb0..377401cc 100644 --- a/icd/api/include/compiler_solution.h +++ b/icd/api/include/compiler_solution.h @@ -365,6 +365,7 @@ class CompilerSolution virtual Vkgc::BinaryData ExtractPalElfBinary(const Vkgc::BinaryData& shaderBinary) = 0; static void DisableNggCulling(Vkgc::NggState* pNggState); + static const char* GetShaderStageName(ShaderStage shaderStage); #if VKI_RAY_TRACING static void UpdateRayTracingFunctionNames( @@ -400,7 +401,6 @@ class CompilerSolution PipelineBinaryCache* m_pBinaryCache; // Internal pipeline binary cache // NOTE: It is owned by PipelineCompiler. PipelineCompileCacheMatrix m_gplCacheMatrix; // Graphics pipeline compile statistic info - static const char* GetShaderStageName(ShaderStage shaderStage); static const char* GetGraphicsLibraryName(GraphicsLibraryType libraryType); private: diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h index 6d09e280..38468f26 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h @@ -69,7 +69,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0 // Version of this file -#define VK_HEADER_VERSION 280 +#define VK_HEADER_VERSION 285 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 3, VK_HEADER_VERSION) @@ -1046,6 +1046,8 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_SPARSE_ADDRESS_SPACE_PROPERTIES_NV = 1000492001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MUTABLE_DESCRIPTOR_TYPE_FEATURES_EXT = 1000351000, VK_STRUCTURE_TYPE_MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT = 1000351002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LEGACY_VERTEX_ATTRIBUTES_FEATURES_EXT = 1000495000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LEGACY_VERTEX_ATTRIBUTES_PROPERTIES_EXT = 1000495001, VK_STRUCTURE_TYPE_LAYER_SETTINGS_CREATE_INFO_EXT = 1000496000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_BUILTINS_FEATURES_ARM = 1000497000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_BUILTINS_PROPERTIES_ARM = 1000497001, @@ -1112,6 +1114,9 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAW_ACCESS_CHAINS_FEATURES_NV = 1000555000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT16_VECTOR_FEATURES_NV = 1000563000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_VALIDATION_FEATURES_NV = 1000568000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_ALIGNMENT_CONTROL_FEATURES_MESA = 1000575000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_ALIGNMENT_CONTROL_PROPERTIES_MESA = 1000575001, + VK_STRUCTURE_TYPE_IMAGE_ALIGNMENT_CONTROL_CREATE_INFO_MESA = 1000575002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES, VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, @@ -1676,7 +1681,7 @@ typedef enum VkFormat { VK_FORMAT_PVRTC1_4BPP_SRGB_BLOCK_IMG = 
1000054005, VK_FORMAT_PVRTC2_2BPP_SRGB_BLOCK_IMG = 1000054006, VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG = 1000054007, - VK_FORMAT_R16G16_S10_5_NV = 1000464000, + VK_FORMAT_R16G16_SFIXED5_NV = 1000464000, VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR = 1000470000, VK_FORMAT_A8_UNORM_KHR = 1000470001, VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT = VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK, @@ -1733,6 +1738,7 @@ typedef enum VkFormat { VK_FORMAT_G16_B16R16_2PLANE_444_UNORM_EXT = VK_FORMAT_G16_B16R16_2PLANE_444_UNORM, VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT = VK_FORMAT_A4R4G4B4_UNORM_PACK16, VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT = VK_FORMAT_A4B4G4R4_UNORM_PACK16, + VK_FORMAT_R16G16_S10_5_NV = VK_FORMAT_R16G16_SFIXED5_NV, VK_FORMAT_MAX_ENUM = 0x7FFFFFFF } VkFormat; @@ -11109,6 +11115,7 @@ typedef VkFlags64 VkPipelineCreateFlagBits2KHR; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_DISABLE_OPTIMIZATION_BIT_KHR = 0x00000001ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_ALLOW_DERIVATIVES_BIT_KHR = 0x00000002ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_DERIVATIVE_BIT_KHR = 0x00000004ULL; +static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_ENABLE_LEGACY_DITHERING_BIT_EXT = 0x400000000ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR = 0x00000008ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_DISPATCH_BASE_BIT_KHR = 0x00000010ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_DEFER_COMPILE_BIT_NV = 0x00000020ULL; @@ -18502,7 +18509,7 @@ VKAPI_ATTR void VKAPI_CALL vkCmdOpticalFlowExecuteNV( // VK_EXT_legacy_dithering is a preprocessor guard. Do not pass it to API calls. #define VK_EXT_legacy_dithering 1 -#define VK_EXT_LEGACY_DITHERING_SPEC_VERSION 1 +#define VK_EXT_LEGACY_DITHERING_SPEC_VERSION 2 #define VK_EXT_LEGACY_DITHERING_EXTENSION_NAME "VK_EXT_legacy_dithering" typedef struct VkPhysicalDeviceLegacyDitheringFeaturesEXT { VkStructureType sType; @@ -18727,6 +18734,24 @@ typedef struct VkPhysicalDeviceExtendedSparseAddressSpacePropertiesNV { #define VK_EXT_MUTABLE_DESCRIPTOR_TYPE_EXTENSION_NAME "VK_EXT_mutable_descriptor_type" +// VK_EXT_legacy_vertex_attributes is a preprocessor guard. Do not pass it to API calls. +#define VK_EXT_legacy_vertex_attributes 1 +#define VK_EXT_LEGACY_VERTEX_ATTRIBUTES_SPEC_VERSION 1 +#define VK_EXT_LEGACY_VERTEX_ATTRIBUTES_EXTENSION_NAME "VK_EXT_legacy_vertex_attributes" +typedef struct VkPhysicalDeviceLegacyVertexAttributesFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 legacyVertexAttributes; +} VkPhysicalDeviceLegacyVertexAttributesFeaturesEXT; + +typedef struct VkPhysicalDeviceLegacyVertexAttributesPropertiesEXT { + VkStructureType sType; + void* pNext; + VkBool32 nativeUnalignedPerformance; +} VkPhysicalDeviceLegacyVertexAttributesPropertiesEXT; + + + // VK_EXT_layer_settings is a preprocessor guard. Do not pass it to API calls. #define VK_EXT_layer_settings 1 #define VK_EXT_LAYER_SETTINGS_SPEC_VERSION 2 @@ -19147,6 +19172,30 @@ typedef struct VkPhysicalDeviceRayTracingValidationFeaturesNV { +// VK_MESA_image_alignment_control is a preprocessor guard. Do not pass it to API calls. 
+#define VK_MESA_image_alignment_control 1 +#define VK_MESA_IMAGE_ALIGNMENT_CONTROL_SPEC_VERSION 1 +#define VK_MESA_IMAGE_ALIGNMENT_CONTROL_EXTENSION_NAME "VK_MESA_image_alignment_control" +typedef struct VkPhysicalDeviceImageAlignmentControlFeaturesMESA { + VkStructureType sType; + void* pNext; + VkBool32 imageAlignmentControl; +} VkPhysicalDeviceImageAlignmentControlFeaturesMESA; + +typedef struct VkPhysicalDeviceImageAlignmentControlPropertiesMESA { + VkStructureType sType; + void* pNext; + uint32_t supportedImageAlignmentMask; +} VkPhysicalDeviceImageAlignmentControlPropertiesMESA; + +typedef struct VkImageAlignmentControlCreateInfoMESA { + VkStructureType sType; + const void* pNext; + uint32_t maximumRequestedAlignment; +} VkImageAlignmentControlCreateInfoMESA; + + + // VK_KHR_acceleration_structure is a preprocessor guard. Do not pass it to API calls. #define VK_KHR_acceleration_structure 1 #define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 13 diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_metal.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_metal.h index e6f7bf7a..89a55749 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_metal.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_metal.h @@ -52,28 +52,28 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateMetalSurfaceEXT( #define VK_EXT_metal_objects 1 #ifdef __OBJC__ @protocol MTLDevice; -typedef id MTLDevice_id; +typedef __unsafe_unretained id MTLDevice_id; #else typedef void* MTLDevice_id; #endif #ifdef __OBJC__ @protocol MTLCommandQueue; -typedef id MTLCommandQueue_id; +typedef __unsafe_unretained id MTLCommandQueue_id; #else typedef void* MTLCommandQueue_id; #endif #ifdef __OBJC__ @protocol MTLBuffer; -typedef id MTLBuffer_id; +typedef __unsafe_unretained id MTLBuffer_id; #else typedef void* MTLBuffer_id; #endif #ifdef __OBJC__ @protocol MTLTexture; -typedef id MTLTexture_id; +typedef __unsafe_unretained id MTLTexture_id; #else typedef void* MTLTexture_id; #endif @@ -81,12 +81,12 @@ typedef void* MTLTexture_id; typedef struct __IOSurface* IOSurfaceRef; #ifdef __OBJC__ @protocol MTLSharedEvent; -typedef id MTLSharedEvent_id; +typedef __unsafe_unretained id MTLSharedEvent_id; #else typedef void* MTLSharedEvent_id; #endif -#define VK_EXT_METAL_OBJECTS_SPEC_VERSION 1 +#define VK_EXT_METAL_OBJECTS_SPEC_VERSION 2 #define VK_EXT_METAL_OBJECTS_EXTENSION_NAME "VK_EXT_metal_objects" typedef enum VkExportMetalObjectTypeFlagBitsEXT { diff --git a/icd/api/include/khronos/vulkan.h b/icd/api/include/khronos/vulkan.h index 030be791..11faf725 100644 --- a/icd/api/include/khronos/vulkan.h +++ b/icd/api/include/khronos/vulkan.h @@ -65,6 +65,9 @@ #include "devext/vk_amd_shader_texel_buffer_explicit_format.h" #endif +#if VKI_RAY_TRACING +#endif + #define VK_FORMAT_A1B5G5R5_UNORM_PACK16 VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR #define VK_FORMAT_BEGIN_RANGE VK_FORMAT_UNDEFINED diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 192d051a..b74a3f69 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -51,9 +51,6 @@ #include "renderpass/renderpass_builder.h" -#if VKI_RAY_TRACING -#endif - #include "debug_printf.h" #include "palCmdBuffer.h" #include "palDequeImpl.h" @@ -95,6 +92,7 @@ class QueryPool; #if VKI_RAY_TRACING class RayTracingPipeline; class AccelerationStructureQueryPool; +class BvhBatchState; #endif constexpr uint8_t DefaultStencilOpValue = 1; @@ -1458,6 +1456,10 @@ class CmdBuffer const Pal::IGpuMemory& cpsMem) const; bool HasRayTracing() const { return 
m_flags.hasRayTracing; } + + BvhBatchState* GetBvhBatchState() const { return m_pBvhBatchState; } + + void SetBvhBatchState(BvhBatchState* pBvhBatchState) { m_pBvhBatchState = pBvhBatchState; } #endif template @@ -1598,7 +1600,7 @@ class CmdBuffer void RPLoadOpClearColor(uint32_t count, const RPLoadOpClearInfo* pClears); void RPLoadOpClearDepthStencil(uint32_t count, const RPLoadOpClearInfo* pClears); void RPBindTargets(const RPBindTargetsInfo& targets); - void RPSyncPostLoadOpColorClear(); + void RPSyncPostLoadOpColorClear(uint32_t count, const RPLoadOpClearInfo* pClears); void BindTargets(); @@ -1930,7 +1932,8 @@ class CmdBuffer #else uint32_t reserved4 : 1; #endif - uint32_t reserved : 14; + uint32_t offsetMode : 1; + uint32_t reserved : 13; }; }; @@ -1979,6 +1982,7 @@ class CmdBuffer bool m_reverseThreadGroupState; #if VKI_RAY_TRACING Util::Vector m_scratchVidMemList; // Ray-tracing scratch memory + BvhBatchState* m_pBvhBatchState; uint64 m_maxCpsMemSize; // max ray sorting memory requested diff --git a/icd/api/include/vk_conv.h b/icd/api/include/vk_conv.h index d49c1d36..5a66cba6 100755 --- a/icd/api/include/vk_conv.h +++ b/icd/api/include/vk_conv.h @@ -4077,20 +4077,20 @@ const char* VkResultName(VkResult result); inline std::chrono::nanoseconds Uint64ToChronoNano(uint64_t nanoSeconds) { - const uint64_t maxNano = static_cast(std::chrono::nanoseconds::max().count()); - return std::chrono::nanoseconds { Util::Min(nanoSeconds, maxNano) }; + constexpr uint64_t MaxNanos = uint64_t(std::chrono::nanoseconds::max().count()); + return std::chrono::nanoseconds{ Util::Min(nanoSeconds, MaxNanos) }; } inline std::chrono::milliseconds Uint64ToChronoMilli(uint64_t milliSeconds) { - const uint64_t maxMilli = static_cast(std::chrono::milliseconds::max().count()); - return std::chrono::milliseconds { Util::Min(milliSeconds, maxMilli) }; + constexpr uint64_t MaxMillis = uint64_t(std::chrono::milliseconds::max().count()); + return std::chrono::milliseconds{ Util::Min(milliSeconds, MaxMillis) }; } inline std::chrono::seconds Uint64ToChronoSeconds(uint64_t seconds) { - const uint64_t maxSeconds = static_cast(std::chrono::seconds::max().count()); - return std::chrono::seconds { Util::Min(seconds, maxSeconds) }; + constexpr uint64_t MaxSeconds = uint64_t(std::chrono::seconds::max().count()); + return std::chrono::seconds{ Util::Min(seconds, MaxSeconds) }; } } // namespace vk diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h index 7024ca43..1ebee441 100644 --- a/icd/api/include/vk_device.h +++ b/icd/api/include/vk_device.h @@ -167,8 +167,9 @@ class Device uint32 primitivesGeneratedQuery : 1; uint32 reserved1 : 1; uint32 reserved2 : 1; + uint32 robustVertexBufferExtend : 1; - uint32 reserved : 12; + uint32 reserved : 11; }; uint32 u32All; @@ -959,7 +960,7 @@ class Device // This is from device create info, VkDevicePrivateDataCreateInfoEXT uint32 m_privateDataSlotRequestCount; - volatile uint64 m_nextPrivateDataSlot; + uint64 m_nextPrivateDataSlot; size_t m_privateDataSize; Util::RWLock m_privateDataRWLock; diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index c116faed..443ef9ed 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -424,6 +424,8 @@ class DeviceExtensions final : public Extensions EXT_PRIVATE_DATA, EXT_PROVOKING_VERTEX, EXT_QUEUE_FAMILY_FOREIGN, +#if VKI_RAY_TRACING +#endif EXT_ROBUSTNESS2, EXT_SAMPLER_FILTER_MINMAX, EXT_SAMPLE_LOCATIONS, diff --git a/icd/api/include/vk_image.h b/icd/api/include/vk_image.h 
index 1b6534c1..85100d90 100644 --- a/icd/api/include/vk_image.h +++ b/icd/api/include/vk_image.h @@ -299,7 +299,8 @@ class Image final : public NonDispatchable uint32_t sampleLocsCompatDepth : 1; // VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT uint32_t isProtected : 1; // VK_IMAGE_CREATE_PROTECTED_BIT uint32_t treatAsSrgb : 1; // True if this image is to be interpreted as SRGB where possible - uint32_t reserved : 15; + uint32_t reserved1 : 1; + uint32_t reserved : 14; }; uint32_t u32All; }; @@ -334,7 +335,8 @@ class Image final : public NonDispatchable uint32_t externallyShareable : 1; // True if the backing memory of this image may be shared externally. uint32_t externalD3DHandle : 1; // True if image is backed by a D3D11 image uint32_t externalPinnedHost : 1; // True if image backing memory is compatible with pinned sysmem. - uint32_t reserved : 28; + uint32_t reserved1 : 1; + uint32_t reserved : 27; }; uint32_t u32All; }; diff --git a/icd/api/include/vk_indirect_commands_layout.h b/icd/api/include/vk_indirect_commands_layout.h index 211c0d10..9c58ef80 100644 --- a/icd/api/include/vk_indirect_commands_layout.h +++ b/icd/api/include/vk_indirect_commands_layout.h @@ -84,18 +84,13 @@ class IndirectCommandsLayout final : public NonDispatchable struct { - Pal::QueueType palQueueType; - Pal::EngineType palEngineType; - VkShaderStageFlags validShaderStages; - uint32_t palImageLayoutFlag; - VkQueueFamilyProperties properties; + Pal::QueueType palQueueType; + Pal::EngineType palEngineType; + VkShaderStageFlags validShaderStages; + uint32_t palImageLayoutFlag; + VkQueueFamilyProperties properties; } m_queueFamilies[Queue::MaxQueueFamilies]; // List of indices for compute engines that aren't exclusive. diff --git a/icd/api/include/vk_queue.h b/icd/api/include/vk_queue.h index ac3f5215..87add35f 100644 --- a/icd/api/include/vk_queue.h +++ b/icd/api/include/vk_queue.h @@ -211,7 +211,8 @@ class Queue enum { - MaxQueueFamilies = Pal::QueueTypeCount, // Maximum number of queue families + MaxQueueFamilies = Pal::QueueTypeCount // Maximum number of queue families + , MaxQueuesPerFamily = 8, // Maximum number of queues per family MaxMultiQueues = 4, diff --git a/icd/api/include/vk_swapchain.h b/icd/api/include/vk_swapchain.h index f04ae65c..946d89dd 100644 --- a/icd/api/include/vk_swapchain.h +++ b/icd/api/include/vk_swapchain.h @@ -157,10 +157,11 @@ class SwapChain final : public NonDispatchable bool IsFullscreenOrEfsePresent() const; Pal::IGpuMemory* UpdatePresentInfo( - uint32_t deviceIdx, - uint32_t imageIndex, - Pal::PresentSwapChainInfo* pPresentInfo, - const Pal::FlipStatusFlags& flipFlags); + uint32_t deviceIdx, + uint32_t imageIndex, + Pal::PresentSwapChainInfo* pPresentInfo, + const Pal::FlipStatusFlags& flipFlags, + const Pal::PerSourceFrameMetadataControl& metadataFlags); bool BuildPostProcessingCommands( Pal::ICmdBuffer* pCmdBuf, @@ -187,6 +188,7 @@ class SwapChain final : public NonDispatchable const VkHdrMetadataEXT* pMetadata); void MarkAsDeprecated( + bool releaseResources, const VkAllocationCallbacks* pAllocator); uint32_t GetVidPnSourceId() const diff --git a/icd/api/pipeline_binary_cache.cpp b/icd/api/pipeline_binary_cache.cpp index b157a3b2..bd438d7e 100644 --- a/icd/api/pipeline_binary_cache.cpp +++ b/icd/api/pipeline_binary_cache.cpp @@ -696,9 +696,7 @@ Util::Result PipelineBinaryCache::InjectBinariesFromDirectory( if ((fileCount > 0u) && (result == Util::Result::Success)) { - char* pFileNameBuffer = nullptr; - Util::Span> fileNames; - Util::Span fileNameBuffer; + char* 
pFileNameBuffer = nullptr; // Allocate space for pFileNames and pFileNameBuffer Util::StringView* pFileNames = static_cast*>( @@ -720,11 +718,11 @@ Util::Result PipelineBinaryCache::InjectBinariesFromDirectory( } } + Util::Span> fileNames(pFileNames, fileCount); + Util::Span fileNameBuffer(pFileNameBuffer, fileNameBufferSize); + if (result == Util::Result::Success) { - fileNames = Util::Span>(pFileNames, fileCount); - fileNameBuffer = Util::Span(pFileNameBuffer, fileNameBufferSize); - // Populate fileNames and fileNameBuffer. result = Util::GetFileNamesInDir(settings.devModeElfReplacementDirectory, fileNames, fileNameBuffer); @@ -1086,11 +1084,8 @@ VkResult PipelineBinaryCache::InitArchiveLayers( { if (totalSize >= settings.pipelineCacheDefaultLocationLimitation) { - const uint64 sec = oldestTime.time_since_epoch().count() + - settings.thresholdOfCleanUpCache; - Util::RemoveFilesOfDirOlderThan( - pCachePath, Util::SecondsSinceEpoch { Uint64ToChronoSeconds(sec) }); + pCachePath, oldestTime + Uint64ToChronoSeconds(settings.thresholdOfCleanUpCache)); } } } diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp index 99083a4b..07c5e1a4 100644 --- a/icd/api/pipeline_compiler.cpp +++ b/icd/api/pipeline_compiler.cpp @@ -2233,7 +2233,7 @@ void PipelineCompiler::BuildPipelineShaderInfo( // but we want to force wavesize to wave32 internally depending on settings and shader stage. // We override any wavesize forced via shader opts also here. // NOTE: If the app uses subgroup size then wavesize forced here might get overriden later based on - // subgroupsize. To avoid this beahvior, DeprecateWave64Reporting must be set as well in settings. + // subgroupsize. To avoid this behavior, DeprecateWave64Reporting must be set as well in settings. pShaderInfoOut->options.waveSize = ShouldForceWave32(static_cast(stage), pDevice->GetRuntimeSettings().deprecateWave64) ? 32 : pShaderInfoOut->options.waveSize; @@ -2330,6 +2330,8 @@ static void BuildPipelineShadersInfo( const GraphicsPipelineShaderStageInfo* pShaderInfo, GraphicsPipelineBinaryCreateInfo* pCreateInfo) { + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + if (pCreateInfo->pipelineInfo.options.enableRelocatableShaderElf) { CompilerSolution::DisableNggCulling(&pCreateInfo->pipelineInfo.nggState); @@ -2353,7 +2355,7 @@ static void BuildPipelineShadersInfo( (pShaderInfo->stages[stage].codeHash.lower != 0) || (pShaderInfo->stages[stage].codeHash.upper != 0))) { - GraphicsLibraryType gplType = GetGraphicsLibraryType(static_cast(stage)); + GraphicsLibraryType gplType = GetGraphicsLibraryType(static_cast(stage)); PipelineCompiler::BuildPipelineShaderInfo(pDevice, &pShaderInfo->stages[stage], @@ -2362,6 +2364,26 @@ static void BuildPipelineShadersInfo( pCreateInfo->pPipelineProfileKey, &pCreateInfo->pipelineInfo.nggState ); + + if ((stage == ShaderStage::ShaderStageFragment) && + (ppShaderInfoOut[stage]->options.allowReZ == true) && settings.disableDepthOnlyReZ) + { + bool usesDepthOnlyAttachments = true; + + for (uint32_t i = 0; i < Pal::MaxColorTargets; ++i) + { + if (pCreateInfo->pipelineInfo.cbState.target[i].channelWriteMask != 0) + { + usesDepthOnlyAttachments = false; + break; + } + } + + if (usesDepthOnlyAttachments) + { + ppShaderInfoOut[stage]->options.allowReZ = false; + } + } } } @@ -2375,8 +2397,8 @@ static void BuildPipelineShadersInfo( // details can be found in PipelineCompiler::ConvertGraphicsPipelineInfo(). 
// PS: For standard gfx pipeline, GraphicsPipelineBuildInfo::enableUberFetchShader is never set as TRUE with default // panel setting because VII and PRS are always available at the same time. - if (pDevice->GetRuntimeSettings().enableUberFetchShader || - pDevice->GetRuntimeSettings().enableEarlyCompile || + if (settings.enableUberFetchShader || + settings.enableEarlyCompile || (((pCreateInfo->libFlags & VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) == 0) && ((pCreateInfo->libFlags & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) != 0)) || (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::VertexInput) == true) @@ -2939,6 +2961,9 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( &pCreateInfo->pipelineInfo.options ); + pCreateInfo->pipelineInfo.useSoftwareVertexBufferDescriptors = + pDevice->GetEnabledFeatures().robustVertexBufferExtend; + } uint64_t dynamicStateFlags = 0; @@ -3787,7 +3812,7 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( static_assert(RaytracingContinuations == static_cast(Vkgc::LlpcRaytracingMode::Continuations)); pCreateInfo->pipelineInfo.mode = static_cast(settings.llpcRaytracingMode); - static_assert(CpsFlagStackInGlobalMem == Vkgc::CpsFlagStackInGlobalMem); + static_assert(CpsFlagStackInGlobalMem == static_cast(Vkgc::CpsFlagStackInGlobalMem)); pCreateInfo->pipelineInfo.cpsFlags = settings.cpsFlags; pCreateInfo->pipelineInfo.isReplay = isReplay; @@ -3828,6 +3853,9 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( tempBufferSize += sizeof(BinaryData) * pIn->pLibraryInfo->libraryCount; } + const auto& gpurtOptions = pDevice->RayTrace()->GetGpurtOptions(); + tempBufferSize += gpurtOptions.size() * sizeof(Vkgc::GpurtOption); + // We can't have a pipeline with 0 shader stages VK_ASSERT(tempBufferSize > 0); @@ -4023,6 +4051,17 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( pSummaries[i] = summary; } } + + if (gpurtOptions.size() > 0) + { + Vkgc::GpurtOption* pGpurtOptions = reinterpret_cast( + VoidPtrInc(pCreateInfo->pTempBuffer, tempBufferOffset)); + size_t gpurtOptionsSize = sizeof(Vkgc::GpurtOption) * gpurtOptions.size(); + tempBufferOffset += gpurtOptionsSize; + pCreateInfo->pipelineInfo.pGpurtOptions = pGpurtOptions; + pCreateInfo->pipelineInfo.gpurtOptionCount = gpurtOptions.size(); + memcpy(pGpurtOptions, gpurtOptions.Data(), gpurtOptionsSize); + } } } diff --git a/icd/api/raytrace/ray_tracing_device.cpp b/icd/api/raytrace/ray_tracing_device.cpp index 17b93902..be8958be 100644 --- a/icd/api/raytrace/ray_tracing_device.cpp +++ b/icd/api/raytrace/ray_tracing_device.cpp @@ -34,7 +34,9 @@ #include "sqtt/sqtt_layer.h" #include "sqtt/sqtt_rgp_annotations.h" #include "palAutoBuffer.h" +#include "palVectorImpl.h" #include "gpurt/gpurtLib.h" +#include "g_gpurtOptions.h" #if ICD_GPUOPEN_DEVMODE_BUILD #include "devmode/devmode_mgr.h" @@ -48,7 +50,9 @@ RayTracingDevice::RayTracingDevice( Device* pDevice) : m_pDevice(pDevice), + m_gpurtOptions(pDevice->VkInstance()->Allocator()), m_cmdContext(), + m_pBvhBatchLayer(nullptr), m_accelStructTrackerResources() { @@ -73,6 +77,7 @@ VkResult RayTracingDevice::Init() } CreateGpuRtDeviceSettings(&m_gpurtDeviceSettings); + CollectGpurtOptions(&m_gpurtOptions); for (uint32_t deviceIdx = 0; (result == VK_SUCCESS) && (deviceIdx < m_pDevice->NumPalDevices()); ++deviceIdx) { @@ -99,7 +104,6 @@ VkResult RayTracingDevice::Init() initInfo.pAccelStructTracker = GetAccelStructTracker(deviceIdx); initInfo.accelStructTrackerGpuAddr = 
GetAccelStructTrackerGpuVa(deviceIdx); - initInfo.deviceSettings.gpuDebugFlags = m_pDevice->GetRuntimeSettings().rtGpuDebugFlags; initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::None; switch (m_pDevice->GetRuntimeSettings().emulatedRtIpLevel) { @@ -136,17 +140,24 @@ VkResult RayTracingDevice::Init() callbacks.pfnFreeGpuMem = &RayTracingDevice::ClientFreeGpuMem; callbacks.pfnClientGetTemporaryGpuMemory = &RayTracingDevice::ClientGetTemporaryGpuMemory; - Pal::Result palResult = GpuRt::CreateDevice(initInfo, callbacks, pMemory, &m_pGpuRtDevice[deviceIdx]); + result = PalToVkResult(GpuRt::CreateDevice(initInfo, callbacks, pMemory, &m_pGpuRtDevice[deviceIdx])); - if (palResult != Pal::Result::Success) + if (result == VK_SUCCESS) { - m_pDevice->VkInstance()->FreeMem(pMemory); + result = BvhBatchLayer::CreateLayer(m_pDevice, &m_pBvhBatchLayer); + } + if (result != VK_SUCCESS) + { VK_NEVER_CALLED(); - result = VK_ERROR_INITIALIZATION_FAILED; - } + m_pDevice->VkInstance()->FreeMem(pMemory); + if (m_pBvhBatchLayer != nullptr) + { + m_pBvhBatchLayer->DestroyLayer(); + } + } } } @@ -249,10 +260,47 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( m_profileRayFlags = TraceRayProfileFlagsToRayFlag(settings); m_profileMaxIterations = TraceRayProfileMaxIterationsToMaxIterations(settings); - pDeviceSettings->gpuDebugFlags = settings.gpuRtGpuDebugFlags; + pDeviceSettings->gpuDebugFlags = settings.rtGpuDebugFlags; pDeviceSettings->enableRemapScratchBuffer = settings.enableRemapScratchBuffer; pDeviceSettings->enableEarlyPairCompression = settings.enableEarlyPairCompression; pDeviceSettings->trianglePairingSearchRadius = settings.trianglePairingSearchRadius; + + pDeviceSettings->enableMergedEncodeBuild = settings.enableMergedEncodeBuild; + pDeviceSettings->enableMergedEncodeUpdate = settings.enableMergedEncodeUpdate; +} + +// ===================================================================================================================== +void RayTracingDevice::CollectGpurtOptions( + GpurtOptions* const pGpurtOptions + ) const +{ + const uint32_t optionCount = sizeof(GpuRt::OptionDefaults) / sizeof(GpuRt::OptionDefaults[0]); + + // Set up option defaults so that it won't break when a newly added option has non-zero default. + Util::HashMap optionMap(optionCount, pGpurtOptions->GetAllocator()); + optionMap.Init(); + for (uint32_t i = 0; i < optionCount; i++) + { + // We should not have duplicated option defaults. 
+ VK_ASSERT(optionMap.FindKey(GpuRt::OptionDefaults[i].nameHash) == nullptr); + optionMap.Insert(GpuRt::OptionDefaults[i].nameHash, GpuRt::OptionDefaults[i].value); + } + + auto& settings = m_pDevice->GetRuntimeSettings(); + + uint32_t threadTraceEnabled = 0; + if (settings.rtEmitRayTracingShaderDataToken || + m_pDevice->VkInstance()->PalPlatform()->IsRaytracingShaderDataTokenRequested()) + { + threadTraceEnabled = 1; + } + *optionMap.FindKey(GpuRt::ThreadTraceEnabledOptionNameHash) = threadTraceEnabled; + + pGpurtOptions->Clear(); + for (auto it = optionMap.Begin(); it.Get() != nullptr; it.Next()) + { + pGpurtOptions->PushBack({ it.Get()->key, it.Get()->value }); + } } // ===================================================================================================================== @@ -297,6 +345,11 @@ void RayTracingDevice::Destroy() m_pDevice->VkInstance()->FreeMem(m_accelStructTrackerResources[0].pMem); } + if (m_pBvhBatchLayer != nullptr) + { + m_pBvhBatchLayer->DestroyLayer(); + } + Util::Destructor(this); m_pDevice->VkInstance()->FreeMem(this); @@ -724,11 +777,10 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( void** ppResultMemory) ///< [out] (Optional) Result PAL pipeline memory, ///< if different from obj { - uint64_t spvPassMask = - static_cast(initInfo.pClientUserData)->GetRuntimeSettings().rtInternalPipelineSpvPassMask; - vk::Device* pDevice = static_cast(initInfo.pClientUserData); + vk::Device* pDevice = static_cast(initInfo.pClientUserData); const auto& settings = pDevice->GetRuntimeSettings(); + uint64_t spvPassMask = settings.rtInternalPipelineSpvPassMask; uint64_t shaderTypeMask = 1ull << static_cast(buildInfo.shaderType); bool useSpvPass = (shaderTypeMask & spvPassMask); diff --git a/icd/api/raytrace/ray_tracing_device.h b/icd/api/raytrace/ray_tracing_device.h index d9a86636..8829dae4 100644 --- a/icd/api/raytrace/ray_tracing_device.h +++ b/icd/api/raytrace/ray_tracing_device.h @@ -31,6 +31,9 @@ #include "khronos/vulkan.h" #include "vk_defines.h" +#include "appopt/bvh_batch_layer.h" + +#include "vkgcDefs.h" namespace vk { @@ -39,6 +42,7 @@ class Device; class Queue; class InternalMemory; class CmdBuffer; +class PalAllocator; // Device-level structure for managing state related to ray-tracing. Instantiated as part of a VkDevice. 
class RayTracingDevice @@ -61,6 +65,8 @@ class RayTracingDevice uint32_t srd[BufferViewDwords]; }; + typedef Util::Vector GpurtOptions; + RayTracingDevice(Device* pDevice); ~RayTracingDevice(); @@ -70,6 +76,7 @@ class RayTracingDevice void CreateGpuRtDeviceSettings(GpuRt::DeviceSettings* pDeviceSettings); GpuRt::IDevice* GpuRt(uint32_t deviceIdx) { return m_pGpuRtDevice[deviceIdx]; } const GpuRt::DeviceSettings& DeviceSettings() const { return m_gpurtDeviceSettings; } + const GpurtOptions& GetGpurtOptions() const { return m_gpurtOptions; } Pal::Result InitCmdContext(uint32_t deviceIdx); CmdContext* GetCmdContext(uint32_t deviceIdx) { return &m_cmdContext[deviceIdx]; } @@ -84,6 +91,8 @@ class RayTracingDevice uint64_t GetAccelerationStructureUUID(const Pal::DeviceProperties& palProps); + BvhBatchLayer* GetBvhBatchLayer() { return m_pBvhBatchLayer; } + uint32_t GetProfileRayFlags() const { return m_profileRayFlags; } uint32_t GetProfileMaxIterations() const { return m_profileMaxIterations; } @@ -122,6 +131,7 @@ class RayTracingDevice GpuRt::IDevice* m_pGpuRtDevice[MaxPalDevices]; GpuRt::DeviceSettings m_gpurtDeviceSettings; + GpurtOptions m_gpurtOptions; uint32_t m_profileRayFlags; // Ray flag override for profiling uint32_t m_profileMaxIterations; // Max traversal iterations @@ -193,6 +203,10 @@ class RayTracingDevice const VkStridedDeviceAddressRegionKHR* pHitSbt, GpuRt::RtDispatchInfo* pDispatchInfo) const; + void CollectGpurtOptions(GpurtOptions* const pGpurtOptions) const; + + BvhBatchLayer* m_pBvhBatchLayer; + AccelStructTrackerResources m_accelStructTrackerResources[MaxPalDevices]; }; diff --git a/icd/api/raytrace/vk_acceleration_structure.cpp b/icd/api/raytrace/vk_acceleration_structure.cpp index 7930c3b1..3295d5c7 100644 --- a/icd/api/raytrace/vk_acceleration_structure.cpp +++ b/icd/api/raytrace/vk_acceleration_structure.cpp @@ -174,12 +174,13 @@ VkResult AccelerationStructure::ConvertBuildInputsKHR( uint32_t deviceIndex, const VkAccelerationStructureBuildGeometryInfoKHR& info, const VkAccelerationStructureBuildRangeInfoKHR* pBuildRangeInfos, + const uint32_t* pMaxPrimitiveCounts, GeometryConvertHelper* pHelper, GpuRt::AccelStructBuildInputs* pInputs) { VkResult result = VK_SUCCESS; - pHelper->pMaxPrimitiveCounts = nullptr; + pHelper->pMaxPrimitiveCounts = pMaxPrimitiveCounts; pHelper->pBuildRangeInfos = pBuildRangeInfos; pInputs->type = ConvertAccelerationStructureType(info.type); pInputs->flags = ConvertAccelerationStructureFlags(info.mode, info.flags); @@ -202,7 +203,9 @@ VkResult AccelerationStructure::ConvertBuildInputsKHR( if (pInstanceGeom->geometryType == VK_GEOMETRY_TYPE_INSTANCES_KHR) { - pInputs->inputElemCount = (pBuildRangeInfos != nullptr) ? pBuildRangeInfos->primitiveCount : 1; + pInputs->inputElemCount = (pBuildRangeInfos != nullptr) ? + pBuildRangeInfos->primitiveCount : + pMaxPrimitiveCounts[0]; pInputs->inputElemLayout = pInstanceGeom->geometry.instances.arrayOfPointers ? 
GpuRt::InputElementLayout::ArrayOfPointers : GpuRt::InputElementLayout::Array; diff --git a/icd/api/raytrace/vk_acceleration_structure.h b/icd/api/raytrace/vk_acceleration_structure.h index 93702db0..a245a7e6 100644 --- a/icd/api/raytrace/vk_acceleration_structure.h +++ b/icd/api/raytrace/vk_acceleration_structure.h @@ -75,6 +75,7 @@ class AccelerationStructure final : public NonDispatchablepipePoints[pBarrier->pipePointCount] = Pal::HwPipeBottom; pBarrier->pipePointCount++; - pBarrier->srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + pBarrier->implicitSrcCacheMask |= Pal::CoherColorTarget | Pal::CoherDepthStencilTarget; } } // ===================================================================================================================== static void ConvertImplicitSyncs( - RPBarrierInfo* pBarrier, + RPBarrierInfo* pBarrier, const RuntimeSettings& settings) { pBarrier->implicitSrcCacheMask = 0; @@ -1015,17 +1014,15 @@ static void ConvertImplicitSyncs( pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_BLIT_BIT_KHR; pBarrier->implicitSrcCacheMask |= Pal::CoherResolveSrc; + pBarrier->implicitDstCacheMask |= Pal::CoherResolveDst; } - if (pBarrier->flags.implicitExternalOutgoing && - (pBarrier->pipePointCount < (MaxHwPipePoints - 1)) && - settings.implicitExternalSynchronization) + if (pBarrier->flags.implicitExternalOutgoing && settings.implicitExternalSynchronization) { pBarrier->srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_BLIT_BIT_KHR; - pBarrier->srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + pBarrier->implicitSrcCacheMask |= Pal::CoherColorTarget | Pal::CoherDepthStencilTarget; } } @@ -1055,59 +1052,77 @@ void RenderPassBuilder::PostProcessSyncPoint( pSyncPoint->barrier.flags.needsGlobalTransition = 1; } - // The barrier is active if it does any waiting or global cache synchronization or attachment transitions - if ((pSyncPoint->barrier.pipePointCount > 0) || - (pSyncPoint->barrier.flags.needsGlobalTransition) || - (pSyncPoint->transitions.NumElements() > 0)) - { - pSyncPoint->flags.active = 1; + bool hasChangingLayout = false; + bool isTransitioningOutOfUndefined = false; - if (pSyncPoint->barrier.dstStageMask == 0) + if (pSyncPoint->transitions.NumElements() > 0) + { + for (auto it = pSyncPoint->transitions.Begin(); it.Get() != nullptr; it.Next()) { - if (pSyncPoint->flags.top && (pSyncPoint->transitions.NumElements() > 0)) + RPTransitionInfo* info = it.Get(); + + if (info->prevLayout.layout == VK_IMAGE_LAYOUT_UNDEFINED) { - // If a transition occurs when entering a subpass (top == 1), it must be synced before the - // attachment is accessed. If we're leaving the subpass, chances are there's another barrier down - // the line that will sync the image correctly. - pSyncPoint->barrier.dstStageMask = AllShaderStages; + isTransitioningOutOfUndefined = true; } - else + + if ((info->prevLayout.layout != info->nextLayout.layout) || + (info->prevStencilLayout.layout != info->nextStencilLayout.layout)) { - // BOTTOM_OF_PIPE in dst mask is effectively NONE. - pSyncPoint->barrier.dstStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; + hasChangingLayout = true; } - } - // If srcSubpass for this barrier is VK_SUBPASS_EXTERNAL, srcStageMask is TOP_OF_PIPE and srcAccessMask is - // 0 then this syncTop barrier might be doing a metadata Init with a layout transition out of undefined - // layout. 
Set a flag here that can be tested later to set the srcStageMask correctly. - const bool needsFixForMetaDataInit = - ((pSyncPoint->flags.top) && - (pSyncPoint->barrier.flags.explicitExternalIncoming) && - (pSyncPoint->barrier.srcStageMask == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR) && - (pSyncPoint->barrier.srcAccessMask == 0)); + if (hasChangingLayout || isTransitioningOutOfUndefined) + { + break; + } + } + } - if ((pSyncPoint->barrier.srcStageMask == 0) || needsFixForMetaDataInit) + // If srcSubpass for this barrier is VK_SUBPASS_EXTERNAL, srcStageMask is TOP_OF_PIPE and srcAccessMask is + // 0 then this syncTop barrier might be doing a metadata Init with a layout transition out of undefined + // layout. Set a flag here that can be tested later to set the srcStageMask correctly. + const bool needsFixForMetaDataInit = + ((pSyncPoint->flags.top) && + (pSyncPoint->barrier.flags.explicitExternalIncoming) && + (pSyncPoint->barrier.srcStageMask == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR) && + (pSyncPoint->barrier.srcAccessMask == 0)); + + // Set the dstStageMask to non-zero only if layout is changing. If the layout is not changing and if + // dstStageMask is 0, then it's quite likely that this is an empty barrier that can be skipped. + if ((pSyncPoint->barrier.dstStageMask == 0) && hasChangingLayout) + { + if (pSyncPoint->flags.top) { - if (pSyncPoint->transitions.NumElements() > 0) - { - // RPBarrierInfo consists of one set of src/dst stage masks which currently applies to each - // transition in RPSyncPoint(). PAL now supports specifying src/dst stage masks for each individual - // image transition. Since with this change we will loop over each transition to check for - // undefined 'prev' layout, there might be some cases where we add unnecessary stalls for at least - // some transitions. - for (auto it = pSyncPoint->transitions.Begin(); it.Get() != nullptr; it.Next()) - { - RPTransitionInfo* info = it.Get(); + // If a transition occurs when entering a subpass (top == 1), it must be synced before the + // attachment is accessed. If we're leaving the subpass, chances are there's another barrier down + // the line that will sync the image correctly. + pSyncPoint->barrier.dstStageMask = AllShaderStages; + } + else + { + // BOTTOM_OF_PIPE in dst mask is effectively NONE. + pSyncPoint->barrier.dstStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; + } + } - if (info->prevLayout.layout == VK_IMAGE_LAYOUT_UNDEFINED) - { - pSyncPoint->barrier.srcStageMask |= pSyncPoint->barrier.dstStageMask; - } - } - } + if ((pSyncPoint->barrier.srcStageMask == 0) || needsFixForMetaDataInit) + { + if (isTransitioningOutOfUndefined && hasChangingLayout) + { + pSyncPoint->barrier.srcStageMask |= pSyncPoint->barrier.dstStageMask; } } + + const bool stageMasksNotEmpty = (((pSyncPoint->barrier.srcStageMask == 0) && + (pSyncPoint->barrier.dstStageMask == 0)) == false); + + // The barrier is active if it does any waiting or global cache synchronization or attachment transitions + if ((pSyncPoint->barrier.flags.needsGlobalTransition) || + ((pSyncPoint->transitions.NumElements() > 0) && stageMasksNotEmpty)) + { + pSyncPoint->flags.active = 1; + } } else { @@ -1300,10 +1315,15 @@ Pal::Result RenderPassBuilder::TrackAttachmentUsage( WaitForResolves(pSync); } - // Detect if an automatic layout transition is needed and insert one to the given sync point if so. Note that - // these happen before load ops are triggered (below). 
- if ((pAttachment->prevReferenceLayout != layout) || - ((pStencilLayout != nullptr) && (pAttachment->prevReferenceStencilLayout != *pStencilLayout))) + // We want to include all transitions if acquire-release barrier interface is used. If not, then detect if an + // automatic layout transition is needed and insert one to the given sync point if so. Note that these happen + // before load ops are triggered (below). + const bool shouldIncludeTransition = + (m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetRuntimeSettings().useAcquireReleaseInterface) ? true : + ((pAttachment->prevReferenceLayout != layout) || + ((pStencilLayout != nullptr) && (pAttachment->prevReferenceStencilLayout != *pStencilLayout))); + + if (shouldIncludeTransition) { RPTransitionInfo transition = {}; diff --git a/icd/api/sqtt/sqtt_rgp_annotations.h b/icd/api/sqtt/sqtt_rgp_annotations.h index 7773b752..dcc1d146 100644 --- a/icd/api/sqtt/sqtt_rgp_annotations.h +++ b/icd/api/sqtt/sqtt_rgp_annotations.h @@ -40,7 +40,7 @@ constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1; // RGP SQTT Instrumentation Specification version for Vulkan-specific tables -constexpr uint32_t RgpSqttInstrumentationApiVersion = 0; +constexpr uint32_t RgpSqttInstrumentationApiVersion = 4; #if defined(BIGENDIAN_CPU) || defined(__BIG_ENDIAN__) static_assert(false, "The bitfields in this header match the RGP format specification with the assumption that " diff --git a/icd/api/strings/extensions.txt b/icd/api/strings/extensions.txt index 86ba3001..458a16c4 100644 --- a/icd/api/strings/extensions.txt +++ b/icd/api/strings/extensions.txt @@ -218,3 +218,5 @@ VK_KHR_dynamic_rendering_local_read VK_KHR_vertex_attribute_divisor VK_EXT_frame_boundary VK_EXT_image_compression_control +#if VKI_RAY_TRACING +#endif diff --git a/icd/api/vk_buffer_view.cpp b/icd/api/vk_buffer_view.cpp index 858ae2fc..f1196a96 100644 --- a/icd/api/vk_buffer_view.cpp +++ b/icd/api/vk_buffer_view.cpp @@ -112,9 +112,9 @@ void BufferView::BuildSrd( Pal::BufferViewInfo info = {}; const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - info.swizzledFormat = VkToPalFormat(format, settings); - info.stride = Pal::Formats::BytesPerPixel(info.swizzledFormat.format); - info.range = bufferRange; + info.swizzledFormat = VkToPalFormat(format, settings); + info.stride = Pal::Formats::BytesPerPixel(info.swizzledFormat.format); + info.range = bufferRange; // Bypass Mall read/write if no alloc policy is set for SRDs if (Util::TestAnyFlagSet(settings.mallNoAllocResourcePolicy, MallNoAllocBufferViewSrds)) diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index 8170a40f..d3885915 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -600,6 +600,7 @@ CmdBuffer::CmdBuffer( m_reverseThreadGroupState(false) #if VKI_RAY_TRACING , m_scratchVidMemList(pDevice->VkInstance()->Allocator()) + , m_pBvhBatchState() , m_maxCpsMemSize(0) , m_patchCpsList { @@ -630,6 +631,7 @@ CmdBuffer::CmdBuffer( m_flags.disableResetReleaseResources = settings.disableResetReleaseResources; m_flags.subpassLoadOpClearsBoundAttachments = settings.subpassLoadOpClearsBoundAttachments; m_flags.preBindDefaultState = settings.preBindDefaultState; + m_flags.offsetMode = pDevice->GetEnabledFeatures().robustVertexBufferExtend; Pal::DeviceProperties info; m_pDevice->PalDevice(DefaultDeviceIndex)->GetProperties(&info); @@ -637,14 +639,14 @@ CmdBuffer::CmdBuffer( m_flags.useBackupBuffer = false; memset(m_pBackupPalCmdBuffers, 0, sizeof(Pal::ICmdBuffer*) * MaxPalDevices); - // If 
supportReleaseAcquireInterface is true, the ASIC provides new barrier interface CmdReleaseThenAcquire() - // designed for Acquire/Release-based driver. This flag is currently enabled for gfx9 and above. - // If supportSplitReleaseAcquire is true, the ASIC provides split CmdRelease() and CmdAcquire() to express barrier, - // and CmdReleaseThenAcquire() is still valid. This flag is currently enabled for gfx10 and above. - m_flags.useReleaseAcquire = info.gfxipProperties.flags.supportReleaseAcquireInterface && - settings.useAcquireReleaseInterface; - m_flags.useSplitReleaseAcquire = m_flags.useReleaseAcquire && - info.gfxipProperties.flags.supportSplitReleaseAcquire; + // If supportReleaseAcquireInterface is true, the ASIC provides new barrier interface CmdReleaseThenAcquire() + // designed for Acquire/Release-based driver. This flag is currently enabled for gfx9 and above. + // If supportSplitReleaseAcquire is true, the ASIC provides split CmdRelease() and CmdAcquire() to express barrier, + // and CmdReleaseThenAcquire() is still valid. This flag is currently enabled for gfx10 and above. + m_flags.useReleaseAcquire = info.gfxipProperties.flags.supportReleaseAcquireInterface && + settings.useAcquireReleaseInterface; + m_flags.useSplitReleaseAcquire = m_flags.useReleaseAcquire && + info.gfxipProperties.flags.supportSplitReleaseAcquire; } // ===================================================================================================================== @@ -1827,9 +1829,6 @@ void CmdBuffer::ResetState() m_flags.hasConditionalRendering = false; -#if VKI_RAY_TRACING -#endif - m_debugPrintf.Reset(m_pDevice); if (m_allGpuState.pDescBufBinding != nullptr) { @@ -1868,6 +1867,14 @@ VkResult CmdBuffer::Reset(VkCommandBufferResetFlags flags) #if VKI_RAY_TRACING FreeRayTracingScratchVidMemory(); FreePatchCpsList(); + + if (m_pBvhBatchState != nullptr) + { + // Called here (outside of the BvhBatchLayer) because Reset can be triggered + // either directly on the command buffer or across the whole command pool. + m_pBvhBatchState->Log("Resetting via command buffer reset.\n"); + m_pBvhBatchState->Reset(); + } #endif result = PalToVkResult(PalCmdBufferReset(releaseResources)); @@ -2353,6 +2360,14 @@ VkResult CmdBuffer::Destroy(void) FreeRayTracingScratchVidMemory(); FreePatchCpsList(); + if (m_pBvhBatchState != nullptr) + { + // Called here (outside of the BvhBatchLayer) because Destroy can be triggered + // either directly on the command buffer or across the whole command pool. + m_pBvhBatchState->Log("Resetting via command buffer destroy.\n"); + m_pBvhBatchState->Reset(); + } + #endif m_debugPrintf.Reset(m_pDevice); @@ -2982,7 +2997,7 @@ void CmdBuffer::BindVertexBuffersUpdateBindingRange( { pBinding->range = pSizes[inputIdx]; - if (offset != 0) + if ((offset != 0) && (m_flags.offsetMode == false)) { padVertexBuffers = true; } @@ -3052,7 +3067,30 @@ void CmdBuffer::BindVertexBuffers( pSizes, pStrides); - PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers(firstBinding, lowBindingCount, pBinding); + Pal::VertexBufferViews bufferViews = + { + .firstBuffer = firstBinding, + .bufferCount = lowBindingCount, + .offsetMode = (m_flags.offsetMode == 1) ?
true : false + }; + Pal::VertexBufferView vertexViews[Pal::MaxVertexBuffers] = {}; + + if (m_flags.offsetMode) + { + for (uint32_t idx = 0; idx < lowBindingCount; idx++) + { + vertexViews[idx].gpuva = pBinding[idx].gpuAddr; + vertexViews[idx].sizeInBytes = pBinding[idx].range; + vertexViews[idx].strideInBytes = pBinding[idx].stride; + } + bufferViews.pVertexBufferViews = vertexViews; + } + else + { + bufferViews.pBufferViewInfos = pBinding; + } + + PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers(bufferViews); } } @@ -3115,9 +3153,31 @@ void CmdBuffer::UpdateVertexBufferStrides( if (firstChanged <= lastChanged) { - PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers( - firstChanged, (lastChanged - firstChanged) + 1, - &PerGpuState(deviceIdx)->vbBindings[firstChanged]); + Pal::VertexBufferViews bufferViews = + { + .firstBuffer = firstChanged, + .bufferCount = (lastChanged - firstChanged) + 1, + .offsetMode = (m_flags.offsetMode == 1) ? true : false + }; + Pal::VertexBufferView vertexViews[Pal::MaxVertexBuffers] = {}; + auto pBinding = &PerGpuState(deviceIdx)->vbBindings[firstChanged]; + + if (m_flags.offsetMode) + { + for (uint32_t idx = 0; idx < (lastChanged - firstChanged + 1); idx++) + { + vertexViews[idx].gpuva = pBinding[idx].gpuAddr; + vertexViews[idx].sizeInBytes = pBinding[idx].range; + vertexViews[idx].strideInBytes = pBinding[idx].stride; + } + bufferViews.pVertexBufferViews = vertexViews; + } + else + { + bufferViews.pBufferViewInfos = pBinding; + } + + PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers(bufferViews); } } while (deviceGroup.IterateNext()); @@ -3480,16 +3540,69 @@ void CmdBuffer::ExecuteIndirect( const VkGeneratedCommandsInfoNV* pInfo) { IndirectCommandsLayout* pLayout = IndirectCommandsLayout::ObjectFromHandle(pInfo->indirectCommandsLayout); + IndirectCommandsInfo info = pLayout->GetIndirectCommandsInfo(); + + uint64_t barrierCmd = 0; + + if ((info.actionType == IndirectCommandsActionType::Draw) || + (info.actionType == IndirectCommandsActionType::DrawIndexed)) + { + const bool indexed = (info.actionType == IndirectCommandsActionType::DrawIndexed); + barrierCmd = (indexed ? 
DbgBarrierDrawIndexed : DbgBarrierDrawNonIndexed) | DbgBarrierDrawIndirect; + + DbgBarrierPreCmd(barrierCmd); + + ValidateGraphicsStates(); + } + else if (info.actionType == IndirectCommandsActionType::Dispatch) + { + barrierCmd = DbgBarrierDispatchIndirect; + + DbgBarrierPreCmd(barrierCmd); + + if (PalPipelineBindingOwnedBy(Pal::PipelineBindPoint::Compute, PipelineBindCompute) == false) + { + RebindPipeline(); + } + } + else if (info.actionType == IndirectCommandsActionType::MeshTask) + { + barrierCmd = DbgBarrierDrawMeshTasksIndirect; + + DbgBarrierPreCmd(barrierCmd); + + ValidateGraphicsStates(); + } + else + { + VK_NEVER_CALLED(); + } + + VK_ASSERT(pInfo->streamCount == 1); + + const Buffer* pArgumentBuffer = Buffer::ObjectFromHandle(pInfo->pStreams[0].buffer); + const uint64_t argumentOffset = pInfo->pStreams[0].offset; + + const Buffer* pCountBuffer = Buffer::ObjectFromHandle(pInfo->sequencesCountBuffer); + const uint64_t countOffset = pInfo->sequencesCountOffset; + + const uint32_t maxCount = pInfo->sequencesCount; utils::IterateMask deviceGroup(m_curDeviceMask); do { const uint32_t deviceIdx = deviceGroup.Index(); - pLayout->BindPreprocessBuffer(pInfo->preprocessBuffer, - pInfo->preprocessOffset, - deviceIdx); + + PalCmdBuffer(deviceIdx)->CmdExecuteIndirectCmds( + *pLayout->PalIndirectCmdGenerator(deviceIdx), + pArgumentBuffer->GpuVirtAddr(deviceIdx) + argumentOffset, + maxCount, + (pCountBuffer == nullptr) ? 0 : pCountBuffer->GpuVirtAddr(deviceIdx) + countOffset); + } while (deviceGroup.IterateNext()); + + DbgBarrierPostCmd(barrierCmd); } // ===================================================================================================================== @@ -7733,33 +7846,99 @@ void CmdBuffer::RPEndSubpass() // ===================================================================================================================== // Handles post-clear synchronization for load-op color clears when not auto-syncing. -void CmdBuffer::RPSyncPostLoadOpColorClear() +void CmdBuffer::RPSyncPostLoadOpColorClear( + uint32_t colorClearCount, + const RPLoadOpClearInfo* pClears) { - static const Pal::BarrierTransition transition = + if (m_flags.useReleaseAcquire) { - Pal::CoherClear, - Pal::CoherColorTarget, - {} - }; + VK_ASSERT(colorClearCount > 0); - static const Pal::HwPipePoint PipePoint = Pal::HwPipePostBlt; - static const Pal::BarrierInfo Barrier = - { - Pal::HwPipePreRasterization, // waitPoint - 1, // pipePointWaitCount - &PipePoint, // pPipePoints - 0, // gpuEventWaitCount - nullptr, // ppGpuEvents - 0, // rangeCheckedTargetWaitCount - nullptr, // ppTargets - 1, // transitionCount - &transition, // pTransitions - 0, // globalSrcCacheMask - 0, // globalDstCacheMask - RgpBarrierExternalRenderPassSync // reason - }; + VirtualStackFrame virtStack(m_pStackAllocator); + + Pal::AcquireReleaseInfo barrierInfo = {}; + + barrierInfo.reason = RgpBarrierExternalRenderPassSync; + + Pal::ImgBarrier* pPalTransitions = (colorClearCount != 0) ? + virtStack.AllocArray(colorClearCount) : + nullptr; + const Image** ppImages = (colorClearCount != 0) ? 
+ virtStack.AllocArray(colorClearCount) : + nullptr; + + for (uint32_t i = 0; i < colorClearCount; ++i) + { + const RPLoadOpClearInfo& clear = pClears[i]; + + const Framebuffer::Attachment& attachment = m_allGpuState.pFramebuffer->GetAttachment(clear.attachment); + + VK_ASSERT(pPalTransitions != nullptr); + VK_ASSERT(ppImages != nullptr); + + for (uint32_t sr = 0; sr < attachment.subresRangeCount; ++sr) + { + const uint32_t plane = attachment.subresRange[sr].startSubres.plane; + + const Pal::ImageLayout oldLayout = RPGetAttachmentLayout(clear.attachment, plane); + + const Pal::ImageLayout newLayout = { VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, 1 }; + + ppImages[barrierInfo.imageBarrierCount] = attachment.pImage; + + pPalTransitions[barrierInfo.imageBarrierCount].srcStageMask = Pal::PipelineStageBlt; + pPalTransitions[barrierInfo.imageBarrierCount].dstStageMask = Pal::PipelineStageEarlyDsTarget; + pPalTransitions[barrierInfo.imageBarrierCount].srcAccessMask = Pal::CoherClear; + pPalTransitions[barrierInfo.imageBarrierCount].dstAccessMask = Pal::CoherColorTarget; + // We set the pImage to nullptr by default here. But, this will be computed correctly later for + // each device including DefaultDeviceIndex based on the deviceId. + pPalTransitions[barrierInfo.imageBarrierCount].pImage = nullptr; + pPalTransitions[barrierInfo.imageBarrierCount].oldLayout = oldLayout; + pPalTransitions[barrierInfo.imageBarrierCount].newLayout = newLayout; + pPalTransitions[barrierInfo.imageBarrierCount].subresRange = attachment.subresRange[sr]; - PalCmdBarrier(Barrier, GetRpDeviceMask()); + barrierInfo.imageBarrierCount++; + } + } + + barrierInfo.pImageBarriers = pPalTransitions; + + PalCmdReleaseThenAcquire( + &barrierInfo, + nullptr, + nullptr, + pPalTransitions, + ppImages, + GetRpDeviceMask()); + } + else + { + static const Pal::BarrierTransition transition = + { + Pal::CoherClear, + Pal::CoherColorTarget, + {} + }; + + static const Pal::HwPipePoint PipePoint = Pal::HwPipePostBlt; + static const Pal::BarrierInfo Barrier = + { + Pal::HwPipePreRasterization, // waitPoint + 1, // pipePointWaitCount + &PipePoint, // pPipePoints + 0, // gpuEventWaitCount + nullptr, // ppGpuEvents + 0, // rangeCheckedTargetWaitCount + nullptr, // ppTargets + 1, // transitionCount + &transition, // pTransitions + 0, // globalSrcCacheMask + 0, // globalDstCacheMask + RgpBarrierExternalRenderPassSync // reason + }; + + PalCmdBarrier(Barrier, GetRpDeviceMask()); + } } // ===================================================================================================================== @@ -7799,14 +7978,15 @@ void CmdBuffer::RPBeginSubpass() if (subpasses[i].begin.loadOps.colorClearCount > 0) { RPLoadOpClearColor(subpasses[i].begin.loadOps.colorClearCount, - subpasses[i].begin.loadOps.pColorClears); + subpasses[i].begin.loadOps.pColorClears); } } // If we are manually pre-syncing color clears, we must post-sync also if (subpasses[0].begin.syncTop.barrier.flags.preColorClearSync) { - RPSyncPostLoadOpColorClear(); + RPSyncPostLoadOpColorClear(subpasses[0].begin.loadOps.colorClearCount, + subpasses[0].begin.loadOps.pColorClears); } for (uint32_t i = 0; i < subpassCount; ++i) @@ -7815,7 +7995,7 @@ void CmdBuffer::RPBeginSubpass() if (subpasses[i].begin.loadOps.dsClearCount > 0) { RPLoadOpClearDepthStencil(subpasses[i].begin.loadOps.dsClearCount, - subpasses[i].begin.loadOps.pDsClears); + subpasses[i].begin.loadOps.pDsClears); } } @@ -7838,7 +8018,7 @@ void CmdBuffer::RPBeginSubpass() // If we are manually pre-syncing color clears, we must 
post-sync also if (subpass.begin.syncTop.barrier.flags.preColorClearSync) { - RPSyncPostLoadOpColorClear(); + RPSyncPostLoadOpColorClear(subpass.begin.loadOps.colorClearCount, subpass.begin.loadOps.pColorClears); } // Execute any depth-stencil clear load operations @@ -8051,11 +8231,10 @@ void CmdBuffer::RPSyncPoint( pVirtStack->AllocArray(maxTransitionCount) : nullptr; - const bool isDstStageNotBottomOfPipe = (dstStageMask != Pal::PipelineStageBottomOfPipe); - // Construct global memory dependency to synchronize caches (subpass dependencies + implicit synchronization) if (rpBarrier.flags.needsGlobalTransition) { + Pal::BarrierTransition globalTransition = { }; m_pDevice->GetBarrierPolicy().ApplyBarrierCacheFlags( @@ -8082,16 +8261,6 @@ void CmdBuffer::RPSyncPoint( Pal::BarrierTransition imageTransition = { }; - m_pDevice->GetBarrierPolicy().ApplyBarrierCacheFlags( - rpBarrier.srcAccessMask, - rpBarrier.dstAccessMask, - VK_IMAGE_LAYOUT_GENERAL, - VK_IMAGE_LAYOUT_GENERAL, - &imageTransition); - - uint32_t srcAccessMask = imageTransition.srcCacheMask | rpBarrier.implicitSrcCacheMask; - uint32_t dstAccessMask = imageTransition.dstCacheMask | rpBarrier.implicitDstCacheMask; - for (uint32_t sr = 0; sr < attachment.subresRangeCount; ++sr) { const uint32_t plane = attachment.subresRange[sr].startSubres.plane; @@ -8107,50 +8276,55 @@ void CmdBuffer::RPSyncPoint( tr.attachment, plane); - if ((oldLayout.usages != newLayout.usages) || - (oldLayout.engines != newLayout.engines) || - ((srcAccessMask != dstAccessMask) && settings.rpBarrierCheckAccessMasks)) - { - VK_ASSERT(acquireReleaseInfo.imageBarrierCount < maxTransitionCount); - - ppImages[acquireReleaseInfo.imageBarrierCount] = attachment.pImage; - - pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcStageMask = srcStageMask; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstStageMask = dstStageMask; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcAccessMask = srcAccessMask; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstAccessMask = dstAccessMask; - // We set the pImage to nullptr by default here. But, this will be computed correctly later for - // each device including DefaultDeviceIndex based on the deviceId. - pPalTransitions[acquireReleaseInfo.imageBarrierCount].pImage = nullptr; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].oldLayout = oldLayout; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].newLayout = newLayout; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].subresRange = attachment.subresRange[sr]; + m_pDevice->GetBarrierPolicy().ApplyBarrierCacheFlags( + rpBarrier.srcAccessMask, + rpBarrier.dstAccessMask, + ((plane == 1) ? tr.prevStencilLayout.layout : tr.prevLayout.layout), + ((plane == 1) ? 
tr.nextStencilLayout.layout : tr.nextLayout.layout), + &imageTransition); + + uint32_t srcAccessMask = imageTransition.srcCacheMask | rpBarrier.implicitSrcCacheMask; + uint32_t dstAccessMask = imageTransition.dstCacheMask | rpBarrier.implicitDstCacheMask; + + VK_ASSERT(acquireReleaseInfo.imageBarrierCount < maxTransitionCount); + + ppImages[acquireReleaseInfo.imageBarrierCount] = attachment.pImage; + + pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcStageMask = srcStageMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstStageMask = dstStageMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcAccessMask = srcAccessMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstAccessMask = dstAccessMask; + // We set the pImage to nullptr by default here. But, this will be computed correctly later for + // each device including DefaultDeviceIndex based on the deviceId. + pPalTransitions[acquireReleaseInfo.imageBarrierCount].pImage = nullptr; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].oldLayout = oldLayout; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].newLayout = newLayout; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].subresRange = attachment.subresRange[sr]; - const Pal::MsaaQuadSamplePattern* pQuadSamplePattern = nullptr; + const Pal::MsaaQuadSamplePattern* pQuadSamplePattern = nullptr; - if (attachment.pImage->IsSampleLocationsCompatibleDepth() && - tr.flags.isInitialLayoutTransition) - { - VK_ASSERT(attachment.pImage->HasDepth()); + if (attachment.pImage->IsSampleLocationsCompatibleDepth() && + tr.flags.isInitialLayoutTransition) + { + VK_ASSERT(attachment.pImage->HasDepth()); - // Use the provided sample locations for this attachment if this is its - // initial layout transition - pQuadSamplePattern = - &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; - } - else - { - // Otherwise, use the subpass' sample locations - uint32_t subpass = m_renderPassInstance.subpass; - pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; - } + // Use the provided sample locations for this attachment if this is its + // initial layout transition + pQuadSamplePattern = + &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; + } + else + { + // Otherwise, use the subpass' sample locations + uint32_t subpass = m_renderPassInstance.subpass; + pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; + } - pPalTransitions[acquireReleaseInfo.imageBarrierCount].pQuadSamplePattern = pQuadSamplePattern; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].pQuadSamplePattern = pQuadSamplePattern; - RPSetAttachmentLayout(tr.attachment, plane, newLayout); + RPSetAttachmentLayout(tr.attachment, plane, newLayout); - acquireReleaseInfo.imageBarrierCount++; - } + acquireReleaseInfo.imageBarrierCount++; } } @@ -8170,12 +8344,13 @@ void CmdBuffer::RPSyncPoint( acquireReleaseInfo.dstGlobalAccessMask = 0; } + const bool stageMasksNotEmpty = (((srcStageMask == 0) && (dstStageMask == 0)) == false); + // We do not require a dumb transition here in acquire/release interface because unlike Legacy barriers, // PAL flushes caches even if only the global barriers are passed-in without any image/buffer memory barriers. 
// Execute the barrier if it actually did anything - if ((acquireReleaseInfo.dstGlobalStageMask != Pal::PipelineStageBottomOfPipe) || - ((acquireReleaseInfo.imageBarrierCount > 0) && isDstStageNotBottomOfPipe)) + if (stageMasksNotEmpty) { PalCmdReleaseThenAcquire( &acquireReleaseInfo, @@ -9304,7 +9479,7 @@ void CmdBuffer::PushDescriptorSetKHR( case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: default: VK_ASSERT(!"Unexpected descriptor type"); break; @@ -9862,9 +10037,31 @@ void CmdBuffer::SetVertexInput( if (firstChanged <= lastChanged) { - PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers( - firstChanged, (lastChanged - firstChanged) + 1, - &PerGpuState(deviceIdx)->vbBindings[firstChanged]); + Pal::VertexBufferViews bufferViews = + { + .firstBuffer = firstChanged, + .bufferCount = (lastChanged - firstChanged) + 1, + .offsetMode = (m_flags.offsetMode == 1) ? true : false + }; + Pal::VertexBufferView vertexViews[Pal::MaxVertexBuffers] = {}; + auto pBinding = &PerGpuState(deviceIdx)->vbBindings[firstChanged]; + + if (m_flags.offsetMode) + { + for (uint32_t idx = 0; idx < (lastChanged - firstChanged + 1); idx++) + { + vertexViews[idx].gpuva = pBinding[idx].gpuAddr; + vertexViews[idx].sizeInBytes = pBinding[idx].range; + vertexViews[idx].strideInBytes = pBinding[idx].stride; + } + bufferViews.pVertexBufferViews = vertexViews; + } + else + { + bufferViews.pBufferViewInfos = pBinding; + } + + PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers(bufferViews); } if (vertexBufferCount != pBindState->dynamicBindInfo.gfxDynState.vertexBufferCount) @@ -10420,6 +10617,9 @@ void CmdBuffer::BuildAccelerationStructuresPerDevice( { const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); + Util::Vector m_gpurtInfos(VkInstance()->Allocator()); + Util::Vector m_convHelpers(VkInstance()->Allocator()); + for (uint32_t infoIdx = 0; infoIdx < infoCount; ++infoIdx) { const VkAccelerationStructureBuildGeometryInfoKHR* pInfo = &pInfos[infoIdx]; @@ -10447,6 +10647,7 @@ void CmdBuffer::BuildAccelerationStructuresPerDevice( deviceIndex, *pInfo, pBuildRangeInfos, + (ppMaxPrimitiveCounts != nullptr) ? ppMaxPrimitiveCounts[infoIdx] : nullptr, &helper, &info.inputs); @@ -10456,7 +10657,19 @@ void CmdBuffer::BuildAccelerationStructuresPerDevice( const bool forceRebuildBottomLevel = Util::TestAnyFlagSet(settings.forceRebuildForUpdates, ForceRebuildForUpdatesBottomLevel); - if (settings.ifhRayTracing) + // Skip all work depending on rtTossPoint setting and type of work. + const uint32 rtTossPoint = settings.rtTossPoint; + + const bool isUpdate = Util::TestAnyFlagSet(info.inputs.flags, GpuRt::AccelStructBuildFlagPerformUpdate); + + const bool tossWork = (((info.inputs.type == GpuRt::AccelStructType::TopLevel) && + (rtTossPoint >= RtTossPointTlas)) || + ((info.inputs.type == GpuRt::AccelStructType::BottomLevel) && + (rtTossPoint >= RtTossPointBlasBuild)) || + ((info.inputs.type == GpuRt::AccelStructType::BottomLevel) && + (rtTossPoint >= RtTossPointBlasUpdate) && isUpdate)); + + if (tossWork) { info.inputs.inputElemCount = 0; } @@ -10477,18 +10690,42 @@ void CmdBuffer::BuildAccelerationStructuresPerDevice( info.indirect.indirectGpuAddr = pIndirectDeviceAddresses[infoIdx]; info.indirect.indirectStride = pIndirectStrides[infoIdx]; - helper.pMaxPrimitiveCounts = ppMaxPrimitiveCounts[infoIdx]; } - DbgBarrierPreCmd((pInfo->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) ? 
- DbgBuildAccelerationStructureTLAS : DbgBuildAccelerationStructureBLAS); + if (settings.batchBvhBuilds == BatchBvhModeDisabled) + { + DbgBarrierPreCmd((pInfo->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) ? + DbgBuildAccelerationStructureTLAS : DbgBuildAccelerationStructureBLAS); - m_pDevice->RayTrace()->GpuRt(deviceIndex)->BuildAccelStruct( + m_pDevice->RayTrace()->GpuRt(deviceIndex)->BuildAccelStruct( PalCmdBuffer(deviceIndex), info); - DbgBarrierPostCmd((pInfo->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) ? - DbgBuildAccelerationStructureTLAS : DbgBuildAccelerationStructureBLAS); + DbgBarrierPostCmd((pInfo->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) ? + DbgBuildAccelerationStructureTLAS : DbgBuildAccelerationStructureBLAS); + } + else + { + m_gpurtInfos.PushBack(info); + m_convHelpers.PushBack(helper); + } + } + + if (m_gpurtInfos.IsEmpty() == false) + { + DbgBarrierPreCmd(DbgBuildAccelerationStructureTLAS | DbgBuildAccelerationStructureBLAS); + + VK_ASSERT(m_gpurtInfos.NumElements() == m_convHelpers.NumElements()); + for (uint32 i = 0; i < m_gpurtInfos.NumElements(); ++i) + { + m_gpurtInfos[i].inputs.pClientData = &m_convHelpers[i]; + } + + m_pDevice->RayTrace()->GpuRt(deviceIndex)->BuildAccelStructs( + PalCmdBuffer(deviceIndex), + m_gpurtInfos); + + DbgBarrierPostCmd(DbgBuildAccelerationStructureTLAS | DbgBuildAccelerationStructureBLAS); } } diff --git a/icd/api/vk_cmdbuffer_transfer.cpp b/icd/api/vk_cmdbuffer_transfer.cpp index 5536150f..9310d62a 100644 --- a/icd/api/vk_cmdbuffer_transfer.cpp +++ b/icd/api/vk_cmdbuffer_transfer.cpp @@ -419,7 +419,18 @@ void CmdBuffer::BlitImage( palCopyInfo.rotation = Pal::ImageRotation::Ccw0; palCopyInfo.pRegions = pPalRegions; - palCopyInfo.flags.dstAsSrgb = pDstImage->TreatAsSrgb(); + + // PAL does gamma correction whenever the destination is a SRGB image or treated as one. + // If the source image is an UNORM image that contains SRGB data, we need to set dstAsNorm + // so PAL doesn't end up doing gamma correction on values that are already in SRGB space. 
+ if (pSrcImage->TreatAsSrgb()) + { + palCopyInfo.flags.dstAsNorm = true; + } + else if (pDstImage->TreatAsSrgb()) + { + palCopyInfo.flags.dstAsSrgb = true; + } for (uint32_t regionIdx = 0; regionIdx < regionCount;) { @@ -802,9 +813,9 @@ void CmdBuffer::QueryCopy( // 64-bit values) Pal::BufferViewInfo bufferViewInfo = {}; - bufferViewInfo.range = destStride * queryCount; - bufferViewInfo.stride = 0; // Raw buffers have a zero byte stride - bufferViewInfo.swizzledFormat = Pal::UndefinedSwizzledFormat; + bufferViewInfo.range = destStride * queryCount; + bufferViewInfo.stride = 0; // Raw buffers have a zero byte stride + bufferViewInfo.swizzledFormat = Pal::UndefinedSwizzledFormat; // Set query count userData[queryCountOffset] = queryCount; diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index d2965fe0..34384780 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -33,6 +33,9 @@ #include "include/vk_pipeline_layout.h" #include "include/vk_memory.h" #include "include/vk_pipeline.h" +#if VKI_RAY_TRACING +#include "raytrace/ray_tracing_device.h" +#endif #include "palPipeline.h" #include "palPipelineAbi.h" @@ -152,7 +155,7 @@ VkResult ComputePipeline::CreatePipelineBinaries( bool shouldConvert = (pCreateInfo != nullptr) && (pDevice->GetRuntimeSettings().enablePipelineDump || - (shouldCompile && (pBinaryCreateInfo->pTempBuffer == nullptr))); + (shouldCompile && (pBinaryCreateInfo->pTempBuffer == nullptr))); VkResult convertResult = VK_ERROR_UNKNOWN; if (shouldConvert) @@ -226,7 +229,6 @@ VkResult ComputePipeline::CreatePipelineBinaries( // Add to any cache layer where missing if ((result == VK_SUCCESS) && storeBinaryToCache) - { pDevice->GetCompiler(deviceIdx)->CachePipelineBinary( &pCacheIds[deviceIdx], @@ -571,6 +573,7 @@ VkResult ComputePipeline::Create( static_cast(pipelineBinaries[DefaultDeviceIndex].pCode), pComputePipeline->GetFormatStrings()); } + } else { diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index d43cd44f..73869c24 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp @@ -750,6 +750,9 @@ const char* PalResultName( case Pal::Result::ErrorInvalidExternalHandle: resultName = "ErrorInvalidExternalHandle"; break; + case Pal::Result::ErrorIncompatibleDisplayMode: + resultName = "ErrorIncompatibleDisplayMode"; + break; default: VK_NOT_IMPLEMENTED; resultName = "??"; @@ -1127,6 +1130,7 @@ static uint32_t GetBufferSrdFormatInfo( bufferInfo.swizzledFormat = swizzledFormat; bufferInfo.range = UINT32_MAX; bufferInfo.stride = Pal::Formats::BytesPerPixel(swizzledFormat.format); + pPhysicalDevice->PalDevice()->CreateTypedBufferViewSrds(1, &bufferInfo, result); // NOTE: Until now, all buffer format info is stored the fourth DWORD of buffer SRD. 
please modify diff --git a/icd/api/vk_descriptor_buffer.cpp b/icd/api/vk_descriptor_buffer.cpp index c44c02ad..705fca54 100644 --- a/icd/api/vk_descriptor_buffer.cpp +++ b/icd/api/vk_descriptor_buffer.cpp @@ -258,7 +258,7 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorEXT( } break; } - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: default: diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index 567b1495..f693848a 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp @@ -517,6 +517,7 @@ VkResult DescriptorGpuMemHeap::Init( VkDescriptorPoolCreateFlags poolUsage = pCreateInfo->flags; uint32_t maxSets = pCreateInfo->maxSets; const VkDescriptorPoolSize* pTypeCount = pCreateInfo->pPoolSizes; + uint32_t maxInlineUniformBlockBindings = 0; m_numPalDevices = pDevice->NumPalDevices(); m_usage = poolUsage; @@ -540,6 +541,16 @@ VkResult DescriptorGpuMemHeap::Init( break; } + case VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO: + { + const VkDescriptorPoolInlineUniformBlockCreateInfo* pDescriptorPoolInlineUniformBlockCreateInfo = + reinterpret_cast(pHeader); + + maxInlineUniformBlockBindings = + pDescriptorPoolInlineUniformBlockCreateInfo->maxInlineUniformBlockBindings; + + break; + } default: break; @@ -551,6 +562,8 @@ VkResult DescriptorGpuMemHeap::Init( VkResult result = VK_SUCCESS; + m_gpuMemAddrAlignment = pDevice->GetProperties().descriptorSizes.alignmentInDwords * sizeof(uint32_t); + if (pDevice->GetRuntimeSettings().pipelineLayoutMode == PipelineLayoutAngle) { for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; ++i) @@ -561,6 +574,10 @@ VkResult DescriptorGpuMemHeap::Init( } else { + constexpr uint32_t InlineUniformGranularity = 4; + + m_gpuMemSize += ((m_gpuMemAddrAlignment - InlineUniformGranularity) * maxInlineUniformBlockBindings); + for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; ++i) { if (pTypeCount[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) @@ -587,7 +604,7 @@ VkResult DescriptorGpuMemHeap::Init( } VK_ASSERT(maxSize > 0); - m_gpuMemSize += maxSize * sizeof(uint32_t) * pTypeCount[i].descriptorCount; + m_gpuMemSize += maxSize * pTypeCount[i].descriptorCount; } else { @@ -597,8 +614,6 @@ VkResult DescriptorGpuMemHeap::Init( } } - m_gpuMemAddrAlignment = pDevice->GetProperties().descriptorSizes.alignmentInDwords * sizeof(uint32_t); - if (oneShot == false) //DYNAMIC USAGE { // In case of dynamic descriptor pools we have to prepare our management structures. 
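The (m_gpuMemAddrAlignment - InlineUniformGranularity) term above reserves worst-case rounding loss for inline uniform blocks: their payload is 4-byte granular, but each binding's start is rounded up to the pool's descriptor alignment. A rough model of that padding (sketch only; the names mirror the hunk above):

    #include <cstdint>

    static uint64_t WorstCaseInlineUniformPadding(
        uint64_t gpuMemAddrAlignmentBytes,       // pool descriptor alignment, in bytes
        uint32_t maxInlineUniformBlockBindings)  // from VkDescriptorPoolInlineUniformBlockCreateInfo
    {
        constexpr uint64_t InlineUniformGranularity = 4; // inline uniform data is dword-granular
        return (gpuMemAddrAlignmentBytes - InlineUniformGranularity) * maxInlineUniformBlockBindings;
    }
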
diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index e4be7758..14576653 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp @@ -399,8 +399,8 @@ void DescriptorUpdate::WriteBufferInfoDescriptors( (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)); // Setup and create SRD for storage buffer case - info.swizzledFormat = Pal::UndefinedSwizzledFormat; - info.stride = 0; // Raw buffers have a zero byte stride + info.swizzledFormat = Pal::UndefinedSwizzledFormat; + info.stride = 0; // Raw buffers have a zero byte stride Pal::IDevice* pPalDevice = pDevice->PalDevice(deviceIdx); @@ -468,6 +468,7 @@ void DescriptorUpdate::SetAccelerationDescriptorsBufferViewFlags( pBufferViewInfo->flags.bypassMallRead = 1; pBufferViewInfo->flags.bypassMallWrite = 1; } + } void DescriptorUpdate::WriteAccelerationStructureDescriptors( @@ -703,7 +704,7 @@ void DescriptorUpdate::WriteDescriptorSets( destBinding.dyn.dwArrayStride); break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { VK_ASSERT(params.pNext != nullptr); VK_ASSERT(Util::IsPow2Aligned(params.dstArrayElement, 4)); @@ -837,7 +838,7 @@ void DescriptorUpdate::CopyDescriptorSets( // Just to a straight memcpy covering the entire range. memcpy(pDestAddr, pSrcAddr, srcBinding.dyn.dwArrayStride * sizeof(uint32_t) * count); } - else if (srcBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + else if (srcBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { VK_ASSERT(Util::IsPow2Aligned(params.srcArrayElement, 4)); VK_ASSERT(Util::IsPow2Aligned(params.dstArrayElement, 4)); diff --git a/icd/api/vk_descriptor_set_layout.cpp b/icd/api/vk_descriptor_set_layout.cpp index 234a0b35..ed561e55 100644 --- a/icd/api/vk_descriptor_set_layout.cpp +++ b/icd/api/vk_descriptor_set_layout.cpp @@ -184,7 +184,7 @@ uint32_t DescriptorSetLayout::GetSingleDescStaticSize( // as we pack the whole buffer SRD in the dynamic section (i.e. user data registers). size = 0; break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: size = 1; break; default: @@ -193,7 +193,7 @@ uint32_t DescriptorSetLayout::GetSingleDescStaticSize( break; } - VK_ASSERT((type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) || (Util::IsPow2Aligned(size, sizeof(uint32_t)))); + VK_ASSERT((type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) || (Util::IsPow2Aligned(size, sizeof(uint32_t)))); return size; } @@ -231,7 +231,7 @@ uint32_t DescriptorSetLayout::GetDescStaticSectionDwSize( size *= maxMultiPlaneCount; } - if (descriptorInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + if (descriptorInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { // A single binding corresponds to a whole uniform block, so handle it as one descriptor not array. size *= descriptorInfo->descriptorCount; @@ -250,7 +250,7 @@ uint32_t DescriptorSetLayout::GetDescStaticSectionDwSize( { const BindingInfo& bindingInfo = pSrcDescSetLayout->Binding(binding); - return (bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) ? + return (bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) ? 
bindingInfo.sta.dwSize : bindingInfo.sta.dwArrayStride; } @@ -339,7 +339,7 @@ void DescriptorSetLayout::ConvertBindingInfo( // Dword offset to this binding pBindingSectionInfo->dwOffset = Util::RoundUpToMultiple(pSectionInfo->dwSize, descAlignmentInDw); - if (pBindingInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + if (pBindingInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { // This allows access to inline uniform blocks using dwords offsets. // Vk(Write/Copy/Update)DescriptorSet use byte values, convert them to dword. diff --git a/icd/api/vk_descriptor_update_template.cpp b/icd/api/vk_descriptor_update_template.cpp index cc5e2c69..87cfba51 100644 --- a/icd/api/vk_descriptor_update_template.cpp +++ b/icd/api/vk_descriptor_update_template.cpp @@ -81,9 +81,9 @@ VkResult DescriptorUpdateTemplate::Create( VK_ASSERT((pCreateInfo->templateType != VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR) || ((dstBinding.info.descriptorType != VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) && (dstBinding.info.descriptorType != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) && - (dstBinding.info.descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT))); + (dstBinding.info.descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK))); - if (dstBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + if (dstBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { // Convert dstArrayElement to dword VK_ASSERT(Util::IsPow2Aligned(srcEntry.dstArrayElement, 4)); @@ -182,7 +182,7 @@ DescriptorUpdateTemplate::PfnUpdateEntry DescriptorUpdateTemplate::GetUpdateEntr case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: pFunc = &UpdateEntryBuffer; break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: pFunc = &UpdateEntryInlineUniformBlock; break; #if VKI_RAY_TRACING diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 98db2cc7..2e6cbc20 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -551,6 +551,9 @@ VkResult Device::Create( if (reinterpret_cast(pHeader)->robustBufferAccess2) { deviceFeatures.robustBufferAccessExtended = true; + { + deviceFeatures.robustVertexBufferExtend = true; + } } if (reinterpret_cast(pHeader)->robustImageAccess2) @@ -1478,6 +1481,13 @@ void Device::InitDispatchTable() m_pBarrierFilterLayer->OverrideDispatchTable(&m_dispatchTable); } +#if VKI_RAY_TRACING + if ((RayTrace() != nullptr) && (RayTrace()->GetBvhBatchLayer() != nullptr)) + { + RayTrace()->GetBvhBatchLayer()->OverrideDispatchTable(&m_dispatchTable); + } +#endif + #if VKI_GPU_DECOMPRESS if (m_pGpuDecoderLayer != nullptr) { @@ -3932,7 +3942,14 @@ void Device::GetAccelerationStructureBuildSizesKHR( const bool allowUpdate = inputs.flags & GpuRt::AccelStructBuildFlagAllowUpdate; - if (m_settings.ifhRayTracing) + const uint32 rtTossPoint = m_settings.rtTossPoint; + + // Skip all work depending on rtTossPoint setting and type of work. 
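The RtTossPoint comparisons below rely on the enum added to settings_xgl.json later in this patch; for reference while reading them, the ladder is (illustrative restatement, not driver code):

    // Higher toss points skip strictly more ray tracing work.
    enum RtTossPoint : uint32_t
    {
        RtTossPointDisabled   = 0, // raytracing executes normally
        RtTossPointTraversal  = 1, // disable traversal
        RtTossPointTlas       = 2, // also disable TLAS build/update
        RtTossPointBlasUpdate = 3, // also disable BLAS update
        RtTossPointBlasBuild  = 4, // also disable BLAS build
    };
    // A given piece of work is tossed when rtTossPoint is at or beyond the level covering it.
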
+ const bool tossWork = (((inputs.type == GpuRt::AccelStructType::TopLevel) && (rtTossPoint >= RtTossPointTlas)) || + ((inputs.type == GpuRt::AccelStructType::BottomLevel) && + (rtTossPoint >= RtTossPointBlasBuild))); + + if (tossWork) { inputs.inputElemCount = 0; } diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 5fc1f618..82811670 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -35,6 +35,9 @@ #include "include/vk_render_pass.h" #include "include/vk_shader.h" #include "include/vk_cmdbuffer.h" +#if VKI_RAY_TRACING +#include "raytrace/ray_tracing_device.h" +#endif #include "palAutoBuffer.h" #include "palCmdBuffer.h" @@ -49,6 +52,7 @@ #include using namespace Util; +using namespace std::chrono_literals; namespace vk { @@ -115,7 +119,7 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( bool shouldConvert = (pCreateInfo != nullptr) && (pDevice->GetRuntimeSettings().enablePipelineDump || - (shouldCompile && (deviceIdx == DefaultDeviceIndex))); + (shouldCompile && (deviceIdx == DefaultDeviceIndex))); VkResult convertResult = VK_ERROR_UNKNOWN; if (shouldConvert) @@ -1660,7 +1664,7 @@ VkResult GraphicsPipeline::Destroy( { if (m_deferWorkload.pEvent != nullptr) { - auto result = m_deferWorkload.pEvent->Wait(Util::fseconds{ 10 }); + auto result = m_deferWorkload.pEvent->Wait(10s); if (result == Util::Result::Success) { Util::Destructor(m_deferWorkload.pEvent); diff --git a/icd/api/vk_graphics_pipeline_library.cpp b/icd/api/vk_graphics_pipeline_library.cpp index 8ce4eb75..058a2530 100644 --- a/icd/api/vk_graphics_pipeline_library.cpp +++ b/icd/api/vk_graphics_pipeline_library.cpp @@ -337,9 +337,8 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( uint32_t gplMask = 0; for (uint32_t i = 0; i < ShaderStage::ShaderStageGfxCount; ++i) { - if ((pShaderInfos[i]->pModuleData != nullptr) && - (pShaderStageInfo->stages[i].pModuleHandle != nullptr) && - pCompiler->IsValidShaderModule(pShaderStageInfo->stages[i].pModuleHandle) || + if (((pShaderInfos[i]->pModuleData != nullptr) && + pCompiler->IsValidShaderModule(pShaderStageInfo->stages[i].pModuleHandle)) || (pShaderStageInfo->stages[i].codeHash.lower != 0) || (pShaderStageInfo->stages[i].codeHash.upper != 0)) { diff --git a/icd/api/vk_image.cpp b/icd/api/vk_image.cpp index 2994ac89..db539869 100644 --- a/icd/api/vk_image.cpp +++ b/icd/api/vk_image.cpp @@ -409,68 +409,83 @@ void Image::ConvertImageCreateInfo( } } + const bool isZ24DsFormat = (settings.enableD24S8 && + ((pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT) || + (pCreateInfo->format == VK_FORMAT_X8_D24_UNORM_PACK32))); + + const bool isZ16DsFormat = ((pCreateInfo->format == VK_FORMAT_D16_UNORM) || + (pCreateInfo->format == VK_FORMAT_D16_UNORM_S8_UINT)); + + if (isZ24DsFormat) + { + pPalCreateInfo->usageFlags.depthAsZ24 = 1; + } + pPalCreateInfo->metadataMode = Pal::MetadataMode::Default; pPalCreateInfo->metadataTcCompatMode = Pal::MetadataTcCompatMode::Default; - // Don't force DCC to be enabled for performance reasons unless the image is larger than the minimum size set for - // compression, another performance optimization. 
const Pal::GfxIpLevel gfxLevel = palProperties.gfxLevel; - if (((pPalCreateInfo->extent.width * pPalCreateInfo->extent.height) > - (settings.disableSmallSurfColorCompressionSize * settings.disableSmallSurfColorCompressionSize)) && - (Formats::IsColorFormat(createInfoFormat))) + { - const uint32_t forceEnableDccMask = settings.forceEnableDcc; + // Don't force DCC to be enabled for performance reasons unless the image is larger than the minimum size set for + // compression, another performance optimization. + if (((pPalCreateInfo->extent.width * pPalCreateInfo->extent.height) > + (settings.disableSmallSurfColorCompressionSize * settings.disableSmallSurfColorCompressionSize)) && + (Formats::IsColorFormat(createInfoFormat))) + { + const uint32_t forceEnableDccMask = settings.forceEnableDcc; - const uint32_t bpp = Pal::Formats::BitsPerPixel(pPalCreateInfo->swizzledFormat.format); - const bool isShaderStorage = (pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT); + const uint32_t bpp = Pal::Formats::BitsPerPixel(pPalCreateInfo->swizzledFormat.format); + const bool isShaderStorage = (pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT); - if (isShaderStorage && - ((forceEnableDccMask & ForceDccDefault) == 0) && - ((forceEnableDccMask & ForceDisableDcc) == 0)) - { - const bool isColorAttachment = (pCreateInfo->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT); + if (isShaderStorage && + ((forceEnableDccMask & ForceDccDefault) == 0) && + ((forceEnableDccMask & ForceDisableDcc) == 0)) + { + const bool isColorAttachment = (pCreateInfo->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT); - const bool is2DShaderStorageImage = (pCreateInfo->imageType & VK_IMAGE_TYPE_2D); - const bool is3DShaderStorageImage = (pCreateInfo->imageType & VK_IMAGE_TYPE_3D); + const bool is2DShaderStorageImage = (pCreateInfo->imageType & VK_IMAGE_TYPE_2D); + const bool is3DShaderStorageImage = (pCreateInfo->imageType & VK_IMAGE_TYPE_3D); - // Enable DCC beyond what PAL does by default for color attachments - const bool shouldForceDccForCA = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccForColorAttachments) && - isColorAttachment; - const bool shouldForceDccForNonCAShaderStorage = - Util::TestAnyFlagSet(forceEnableDccMask, ForceDccForNonColorAttachmentShaderStorage) && - (!isColorAttachment); + // Enable DCC beyond what PAL does by default for color attachments + const bool shouldForceDccForCA = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccForColorAttachments) && + isColorAttachment; + const bool shouldForceDccForNonCAShaderStorage = + Util::TestAnyFlagSet(forceEnableDccMask, ForceDccForNonColorAttachmentShaderStorage) && + (!isColorAttachment); - const bool shouldForceDccFor2D = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor2DShaderStorage) && - is2DShaderStorageImage; - const bool shouldForceDccFor3D = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor3DShaderStorage) && - is3DShaderStorageImage; + const bool shouldForceDccFor2D = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor2DShaderStorage) && + is2DShaderStorageImage; + const bool shouldForceDccFor3D = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor3DShaderStorage) && + is3DShaderStorageImage; - const bool shouldForceDccFor32Bpp = - Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor32BppShaderStorage) && (bpp >= 32) && (bpp < 64); + const bool shouldForceDccFor32Bpp = + Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor32BppShaderStorage) && (bpp >= 32) && (bpp < 64); - const bool shouldForceDccFor64Bpp = - Util::TestAnyFlagSet(forceEnableDccMask, 
ForceDccFor64BppShaderStorage) && (bpp >= 64); + const bool shouldForceDccFor64Bpp = + Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor64BppShaderStorage) && (bpp >= 64); - const bool shouldForceDccForAllBpp = - ((Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor32BppShaderStorage) == false) && - (Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor64BppShaderStorage) == false)); + const bool shouldForceDccForAllBpp = + ((Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor32BppShaderStorage) == false) && + (Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor64BppShaderStorage) == false)); - // To force enable shader storage DCC, at least one of 2D/3D and one of CA/non-CA need to be set - if ((shouldForceDccFor2D || shouldForceDccFor3D) && - (shouldForceDccForCA || shouldForceDccForNonCAShaderStorage) && - (shouldForceDccFor32Bpp || shouldForceDccFor64Bpp || shouldForceDccForAllBpp)) - { - pPalCreateInfo->metadataMode = Pal::MetadataMode::ForceEnabled; + // To force enable shader storage DCC, at least one of 2D/3D and one of CA/non-CA need to be set + if ((shouldForceDccFor2D || shouldForceDccFor3D) && + (shouldForceDccForCA || shouldForceDccForNonCAShaderStorage) && + (shouldForceDccFor32Bpp || shouldForceDccFor64Bpp || shouldForceDccForAllBpp)) + { + pPalCreateInfo->metadataMode = Pal::MetadataMode::ForceEnabled; + } } - } - // This setting should only really be used for Vega20. - // Turn DCC on/off for identified cases where memory bandwidth is not the bottleneck to improve latency. - // PAL may do this implicitly, so specify force enabled instead of default. - if (settings.dccBitsPerPixelThreshold != UINT_MAX) - { - pPalCreateInfo->metadataMode = (bpp < settings.dccBitsPerPixelThreshold) ? - Pal::MetadataMode::Disabled : Pal::MetadataMode::ForceEnabled; + // This setting should only really be used for Vega20. + // Turn DCC on/off for identified cases where memory bandwidth is not the bottleneck to improve latency. + // PAL may do this implicitly, so specify force enabled instead of default. + if (settings.dccBitsPerPixelThreshold != UINT_MAX) + { + pPalCreateInfo->metadataMode = (bpp < settings.dccBitsPerPixelThreshold) ? + Pal::MetadataMode::Disabled : Pal::MetadataMode::ForceEnabled; + } } } @@ -484,13 +499,6 @@ void Image::ConvertImageCreateInfo( pPalCreateInfo->metadataMode = Pal::MetadataMode::Disabled; } - if (settings.enableD24S8 && - ((pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT) || - (pCreateInfo->format == VK_FORMAT_X8_D24_UNORM_PACK32))) - { - pPalCreateInfo->usageFlags.depthAsZ24 = 1; - } - // If DCC was disabled above, still attempt to use Fmask. 
if ((pPalCreateInfo->samples > 1) && pPalCreateInfo->usageFlags.colorTarget && (pPalCreateInfo->metadataMode == Pal::MetadataMode::Disabled)) @@ -538,7 +546,7 @@ void Image::ConvertImageCreateInfo( if ((extStructs.pImageCompressionControl->sType == VK_STRUCTURE_TYPE_IMAGE_COMPRESSION_CONTROL_EXT) && (extStructs.pImageCompressionControl->flags == VK_IMAGE_COMPRESSION_DISABLED_EXT)) { - pPalCreateInfo->metadataMode = Pal::MetadataMode::Disabled; + pPalCreateInfo->metadataMode = Pal::MetadataMode::Disabled; pPalCreateInfo->metadataTcCompatMode = Pal::MetadataTcCompatMode::Disabled; } } diff --git a/icd/api/vk_indirect_commands_layout.cpp b/icd/api/vk_indirect_commands_layout.cpp index 0a84dd44..8fdc7582 100644 --- a/icd/api/vk_indirect_commands_layout.cpp +++ b/icd/api/vk_indirect_commands_layout.cpp @@ -51,6 +51,7 @@ VkResult IndirectCommandsLayout::Create( createInfo.pParams = &indirectParams[0]; Pal::IIndirectCmdGenerator* pGenerators[MaxPalDevices] = {}; + Pal::IGpuMemory* pGpuMemory[MaxPalDevices] = {}; const size_t apiSize = ObjectSize(pDevice); size_t totalSize = apiSize; @@ -154,12 +155,18 @@ VkResult IndirectCommandsLayout::Create( } } + if (result == VK_SUCCESS) + { + result = BindGpuMemory(pDevice, pAllocator, pGenerators, pGpuMemory); + } + if (result == VK_SUCCESS) { VK_PLACEMENT_NEW(pMemory) IndirectCommandsLayout( pDevice, info, pGenerators, + pGpuMemory, createInfo); *pLayout = IndirectCommandsLayout::HandleFromVoidPointer(pMemory); @@ -172,7 +179,8 @@ VkResult IndirectCommandsLayout::Create( IndirectCommandsLayout::IndirectCommandsLayout( const Device* pDevice, const IndirectCommandsInfo& info, - Pal::IIndirectCmdGenerator** pPalGenerator, + Pal::IIndirectCmdGenerator** pGenerators, + Pal::IGpuMemory** pGpuMemory, const Pal::IndirectCmdGeneratorCreateInfo& palCreateInfo) : m_info(info), @@ -180,8 +188,8 @@ IndirectCommandsLayout::IndirectCommandsLayout( { for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) { - m_perGpu[deviceIdx].pGenerator = pPalGenerator[deviceIdx]; - m_perGpu[deviceIdx].preprocessBufferVirtAddr = 0; + m_perGpu[deviceIdx].pGenerator = pGenerators[deviceIdx]; + m_perGpu[deviceIdx].pGpuMemory = pGpuMemory[deviceIdx]; } } @@ -305,56 +313,125 @@ void IndirectCommandsLayout::CalculateMemoryRequirements( VkMemoryRequirements2* pMemoryRequirements ) const { - VK_ASSERT(m_perGpu[DefaultDeviceIndex].pGenerator != nullptr); + // Our CP packet solution have no preprocess step. Gpu memory is not required. 
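Since the generator's backing memory is now allocated and bound by the driver itself, the VK_NV_device_generated_commands preprocess-buffer query below can report a size of zero. An app-side sketch of consuming that (illustrative only; real code should fetch the entry point through vkGetDeviceProcAddr):

    #include <vulkan/vulkan.h>

    // Returns the preprocess buffer size for a generated-commands launch; with this change
    // it may legitimately be 0, in which case no preprocess buffer needs to be allocated.
    static VkDeviceSize QueryPreprocessBufferSize(
        VkDevice                   device,
        VkPipeline                 pipeline,
        VkIndirectCommandsLayoutNV layout,
        uint32_t                   maxSequences)
    {
        VkGeneratedCommandsMemoryRequirementsInfoNV reqInfo = {};
        reqInfo.sType                  = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_MEMORY_REQUIREMENTS_INFO_NV;
        reqInfo.pipelineBindPoint      = VK_PIPELINE_BIND_POINT_GRAPHICS;
        reqInfo.pipeline               = pipeline;
        reqInfo.indirectCommandsLayout = layout;
        reqInfo.maxSequencesCount      = maxSequences;

        VkMemoryRequirements2 memReqs = {};
        memReqs.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2;
        vkGetGeneratedCommandsMemoryRequirementsNV(device, &reqInfo, &memReqs);

        return memReqs.memoryRequirements.size;
    }
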
+ pMemoryRequirements->memoryRequirements.size = 0; + pMemoryRequirements->memoryRequirements.alignment = 0; + pMemoryRequirements->memoryRequirements.memoryTypeBits = 0; + Pal::GpuMemoryRequirements memReqs = {}; - m_perGpu[DefaultDeviceIndex].pGenerator->GetGpuMemoryRequirements(&memReqs); + memReqs.flags.cpuAccess = 0; + memReqs.heaps[0] = Pal::GpuHeap::GpuHeapInvisible; + memReqs.heapCount = 1; -#if DEBUG - for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + for (uint32_t i = 0; i < memReqs.heapCount; ++i) { - VK_ASSERT(m_perGpu[deviceIdx].pGenerator != nullptr); + uint32_t typeIndexBits; - if (deviceIdx != DefaultDeviceIndex) + if (pDevice->GetVkTypeIndexBitsFromPalHeap(memReqs.heaps[i], &typeIndexBits)) { - Pal::GpuMemoryRequirements deviceReqs = {}; - m_perGpu[deviceIdx].pGenerator->GetGpuMemoryRequirements(&deviceReqs); - VK_ASSERT(memcmp(&memReqs, &deviceReqs, sizeof(deviceReqs)) == 0); + pMemoryRequirements->memoryRequirements.memoryTypeBits |= typeIndexBits; } } -#endif +} + +// ===================================================================================================================== +VkResult IndirectCommandsLayout::BindGpuMemory( + const Device* pDevice, + const VkAllocationCallbacks* pAllocator, + Pal::IIndirectCmdGenerator** pGenerators, + Pal::IGpuMemory** pGpuMemory) +{ + VkResult result = VK_SUCCESS; + Pal::Result palResult; - pMemoryRequirements->memoryRequirements.alignment = memReqs.alignment; - pMemoryRequirements->memoryRequirements.size = memReqs.size; + Pal::GpuMemoryRequirements memReqs[MaxPalDevices] = {}; + Pal::GpuMemoryCreateInfo memCreateInfos[MaxPalDevices] = {}; - pMemoryRequirements->memoryRequirements.memoryTypeBits = 0; + size_t totalSize = 0; - for (uint32_t i = 0; i < memReqs.heapCount; ++i) + void* pMemory = nullptr; + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) { - uint32_t typeIndexBits; + pGenerators[deviceIdx]->GetGpuMemoryRequirements(&memReqs[deviceIdx]); - if (pDevice->GetVkTypeIndexBitsFromPalHeap(memReqs.heaps[i], &typeIndexBits)) + memCreateInfos[deviceIdx].size = memReqs[deviceIdx].size; + memCreateInfos[deviceIdx].alignment = memReqs[deviceIdx].alignment; + memCreateInfos[deviceIdx].priority = Pal::GpuMemPriority::Normal; + memCreateInfos[deviceIdx].heapCount = memReqs[deviceIdx].heapCount; + + for (uint32 i = 0; i < memReqs[deviceIdx].heapCount; ++i) { - pMemoryRequirements->memoryRequirements.memoryTypeBits |= typeIndexBits; + memCreateInfos[deviceIdx].heaps[i] = memReqs[deviceIdx].heaps[i]; + } + + const size_t size = pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(memCreateInfos[deviceIdx], + &palResult); + + if (palResult == Pal::Result::Success) + { + totalSize += size; + } + else + { + result = PalToVkResult(palResult); + break; } } -} -// ===================================================================================================================== -void IndirectCommandsLayout::BindPreprocessBuffer( - VkBuffer buffer, - VkDeviceSize memOffset, - uint32_t deviceIdx) -{ - Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); - Pal::gpusize bufferVirtAddr = pBuffer->PalMemory(deviceIdx)->Desc().gpuVirtAddr + memOffset; + if (result == VK_SUCCESS) + { + pMemory = pAllocator->pfnAllocation(pAllocator->pUserData, + totalSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (pMemory == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } - if (m_perGpu[deviceIdx].preprocessBufferVirtAddr != bufferVirtAddr) + if (result == 
VK_SUCCESS) { - Pal::Result palResult = m_perGpu[deviceIdx].pGenerator->BindGpuMemory(pBuffer->PalMemory(deviceIdx), - memOffset); - VK_ASSERT(palResult == Pal::Result::Success); - m_perGpu[deviceIdx].preprocessBufferVirtAddr = bufferVirtAddr; + void* pPalMemory = pMemory; + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + const size_t size = pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(memCreateInfos[deviceIdx], + &palResult); + + if (palResult == Pal::Result::Success) + { + palResult = pDevice->PalDevice(deviceIdx)->CreateGpuMemory(memCreateInfos[deviceIdx], + pPalMemory, + &pGpuMemory[deviceIdx]); + } + + if (palResult == Pal::Result::Success) + { + // Gpu memory binding for IndirectCmdGenerator to build SRD containing properties and parameter data. + palResult = pGenerators[deviceIdx]->BindGpuMemory(pGpuMemory[deviceIdx], 0); + } + else + { + result = PalToVkResult(palResult); + break; + } + + if (palResult == Pal::Result::Success) + { + pPalMemory = Util::VoidPtrInc(pPalMemory, size); + } + else + { + result = PalToVkResult(palResult); + break; + } + } } + + return result; } // ===================================================================================================================== @@ -368,8 +445,16 @@ VkResult IndirectCommandsLayout::Destroy( { m_perGpu[deviceIdx].pGenerator->Destroy(); } - // It's app's reponsibility to free the preprocess buffer. - m_perGpu[deviceIdx].preprocessBufferVirtAddr = 0; + + if (m_perGpu[deviceIdx].pGpuMemory != nullptr) + { + m_perGpu[deviceIdx].pGpuMemory->Destroy(); + } + } + + if (m_perGpu[DefaultDeviceIndex].pGpuMemory != nullptr) + { + pAllocator->pfnFree(pAllocator->pUserData, m_perGpu[DefaultDeviceIndex].pGpuMemory); } Util::Destructor(this); diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp index 4b99eb59..f2fcf552 100644 --- a/icd/api/vk_memory.cpp +++ b/icd/api/vk_memory.cpp @@ -346,6 +346,7 @@ VkResult Memory::Create( if (pPinnedHostPtr == nullptr) { + vkResult = CreateGpuMemory( pDevice, pAllocator, diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index 75fdd9cf..e6d2e5f1 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -821,6 +821,9 @@ VkResult PhysicalDevice::Initialize() finalizeInfo.supportedFullScreenFrameMetadata.p2pCmdFlag = true; finalizeInfo.supportedFullScreenFrameMetadata.forceSwCfMode = true; finalizeInfo.supportedFullScreenFrameMetadata.postFrameTimerSubmission = true; + + // Need to set all 3 bits to 1 per KMD request. 
+ finalizeInfo.supportedFullScreenFrameMetadata.flipIntervalOverride = 7; } finalizeInfo.internalTexOptLevel = VkToPalTexFilterQuality(settings.vulkanTexFilterQuality); @@ -1323,6 +1326,7 @@ void PhysicalDevice::PopulateFormatProperties() } while (aspectMask != 0); } + } #if VKI_RAY_TRACING @@ -4098,11 +4102,17 @@ bool PhysicalDevice::RayTracingSupported() const } #endif +// ===================================================================================================================== static bool IsKhrCooperativeMatrixSupported( const PhysicalDevice* pPhysicalDevice) { - return ((pPhysicalDevice == nullptr) || - (pPhysicalDevice->PalProperties().gfxipProperties.flags.supportCooperativeMatrix)); + const bool hasHardwareSupport = + ((pPhysicalDevice == nullptr) || + (pPhysicalDevice->PalProperties().gfxipProperties.flags.supportCooperativeMatrix)); + + bool emulateSupport = false; + + return hasHardwareSupport || emulateSupport; } // ===================================================================================================================== @@ -4766,7 +4776,9 @@ void PhysicalDevice::PopulateQueueFamilies() pQueueFamilyProps->minImageTransferGranularity.depth = ((transferGranularityOverride >> 16) & 0xff); } - m_queueFamilyCount++; + { + m_queueFamilyCount++; + } } } @@ -7006,6 +7018,9 @@ size_t PhysicalDevice::GetFeatures2( break; } +#if VKI_RAY_TRACING +#endif + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast(pHeader); @@ -8297,6 +8312,9 @@ void PhysicalDevice::GetDeviceProperties2( break; } +#if VKI_RAY_TRACING +#endif + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: { auto* pProps = static_cast(pNext); diff --git a/icd/api/vk_pipeline_layout.cpp b/icd/api/vk_pipeline_layout.cpp index 677a63f9..d11fe4f1 100644 --- a/icd/api/vk_pipeline_layout.cpp +++ b/icd/api/vk_pipeline_layout.cpp @@ -39,6 +39,10 @@ #include "palMetroHash.h" #include "palVectorImpl.h" +#if VKI_RAY_TRACING +#include "raytrace/ray_tracing_device.h" +#endif + namespace vk { @@ -938,7 +942,7 @@ Vkgc::ResourceMappingNodeType PipelineLayout::MapLlpcResourceNodeType( case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: nodeType = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact; break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: nodeType = Vkgc::ResourceMappingNodeType::InlineBuffer; break; case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: @@ -1281,7 +1285,8 @@ VkResult PipelineLayout::BuildCompactSchemeLlpcPipelineMapping( Vkgc::ShaderStageVertexBit, userDataLayout.specConstBufVertexRegBase, MaxInternalSpecConstBuffSize, - Vkgc::SpecConstInternalBufferBindingId + ShaderStage::ShaderStageVertex, + static_cast(Vkgc::SpecConstInternalBufferBindingId) + + static_cast(ShaderStage::ShaderStageVertex), &pUserDataNodes[userDataNodeCount], &userDataNodeCount, &pResourceNodes[mappingNodeCount], @@ -1294,7 +1299,8 @@ VkResult PipelineLayout::BuildCompactSchemeLlpcPipelineMapping( Vkgc::ShaderStageFragmentBit, userDataLayout.specConstBufFragmentRegBase, MaxInternalSpecConstBuffSize, - Vkgc::SpecConstInternalBufferBindingId + ShaderStage::ShaderStageFragment, + static_cast(Vkgc::SpecConstInternalBufferBindingId) + + static_cast(ShaderStage::ShaderStageFragment), &pUserDataNodes[userDataNodeCount], &userDataNodeCount, &pResourceNodes[mappingNodeCount], diff --git a/icd/api/vk_query.cpp b/icd/api/vk_query.cpp index ccd84955..9822879d 100644 --- a/icd/api/vk_query.cpp +++ 
b/icd/api/vk_query.cpp @@ -559,7 +559,8 @@ VkResult QueryPoolWithStorageView::Initialize( m_pStorageView[deviceIdx] = Util::VoidPtrInc(pMemory, apiSize + (viewSize * deviceIdx)); - m_pDevice->PalDevice(deviceIdx)->CreateUntypedBufferViewSrds(1, &bufferViewInfo, m_pStorageView[deviceIdx]); + m_pDevice->PalDevice(deviceIdx)-> + CreateUntypedBufferViewSrds(1, &bufferViewInfo, m_pStorageView[deviceIdx]); } } else @@ -1008,7 +1009,7 @@ VkResult AccelerationStructureQueryPool::GetResults( { while (!ready) { - Util::SleepMs(0u); + Util::Sleep(std::chrono::milliseconds{ 0 }); value = GetAccelerationStructureQueryResults( m_queryType, diff --git a/icd/api/vk_queue.cpp b/icd/api/vk_queue.cpp index 71d28ffa..79196bd8 100644 --- a/icd/api/vk_queue.cpp +++ b/icd/api/vk_queue.cpp @@ -1152,9 +1152,6 @@ VkResult Queue::Submit( const void* pNext = submitInfo.pNext; -#if VKI_RAY_TRACING -#endif - while (pNext != nullptr) { const VkStructHeader* pHeader = static_cast(pNext); @@ -1505,7 +1502,9 @@ VkResult Queue::Submit( if (palResult == Pal::Result::Success) { - palResult = PalQueueSubmit(m_pDevice, PalTmzQueue(deviceIdx), palSubmitInfo); + { + palResult = PalQueueSubmit(m_pDevice, PalTmzQueue(deviceIdx), palSubmitInfo); + } } VK_ASSERT(palResult == Pal::Result::Success); @@ -1530,7 +1529,9 @@ VkResult Queue::Submit( if (palResult == Pal::Result::Success) { - palResult = PalQueueSubmit(m_pDevice, PalQueue(deviceIdx), palSubmitInfo); + { + palResult = PalQueueSubmit(m_pDevice, PalQueue(deviceIdx), palSubmitInfo); + } } VK_ASSERT(palResult == Pal::Result::Success); @@ -1632,8 +1633,6 @@ VkResult Queue::Submit( DebugPrintf::PostQueueSubmit(m_pDevice, this, pCmdBuffers, cmdBufferCount); -#if VKI_RAY_TRACING -#endif } } @@ -1986,18 +1985,10 @@ VkResult Queue::Present( pPresentRects[r] = VkToPalRect(rect2D); } presentInfo.rectangleCount = pVkRegion->rectangleCount; - presentInfo.pRectangles = pPresentRects; + presentInfo.pRectangles = pPresentRects; } } - // Fill in present information and obtain the PAL memory of the presentable image. - Pal::IGpuMemory* pGpuMemory = pSwapChain->UpdatePresentInfo(presentationDeviceIdx, - imageIndex, - &presentInfo, - m_flipStatus.flipFlags); - - CmdBufState* pCmdBufState = m_pCmdBufferRing->AcquireCmdBuffer(m_pDevice, presentationDeviceIdx); - // Ensure metadata is available before post processing. if (pSwapChain->GetFullscreenMgr() != nullptr) { @@ -2008,6 +1999,15 @@ VkResult Queue::Present( VK_ASSERT(palResult == Pal::Result::Success); } + // Fill in present information and obtain the PAL memory of the presentable image. + Pal::IGpuMemory* pGpuMemory = pSwapChain->UpdatePresentInfo(presentationDeviceIdx, + imageIndex, + &presentInfo, + m_flipStatus.flipFlags, + m_palFrameMetadataControl); + + CmdBufState* pCmdBufState = m_pCmdBufferRing->AcquireCmdBuffer(m_pDevice, presentationDeviceIdx); + // This must happen after the fullscreen manager has updated its overlay information and before the software // compositor has an opportunity to copy the presentable image in order to include the overlay itself. 
bool hasPostProcessing = BuildPostProcessCommands(presentationDeviceIdx, @@ -2124,9 +2124,6 @@ VkResult Queue::Present( } } -#if VKI_RAY_TRACING -#endif - return result; } diff --git a/icd/api/vk_swapchain.cpp b/icd/api/vk_swapchain.cpp index 5436a013..fce1dad2 100644 --- a/icd/api/vk_swapchain.cpp +++ b/icd/api/vk_swapchain.cpp @@ -52,6 +52,8 @@ #include +using namespace std::chrono_literals; + namespace vk { @@ -113,7 +115,7 @@ VkResult SwapChain::Create( // the old swapchain should be flaged as deprecated no matter whether the new swapchain is created successfully. if (pCreateInfo->oldSwapchain != VK_NULL_HANDLE) { - SwapChain::ObjectFromHandle(pCreateInfo->oldSwapchain)->MarkAsDeprecated(pAllocator); + SwapChain::ObjectFromHandle(pCreateInfo->oldSwapchain)->MarkAsDeprecated(true, pAllocator); } // Find the index of the device associated with the PAL screen and therefore, the PAL swap chain to be created @@ -672,6 +674,7 @@ VkResult SwapChain::SetupAutoStereo( // Destroy Vulkan swap chain. VkResult SwapChain::Destroy(const VkAllocationCallbacks* pAllocator) { + // Make sure the swapchain is idle and safe to be destroyed. if (m_pPalSwapChain != nullptr) { @@ -718,9 +721,9 @@ VkResult SwapChain::AcquireNextImage( const VkStructHeader* pAcquireInfo, uint32_t* pImageIndex) { - VkFence fence = VK_NULL_HANDLE; - VkSemaphore semaphore = VK_NULL_HANDLE; - uint64_t timeout = UINT64_MAX; + VkFence fence = VK_NULL_HANDLE; + VkSemaphore semaphore = VK_NULL_HANDLE; + std::chrono::nanoseconds timeout = std::chrono::nanoseconds::max(); const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); @@ -740,7 +743,7 @@ VkResult SwapChain::AcquireNextImage( { semaphore = pVkAcquireNextImageInfoKHR->semaphore; fence = pVkAcquireNextImageInfoKHR->fence; - timeout = pVkAcquireNextImageInfoKHR->timeout; + timeout = Uint64ToChronoNano(pVkAcquireNextImageInfoKHR->timeout); Util::BitMaskScanForward(&presentationDeviceIdx, pVkAcquireNextImageInfoKHR->deviceMask); @@ -768,7 +771,7 @@ VkResult SwapChain::AcquireNextImage( if (result == VK_SUCCESS) { - acquireInfo.timeout = Uint64ToChronoNano(timeout); + acquireInfo.timeout = timeout; acquireInfo.pSemaphore = (pSemaphore != nullptr) ? pSemaphore->PalSemaphore(DefaultDeviceIndex) : nullptr; @@ -801,7 +804,7 @@ VkResult SwapChain::AcquireNextImage( result = VK_ERROR_OUT_OF_DATE_KHR; } - if ((timeout == 0) && (result == VK_TIMEOUT)) + if ((timeout == 0s) && (result == VK_TIMEOUT)) { result = VK_NOT_READY; } @@ -887,10 +890,11 @@ bool SwapChain::IsFullscreenOrEfsePresent() const // ===================================================================================================================== // Fills in the PAL swap chain present info with the appropriate image to present and returns its GPU memory. Pal::IGpuMemory* SwapChain::UpdatePresentInfo( - uint32_t deviceIdx, - uint32_t imageIndex, - Pal::PresentSwapChainInfo* pPresentInfo, - const Pal::FlipStatusFlags& flipFlags) + uint32_t deviceIdx, + uint32_t imageIndex, + Pal::PresentSwapChainInfo* pPresentInfo, + const Pal::FlipStatusFlags& flipFlags, + const Pal::PerSourceFrameMetadataControl& metadataFlags) { Pal::IGpuMemory* pSrcImageGpuMemory = nullptr; @@ -911,6 +915,7 @@ Pal::IGpuMemory* SwapChain::UpdatePresentInfo( ) { m_pFullscreenMgr->TryEnterExclusive(this); + } // Always fallback to windowed if FSE is not acquired to avoid missing presents. 
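The AcquireNextImage hunks above move the acquire timeout from a raw uint64_t to std::chrono::nanoseconds via Uint64ToChronoNano. A minimal sketch of what such a conversion is assumed to do (not the driver's actual helper): Vulkan uses UINT64_MAX nanoseconds to mean "wait forever", while chrono's representation is a signed 64-bit count, so the value is clamped before converting.

    #include <chrono>
    #include <cstdint>

    static std::chrono::nanoseconds ToChronoNano(uint64_t timeoutNs)
    {
        using Ns = std::chrono::nanoseconds;
        const uint64_t maxRep = static_cast<uint64_t>(Ns::max().count());
        return Ns{ static_cast<int64_t>((timeoutNs > maxRep) ? maxRep : timeoutNs) };
    }
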
@@ -1118,27 +1123,31 @@ bool SwapChain::IsSuboptimal(uint32_t deviceIdx) // ===================================================================================================================== void SwapChain::MarkAsDeprecated( + bool releaseResources, const VkAllocationCallbacks* pAllocator) { m_deprecated = true; - if (m_pPalSwapChain != nullptr) + if (releaseResources) { - m_pPalSwapChain->WaitIdle(); - - for (uint32_t i = 0; i < m_properties.imageCount; ++i) + if (m_pPalSwapChain != nullptr) { - // Remove memory references to presentable image memory and destroy the images and image memory. - Memory::ObjectFromHandle(m_properties.imageMemory[i])->Free(m_pDevice, pAllocator); - Image::ObjectFromHandle(m_properties.images[i])->Destroy(m_pDevice, pAllocator); - } + m_pPalSwapChain->WaitIdle(); - m_pPalSwapChain->Destroy(); + for (uint32_t i = 0; i < m_properties.imageCount; ++i) + { + // Remove memory references to presentable image memory and destroy the images and image memory. + Memory::ObjectFromHandle(m_properties.imageMemory[i])->Free(m_pDevice, pAllocator); + Image::ObjectFromHandle(m_properties.images[i])->Destroy(m_pDevice, pAllocator); + } - // Set to null to avoid double deleting when the actual object gets destroyed. - m_pPalSwapChain = nullptr; - } + m_pPalSwapChain->Destroy(); + + // Set to null to avoid double deleting when the actual object gets destroyed. + m_pPalSwapChain = nullptr; + } + } } // ===================================================================================================================== diff --git a/icd/api/vk_utils.cpp b/icd/api/vk_utils.cpp index 45605b97..097adbe2 100644 --- a/icd/api/vk_utils.cpp +++ b/icd/api/vk_utils.cpp @@ -75,7 +75,7 @@ void WaitIdleForDebugger( // Timeout the driver to give debuggers a chance to load all of the symbols if (debugTimeout != 0) { - Util::SleepMs(debugTimeout); + Util::Sleep(std::chrono::milliseconds{ debugTimeout }); } } } diff --git a/icd/res/ver.h b/icd/res/ver.h index 255caefd..6cfc716e 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -36,7 +36,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 304 +#define VULKAN_ICD_BUILD_VERSION 308 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION @@ -45,7 +45,7 @@ // These values specify the driver ID and driver info string #define VULKAN_DRIVER_ID VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR // "AMDOPEN" #define VULKAN_DRIVER_NAME_STR "AMD open-source driver" -#define VULKAN_DRIVER_INFO_STR "2024.Q2.1" +#define VULKAN_DRIVER_INFO_STR "2024.Q2.2" #define VULKAN_DRIVER_INFO_STR_LLPC "(LLPC)" // These values tell which version of the conformance test the driver is compliant against diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index 8679fb01..c5e0cd7a 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -176,7 +176,11 @@ void VulkanSettingsLoader::OverrideSettingsBySystemInfo() char executableName[PATH_MAX]; char executablePath[PATH_MAX]; utils::GetExecutableNameAndPath(executableName, executablePath); - sprintf(m_settings.pipelineDumpDir, "%s/%s", m_settings.pipelineDumpDir, executableName); + Util::Snprintf(m_settings.pipelineDumpDir, + sizeof(m_settings.pipelineDumpDir), + "%s/%s", + m_settings.pipelineDumpDir, + executableName); } MakeAbsolutePath(m_settings.pipelineDumpDir, sizeof(m_settings.pipelineDumpDir), @@ -809,7 +813,6 @@ VkResult 
VulkanSettingsLoader::OverrideProfiledSettings( #endif m_settings.enableUberFetchShader = true; - } if (appProfile == AppProfile::Source2Engine) @@ -823,7 +826,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.anisoThreshold = 1.0f; m_settings.disableMsaaStencilShaderRead = true; - } if (appProfile == AppProfile::Talos) @@ -1353,15 +1355,18 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.forceDepthClampBasedOnZExport = true; } + if ((appProfile == AppProfile::DxvkHaloInfiniteLauncher) || + (appProfile == AppProfile::DxvkTf2) #ifndef ICD_X64_BUILD - if (appProfile == AppProfile::DXVK) + || (appProfile == AppProfile::DXVK) +#endif + ) { - // DXVK Tropic4/GTA4 page fault when GPL is enabled. + // DXVK Tropic4, GTA4, Halo Infinite Launcher page fault when GPL is enabled. // It looks incorrect pipeline layout is used. Force indirect can make optimized pipeline layout compatible // with fast-linked pipeline. m_settings.pipelineLayoutSchemeSelectionStrategy = PipelineLayoutSchemeSelectionStrategy::ForceIndirect; } -#endif if (appProfile == AppProfile::AshesOfTheSingularity) { @@ -1602,6 +1607,7 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( if (appProfile == AppProfile::Vkd3dEngine) { m_settings.exportNvComputeShaderDerivatives = true; + m_settings.exportNvDeviceGeneratedCommands = true; m_settings.exportImageCompressionControl = true; } @@ -1610,6 +1616,7 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { m_settings.disableSingleMipAnisoOverride = false; } + } return result; @@ -1812,7 +1819,6 @@ void VulkanSettingsLoader::ValidateSettings() { buildMode = BvhBuildModePLOC; } - m_settings.bvhBuildModeOverrideBlas = buildMode; m_settings.bvhBuildModeOverrideTlas = buildMode; } @@ -1867,6 +1873,12 @@ void VulkanSettingsLoader::ValidateSettings() m_settings.indirectCalleeIntersection = Util::Min(255U, m_settings.indirectCalleeIntersection); m_settings.indirectCalleeCallable = Util::Min(255U, m_settings.indirectCalleeCallable); m_settings.indirectCalleeTraceRays = Util::Min(255U, m_settings.indirectCalleeTraceRays); + + // Force invalid accel struct to skip traversal if toss point is traversal or greater + if (m_settings.rtTossPoint >= RtTossPointTraversal) + { + m_settings.forceInvalidAccelStruct = true; + } #endif // SkipDstCacheInv should not be enabled by default when acquire-release barrier interface is used, because PAL diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json index 0f88fd4f..41ab5ff0 100644 --- a/icd/settings/settings_xgl.json +++ b/icd/settings/settings_xgl.json @@ -218,7 +218,7 @@ }, { "Name": "BvhBuildModeAuto", - "Value": 3, + "Value": 4, "Description": "Only for override builds. If set, falls back to regular build options." } ] @@ -1037,6 +1037,21 @@ "Scope": "Driver", "Type": "bool" }, + { + "Name": "EnableImageMsaaLoadOpt", + "Description": "Enable image MSAA load optimization on Gfx11.", + "Tags": [ + "Pipeline Options" + ], + "BuildTypes": [ + "VKI_BUILD_GFX11" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, { "Name": "DisableLoopUnrolls", "Description": "Disable loop unrolls. This modifies the default pipeline state and can be overwritten by fine-grain override settings.", @@ -1445,7 +1460,7 @@ "SPIRV Options" ], "Defaults": { - "Default": false + "Default": true }, "Scope": "Driver", "Type": "bool", @@ -2720,7 +2735,7 @@ { "Name": "RtTraceRayCounterMode", "Type": "enum", - "Description": "Enable ray tracing counters. 
Written to the directory specified by RayTracingCapturePath. Press the RayTracingCaptureHotKey to dump when enabled.", + "Description": "Enable ray tracing counters. Written to the directory specified by RtDumpDir. Press the RtCaptureHotKey to dump when enabled.", "Scope": "Driver", "Tags": [ "Ray Tracing" @@ -3239,6 +3254,36 @@ "Name": "RtTriangleSplittingPriority", "Scope": "Driver" }, + { + "Name": "EnableMergedEncodeBuild", + "Description": "Enable merged encode and build dispatch.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, + { + "Name": "EnableMergedEncodeUpdate", + "Description": "Enable merged encode and update dispatch.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, { "Name": "RtEnableMortonCode30", "Description": "Enable Morton Code 30 bits", @@ -4191,44 +4236,6 @@ "Type": "bool", "Scope": "Driver" }, - { - "Name": "RtGpuDebugFlags", - "Description": "Gpu Debug flags for GPU RT Debug feature (asserts/printf)", - "Tags": [ - "Ray Tracing" - ], - "Defaults": { - "Default": "NoFlag" - }, - "ValidValues": { - "IsEnum": true, - "Name": "RtGpuDebugFlags", - "Values": [ - { - "Name": "NoFlag", - "Value": 0, - "Description": "Disable all gpu debug flags" - }, - { - "Name": "HostAssert", - "Value": 1, - "Description": "Enable Asserts" - }, - { - "Name": "HostPrint", - "Value": 2, - "Description": "Enable Prints" - }, - { - "Name": "ShaderHalt", - "Value": 4, - "Description": "Enable Halt shader" - } - ] - }, - "Type": "enum", - "Scope": "Driver" - }, { "Name": "EnableRemapScratchBuffer", "Description": "Enable Remapping BVH2 Data from ScratchBuffer to ResultBuffer", @@ -4528,6 +4535,47 @@ "Type": "bool", "Scope": "Driver" }, + { + "Name": "BatchBvhBuilds", + "Description": "Group BVH builds and updates based on explicit app-provided batches or our own implicit batches.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": "BatchBvhModeDisabled" + }, + "ValidValues": { + "IsEnum": true, + "Values": [ + { + "Name": "BatchBvhModeDisabled", + "Value": 0, + "Description": "Disables BVH batching" + }, + { + "Name": "BatchBvhModeExplicit", + "Value": 1, + "Description": "Relies on batching done by application." + }, + { + "Name": "BatchBvhModeImplicit", + "Value": 2, + "Description": "Enables our BvhBatchLayer for implicit BVH batching. Adds some overhead, but could be beneficial for apps written sub-optimally." + }, + { + "Name": "BatchBvhModeImplicitAndLog", + "Value": 3, + "Description": "Same as BatchBvhModeImplicit, but also logs layer activity to [AMD_DEBUG_DIR]/BvhBatchLog.txt. AMD_DEBUG_DIR must be set when this option is enabled (otherwise initialization will fail)." + } + ], + "Name": "BatchBvhModes" + }, + "Type": "enum", + "Scope": "Driver" + }, { "Name": "DbgBarrierPostCmdEnable", "Description": "Triggers a CmdBarrier call after any command in the given mask. The barrier behavior is controlled by the other DbgBarrierPost* settings in this category. Requires VK_ENABLE_DEBUG_BARRIERS=1 to take effect. 
0x8FFFFFFF: All commands (heavyweight option)", @@ -7385,6 +7433,18 @@ "Type": "bool", "Scope": "Driver" }, + { + "Description": "If true, disables ReZ for pipelines that only read/write depth", + "Tags": [ + "Optimization" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Name": "DisableDepthOnlyReZ", + "Scope": "Driver" + }, { "Name": "Ac01WaNotNeeded", "Description": "Allows use AC01 fast clears. Please also check setting: Ac01WaState.", @@ -8679,8 +8739,8 @@ "Scope": "Driver" }, { - "Name": "IFHRayTracing", - "Description": "Makes the driver effectively skip the BVH build by reducing prim count to 0.", + "Name": "RtTossPoint", + "Description": "Set toss point for raytracing.", "Tags": [ "Ray Tracing" ], @@ -8688,9 +8748,40 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": false + "Default": "RtTossPointDisabled" }, - "Type": "bool", + "ValidValues": { + "IsEnum": true, + "Name": "RtTossPointEnums", + "Values": [ + { + "Name": "RtTossPointDisabled", + "Value": 0, + "Description": "No toss points, raytracing executes normally" + }, + { + "Name": "RtTossPointTraversal", + "Value": 1, + "Description": "Disable traversal" + }, + { + "Name": "RtTossPointTlas", + "Value": 2, + "Description": "Disable traversal, TLAS build/update" + }, + { + "Name": "RtTossPointBlasUpdate", + "Value": 3, + "Description": "Disable traversal, TLAS build/update, BLAS update" + }, + { + "Name": "RtTossPointBlasBuild", + "Value": 4, + "Description": "Disable traversal, TLAS build/update, BLAS update, BLAS build" + } + ] + }, + "Type": "enum", "Scope": "Driver" }, { @@ -8709,7 +8800,7 @@ "Scope": "Driver" }, { - "Name": "GpuRtGpuDebugFlags", + "Name": "RtGpuDebugFlags", "Description": "GPURT GPU debug flags", "Tags": [ "Ray Tracing"