From 840c94212feda11235abd7a1a9ce8fa7c4bbf281 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Semenov=20Herman=20=28=D0=A1=D0=B5=D0=BC=D0=B5=D0=BD=D0=BE?=
 =?UTF-8?q?=D0=B2=20=D0=93=D0=B5=D1=80=D0=BC=D0=B0=D0=BD=29?=
Date: Thu, 19 Dec 2024 04:58:06 +0300
Subject: [PATCH] performance: align structures for 64-bit platforms

- _tagCLGLBufferInfo 56 -> 48 bytes
- InlineSamplerBaseT 16 -> 12 bytes
- CsrSelectionArgs 88 -> 80 bytes
- EngineInfo 64 -> 48 bytes
- FragmentStorage 48 -> 40 bytes
- SWTagsManager 48 -> 40 bytes
- _tagCLGLResourceInfo 96 -> 80 bytes
- SvmFreeUserData 40 -> 32 bytes
- PayloadArgumentBaseT 40 -> 36 bytes
- ImageDescriptor 72 -> 64 bytes
- PerThreadMemoryBufferBaseT 16 -> 12 bytes
- CopyEngineState 24 -> 16 bytes
- BatchBuffer 120 -> 112 bytes
- DebugMetadata 32 -> 24 bytes
- ImmediateFlushData 48 -> 40 bytes
---
 .../public/cl_gl_private_intel_structures.h | 16 +++++++-------
 .../source/command_queue/copy_engine_state.h | 2 +-
 .../source/command_queue/csr_selection_args.h | 12 +++++-----
 opencl/source/command_queue/enqueue_svm.h | 6 ++---
 .../command_queue/blit_enqueue_1_tests.cpp | 2 +-
 .../command_queue_hw_1_tests.cpp | 2 +-
 .../cl_tbx_command_stream_tests.cpp | 2 +-
 .../unit_test/mem_obj/buffer_bcs_tests.cpp | 2 +-
 opencl/test/unit_test/mem_obj/image_tests.cpp | 2 +-
 .../command_stream_receiver_hw.h | 4 ++--
 ...mand_stream_receiver_simulated_common_hw.h | 6 ++---
 .../command_stream/submissions_aggregator.cpp | 10 ++++-----
 .../command_stream/submissions_aggregator.h | 17 +++++++-------
 .../device_binary_format/zebin/zeinfo.h | 6 ++---
 shared/source/helpers/surface_format_info.h | 2 +-
 .../source/memory_manager/host_ptr_defines.h | 2 +-
 .../os_interface/linux/xe/ioctl_helper_xe.h | 2 +-
 .../source/utilities/software_tags_manager.h | 22 +++++++++----------
 18 files changed, 59 insertions(+), 58 deletions(-)

diff --git a/opencl/extensions/public/cl_gl_private_intel_structures.h b/opencl/extensions/public/cl_gl_private_intel_structures.h
index
2a8fd01873ea9..6ae2cd3be0cbc 100644 --- a/opencl/extensions/public/cl_gl_private_intel_structures.h +++ b/opencl/extensions/public/cl_gl_private_intel_structures.h @@ -12,24 +12,24 @@ // Used for creating CL resources from GL resources typedef struct _tagCLGLResourceInfo { + GMM_RESOURCE_INFO *pGmmResInfo; /// Pointer to GMMResInfo from GL that will be copied in CL (GL) GLuint name; GLenum target; unsigned int globalShareHandle; - GMM_RESOURCE_INFO *pGmmResInfo; /// Pointer to GMMResInfo from GL that will be copied in CL (GL) GLenum glFormat; GLint glInternalFormat; GLuint glHWFormat; - GLboolean isAuxEnabled; GLuint borderWidth; GLint textureBufferWidth; GLint textureBufferSize; GLint textureBufferOffset; + GMM_RESOURCE_INFO *pGmmResInfoMCS; + GLvoid *pReleaseData; GLboolean oglSynchronized; + GLboolean isAuxEnabled; GMM_STATUS status; unsigned int globalShareHandleMCS; - GMM_RESOURCE_INFO *pGmmResInfoMCS; GLint numberOfSamples; // Number of samples as specified by API - GLvoid *pReleaseData; } CL_GL_RESOURCE_INFO, *PCL_GL_RESOURCE_INFO; // Used for creating GL resources from CL resources @@ -48,15 +48,15 @@ typedef struct _tagGLCLResourceInfo { } GL_CL_RESOURCE_INFO, *PGL_CL_RESOURCE_INFO; typedef struct _tagCLGLBufferInfo { - GLenum bufferName; - unsigned int globalShareHandle; GMM_RESOURCE_INFO *pGmmResInfo; /// Pointer to GMMResInfo from GL that will be copied in CL (GL) GLvoid *pSysMem; + GLvoid *pReleaseData; + GLenum bufferName; + unsigned int globalShareHandle; GLint bufferSize; GLint bufferOffset; - GLboolean oglSynchronized; GMM_STATUS status; - GLvoid *pReleaseData; + GLboolean oglSynchronized; GLboolean createOrDestroy; } CL_GL_BUFFER_INFO, *PCL_GL_BUFFER_INFO; diff --git a/opencl/source/command_queue/copy_engine_state.h b/opencl/source/command_queue/copy_engine_state.h index 0812898774c16..0980c50bd07c2 100644 --- a/opencl/source/command_queue/copy_engine_state.h +++ b/opencl/source/command_queue/copy_engine_state.h @@ -13,8 +13,8 @@ namespace NEO { 
struct CopyEngineState { - aub_stream::EngineType engineType = aub_stream::EngineType::NUM_ENGINES; TaskCountType taskCount = 0; + aub_stream::EngineType engineType = aub_stream::EngineType::NUM_ENGINES; bool csrClientRegistered = false; bool isValid() const { diff --git a/opencl/source/command_queue/csr_selection_args.h b/opencl/source/command_queue/csr_selection_args.h index 7a94967ab883f..7b2790828cc32 100644 --- a/opencl/source/command_queue/csr_selection_args.h +++ b/opencl/source/command_queue/csr_selection_args.h @@ -24,21 +24,21 @@ struct CsrSelectionArgs { const size_t *imageOrigin = nullptr; }; - cl_command_type cmdType; - const size_t *size = nullptr; Resource srcResource; Resource dstResource; + const size_t *size = nullptr; + cl_command_type cmdType; TransferDirection direction; CsrSelectionArgs(cl_command_type cmdType, const size_t *size) - : cmdType(cmdType), - size(size), + : size(size), + cmdType(cmdType), direction(TransferDirection::hostToHost) {} template CsrSelectionArgs(cl_command_type cmdType, ResourceType *src, ResourceType *dst, uint32_t rootDeviceIndex, const size_t *size) - : cmdType(cmdType), - size(size) { + : size(size), + cmdType(cmdType) { if (src) { processResource(*src, rootDeviceIndex, this->srcResource); } diff --git a/opencl/source/command_queue/enqueue_svm.h b/opencl/source/command_queue/enqueue_svm.h index 5d9602a1dbd05..db9a4c579e08a 100644 --- a/opencl/source/command_queue/enqueue_svm.h +++ b/opencl/source/command_queue/enqueue_svm.h @@ -25,20 +25,20 @@ using SvmFreeClbT = void(CL_CALLBACK *)(cl_command_queue queue, void *userData); struct SvmFreeUserData { - cl_uint numSvmPointers; void **svmPointers; SvmFreeClbT clb; void *userData; + cl_uint numSvmPointers; bool ownsEventDeletion; SvmFreeUserData(cl_uint numSvmPointers, void **svmPointers, SvmFreeClbT clb, void *userData, bool ownsEventDeletion) - : numSvmPointers(numSvmPointers), - svmPointers(svmPointers), + : svmPointers(svmPointers), clb(clb), userData(userData), + 
numSvmPointers(numSvmPointers), ownsEventDeletion(ownsEventDeletion){}; }; diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp index 5d18436dbe643..90d23461689fe 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp @@ -1251,7 +1251,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, whenWaitUntilCompletionCalledThenW uint32_t gpgpuTaskCount = 123; uint32_t bcsTaskCount = 123; - CopyEngineState bcsState{bcsCsr->getOsContext().getEngineType(), bcsTaskCount}; + CopyEngineState bcsState{bcsTaskCount, bcsCsr->getOsContext().getEngineType()}; commandQueue->waitUntilComplete(gpgpuTaskCount, Range{&bcsState}, 0, false); EXPECT_EQ(gpgpuTaskCount, static_cast *>(gpgpuCsr)->latestWaitForCompletionWithTimeoutTaskCount.load()); diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp index 8c30cf1376a5f..4aef76b326726 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp @@ -55,7 +55,7 @@ HWTEST_F(CommandQueueHwTest, whenCallingIsCompletedThenTestTaskCountValue) { bcsCsr->setupContext(*osContext); bcsCsr->initializeTagAllocation(); EngineControl control(bcsCsr.get(), osContext.get()); - CopyEngineState state{aub_stream::EngineType::ENGINE_BCS, 1, false}; + CopyEngineState state{1, aub_stream::EngineType::ENGINE_BCS, false}; MockCommandQueueHw cmdQ(context, pClDevice, nullptr); diff --git a/opencl/test/unit_test/command_stream/cl_tbx_command_stream_tests.cpp b/opencl/test/unit_test/command_stream/cl_tbx_command_stream_tests.cpp index 658e304f49459..e0da4da3f0af2 100644 --- a/opencl/test/unit_test/command_stream/cl_tbx_command_stream_tests.cpp +++ b/opencl/test/unit_test/command_stream/cl_tbx_command_stream_tests.cpp @@ -47,7 +47,7 
@@ HWTEST_F(ClTbxCommandStreamTests, givenTbxCsrWhenDispatchBlitEnqueueThenProcessC cmdQ.clearBcsEngines(); cmdQ.bcsEngines[0] = &engineControl1; - cmdQ.bcsStates[0] = {aub_stream::ENGINE_BCS, 0, false}; + cmdQ.bcsStates[0] = {0, aub_stream::ENGINE_BCS, false}; cl_int error = CL_SUCCESS; std::unique_ptr buffer(Buffer::create(&context, 0, 1, nullptr, error)); diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index 8944f24719763..ed8f4ec46982c 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -619,7 +619,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenAllBcsEnginesReadyWhenWaitingForEventThe ultCsr2.initializeTagAllocation(); ultCsr2.setupContext(osContext); - CopyEngineState copyEngineState = {aub_stream::EngineType::ENGINE_BCS2, 2, false}; + CopyEngineState copyEngineState = {2, aub_stream::EngineType::ENGINE_BCS2, false}; EngineControl engineControl = {&ultCsr2, &osContext}; auto bcs2Index = EngineHelpers::getBcsIndex(aub_stream::EngineType::ENGINE_BCS2); mockCmdQ->bcsStates[bcs2Index] = copyEngineState; diff --git a/opencl/test/unit_test/mem_obj/image_tests.cpp b/opencl/test/unit_test/mem_obj/image_tests.cpp index 16f53bd885a9f..f0e3b47c2b172 100644 --- a/opencl/test/unit_test/mem_obj/image_tests.cpp +++ b/opencl/test/unit_test/mem_obj/image_tests.cpp @@ -1559,7 +1559,7 @@ TEST(ImageConvertDescriptorTest, givenClImageDescWhenConvertedThenCorrectImageDe } TEST(ImageConvertDescriptorTest, givenImageDescriptorWhenConvertedThenCorrectClImageDescIsReturned) { - ImageDescriptor desc = {ImageType::image2D, 16, 24, 1, 1, 1024, 2048, 1, 3, false}; + ImageDescriptor desc = {16, 24, 1, 1, 1024, 2048, ImageType::image2D, 1, 3, false}; auto clDesc = Image::convertDescriptor(desc); EXPECT_EQ(clDesc.image_type, static_cast(CL_MEM_OBJECT_IMAGE2D)); diff --git a/shared/source/command_stream/command_stream_receiver_hw.h 
b/shared/source/command_stream/command_stream_receiver_hw.h index 0392c0c7b8913..4a6abaa31925e 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -25,10 +25,10 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { using STATE_BASE_ADDRESS = typename GfxFamily::STATE_BASE_ADDRESS; struct ImmediateFlushData { - PipelineSelectArgs pipelineSelectArgs{}; - size_t estimatedSize = 0; void *endPtr = nullptr; + size_t estimatedSize = 0; size_t csrStartOffset = 0; + PipelineSelectArgs pipelineSelectArgs{}; bool pipelineSelectFullConfigurationNeeded = false; bool pipelineSelectDirty = false; diff --git a/shared/source/command_stream/command_stream_receiver_simulated_common_hw.h b/shared/source/command_stream/command_stream_receiver_simulated_common_hw.h index 078ad9c9556f3..51c38895ea3c8 100644 --- a/shared/source/command_stream/command_stream_receiver_simulated_common_hw.h +++ b/shared/source/command_stream/command_stream_receiver_simulated_common_hw.h @@ -81,12 +81,12 @@ class CommandStreamReceiverSimulatedCommonHw : public CommandStreamReceiverHw { diff --git a/shared/source/device_binary_format/zebin/zeinfo.h b/shared/source/device_binary_format/zebin/zeinfo.h index d102c14c44e05..caa69bcd5dc26 100644 --- a/shared/source/device_binary_format/zebin/zeinfo.h +++ b/shared/source/device_binary_format/zebin/zeinfo.h @@ -634,12 +634,12 @@ inline constexpr BtiValueT btiValue = -1; } // namespace Defaults struct PayloadArgumentBaseT { - ArgTypeT argType = argTypeUnknown; OffsetT offset = Defaults::offset; SourceOffseT sourceOffset = Defaults::sourceOffset; SizeT size = 0; ArgIndexT argIndex = Defaults::argIndex; BtiValueT btiValue = Defaults::btiValue; + ArgTypeT argType = argTypeUnknown; AddrmodeT addrmode = memoryAddressingModeUnknown; AddrspaceT addrspace = addressSpaceUnknown; AccessTypeT accessType = accessTypeUnknown; @@ -692,9 +692,9 @@ inline constexpr Slot slot = 0U; } // 
namespace Defaults struct PerThreadMemoryBufferBaseT { + SizeT size = 0U; AllocationType allocationType = AllocationTypeUnknown; MemoryUsageT memoryUsage = MemoryUsageUnknown; - SizeT size = 0U; IsSimtThreadT isSimtThread = Defaults::isSimtThread; Slot slot = Defaults::slot; }; @@ -732,8 +732,8 @@ inline constexpr NormalizedT normalized = false; struct InlineSamplerBaseT { SamplerIndexT samplerIndex = Defaults::samplerIndex; - AddrModeT addrMode = Defaults::addrMode; FilterModeT filterMode = Defaults::filterMode; + AddrModeT addrMode = Defaults::addrMode; NormalizedT normalized = Defaults::normalized; }; } // namespace InlineSamplers diff --git a/shared/source/helpers/surface_format_info.h b/shared/source/helpers/surface_format_info.h index d9e93175f4dc1..59f773d0aa910 100644 --- a/shared/source/helpers/surface_format_info.h +++ b/shared/source/helpers/surface_format_info.h @@ -218,13 +218,13 @@ enum class ImageType { }; struct ImageDescriptor { - ImageType imageType; size_t imageWidth; size_t imageHeight; size_t imageDepth; size_t imageArraySize; size_t imageRowPitch; size_t imageSlicePitch; + ImageType imageType; uint32_t numMipLevels; uint32_t numSamples; bool fromParent; diff --git a/shared/source/memory_manager/host_ptr_defines.h b/shared/source/memory_manager/host_ptr_defines.h index c96271560a408..3575d59df75e2 100644 --- a/shared/source/memory_manager/host_ptr_defines.h +++ b/shared/source/memory_manager/host_ptr_defines.h @@ -51,9 +51,9 @@ struct AllocationRequirements { struct FragmentStorage { const void *fragmentCpuPointer = nullptr; size_t fragmentSize = 0; - int refCount = 0; OsHandle *osInternalStorage = nullptr; ResidencyData *residency = nullptr; + int refCount = 0; bool driverAllocation = false; }; diff --git a/shared/source/os_interface/linux/xe/ioctl_helper_xe.h b/shared/source/os_interface/linux/xe/ioctl_helper_xe.h index ba1078c7f134b..ed63dfe570fc6 100644 --- a/shared/source/os_interface/linux/xe/ioctl_helper_xe.h +++ 
b/shared/source/os_interface/linux/xe/ioctl_helper_xe.h @@ -190,9 +190,9 @@ class IoctlHelperXe : public IoctlHelper { std::unique_ptr defaultEngine; struct DebugMetadata { - DrmResourceClass type; uint64_t offset; uint64_t size; + DrmResourceClass type; bool isCookie; }; diff --git a/shared/source/utilities/software_tags_manager.h b/shared/source/utilities/software_tags_manager.h index 28efd165de9c1..175fe8f5183cc 100644 --- a/shared/source/utilities/software_tags_manager.h +++ b/shared/source/utilities/software_tags_manager.h @@ -19,6 +19,17 @@ class GraphicsAllocation; class LinearStream; class SWTagsManager { + protected: + void allocateBXMLHeap(Device &device); + void allocateSWTagHeap(Device &device); + + MemoryManager *memoryManager{}; + GraphicsAllocation *tagHeap = nullptr; + GraphicsAllocation *bxmlHeap = nullptr; + unsigned int currentHeapOffset = 0; + unsigned int currentTagCount = 0; + bool initialized = false; + public: SWTagsManager() = default; @@ -43,17 +54,6 @@ class SWTagsManager { static const unsigned int maxTagHeapSize = 16384; unsigned int currentCallCount = 0; unsigned int getCurrentHeapOffset() { return currentHeapOffset; } - - protected: - void allocateBXMLHeap(Device &device); - void allocateSWTagHeap(Device &device); - - MemoryManager *memoryManager{}; - GraphicsAllocation *tagHeap = nullptr; - GraphicsAllocation *bxmlHeap = nullptr; - unsigned int currentHeapOffset = 0; - unsigned int currentTagCount = 0; - bool initialized = false; }; template