inc/gpuUtil/palGpaSession.h

/*
 ***********************************************************************************************************************
 *
 *  Copyright (c) 2016-2024 Advanced Micro Devices, Inc. All Rights Reserved.
 *
 *  Permission is hereby granted, free of charge, to any person obtaining a copy
 *  of this software and associated documentation files (the "Software"), to deal
 *  in the Software without restriction, including without limitation the rights
 *  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 *  copies of the Software, and to permit persons to whom the Software is
 *  furnished to do so, subject to the following conditions:
 *
 *  The above copyright notice and this permission notice shall be included in all
 *  copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 *  SOFTWARE.
 *
 **********************************************************************************************************************/
/**
 ***********************************************************************************************************************
 * @file  palGpaSession.h
 * @brief PAL GPU utility GpaSession class.
 ***********************************************************************************************************************
 */

#pragma once

#include "palDeque.h"
#include "palDevice.h"
#include "palGpuUtil.h"
#include "palHashSet.h"
#include "palMutex.h"
#include "palPipeline.h"
#include "palVector.h"
#include "palPlatform.h"
#include "palSysMemory.h"
#include "palGpuMemory.h"
#include "palMemTrackerImpl.h"

// Forward declarations.
namespace Pal
{
    class  ICmdAllocator;
    class  ICmdBuffer;
    class  IDevice;
    class  IGpuEvent;
    class  IGpuMemory;
    class  IPerfExperiment;
    class  IQueue;
    class  IQueueSemaphore;
    struct GlobalCounterLayout;
    struct MultiSubmitInfo;
    struct ThreadTraceLayout;
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900
    enum   PipelineStageFlag : uint32;
#else
    enum   HwPipePoint : uint32;
#endif
}
struct SqttFileChunkCpuInfo;
struct SqttFileChunkAsicInfo;
struct SqttCodeObjectDatabaseRecord;

struct GpuMemoryInfo;

namespace GpuUtil
{
// Sample id initialization value.
constexpr Pal::uint32 InvalidSampleId = 0xFFFFFFFF;

/// The available states of GpaSession
enum class GpaSessionState : Pal::uint32
{
    Reset      = 0,
    Building   = 1,
    Complete   = 2,
    Ready      = 3,
};

/// The various ways you can change trace options after it has started.
enum class UpdateSampleTraceMode : Pal::uint32
{
    MinimalToFullMask      = 0, ///< Used to convert a minimal trace (needed for context in compute presents) to a full
                                ///   trace according to the options in the active trace. Requires enableSampleUpdates.
                                ///  Additionally, this must be called between BeginSample() and EndSample() and
                                ///   queue timing must also be enabled on the GpaSession when this function is called.
    StartInstructionTrace  = 1, ///< Used to enable instruction-level trace globally at any time. Can be run without an
                                ///  active sample. Useful for targeting specific parts of a frame.
    StopInstructionTrace   = 2, ///< Used to disable instruction-level trace globally at any time. Can be run without an
                                ///  active sample.
};

/// Specifies basic type of sample to perfom - either a normal set of "global" perf counters, or a trace consisting
/// of SQ thread trace and/or streaming performance counters.
enum class GpaSampleType : Pal::uint32
{
    None       = 0x0,  ///< No profile will be done.
    Cumulative = 0x1,  ///< One 64-bit result will be returned per performance counter representing the cumulative delta
                       ///  for that counter over the sample period.  Cumulative samples must begin and end in the same
                       ///  command buffer.
    Trace      = 0x2,  ///< A GPU memory buffer will be filled with hw-specific SQ thread trace and/or streaming
                       ///  performance counter data.  Trace samples may span multiple command buffers.
    Timing     = 0x3,  ///< Two 64-bit results will be recorded in beginTs and endTs to gather timestamp data.
    Query      = 0x4,  ///< A set of 11 pipeline stats will be collected.
    Count
};

/// Specifies a specific performance counter to be sampled with GpaSession::BeginSample() and GpaSession::EndSample().
///
/// This identifies a specific counter in a particular HW block instance, e.g., TCC instance 3 counter #19.  It is up
/// to the client to know the meaning of a particular counter, e.g., TCC #19 is TCC_PERF_SEL_MISS on Fiji.  Eventually,
/// PAL may want to support certain counters without the client needing HW-specific knowledge (i.e., select an enum
/// called L2MissRate from PAL rather than needing to know that counter is TCC #19 on Fiji), but GPA currently works in
/// this low-level mode with other drivers, and wants to keep the flexibility.
struct PerfCounterId
{
    Pal::GpuBlock block;    ///< Which GPU block to reference (e.g., CB, DB, TCC).
    Pal::uint32   instance; ///< Which instance of the specified GPU block to sample.  E.g., Tahiti has 12 TCC blocks
                            ///  (this number is returned per-block in the @ref Pal::GpuBlockPerfProperties structure).
                            ///  There is no shortcut to get results for all instances of block in the whole chip, the
                            ///  client must explicitly sample each instance and sum the results.
    Pal::uint32   eventId;  ///< Counter ID to sample.  Note that the meaning of a particular eventId for a block can
                            ///  change between chips.

    union
    {
        struct
        {
            Pal::uint32 spm32Bit : 1;  ///< For SPM counters, collect in 32bit instead of 16bit
            Pal::uint32 reserved : 31; ///< Reserved for future use
        };

        Pal::uint32 u32All; ///< Union value for copying

    } flags;

    // Some blocks have additional per-counter controls. They must be properly programmed when adding counters for
    // the relevant blocks. It's recommended to zero them out when not in use.
    union
    {
        struct
        {
            Pal::uint32 eventQualifier;   ///< The DF counters have an event-specific qualifier bitfield.
        } df;

        struct
        {
            Pal::uint16 eventThreshold;   ///< Threshold value for those UMC counters having event-specific threshold.
            Pal::uint8  eventThresholdEn; ///< Threshold enable (0 for disabled,1 for <threshold,2 for >threshold)
            Pal::uint8  rdWrMask;         ///< Read/Write mask select (1 for Read, 2 for Write).
        } umc;

        Pal::uint32 rs64Cntl; ///< CP blocks CPG and CPC have events that can be further filtered for processor events

        Pal::uint32 u32All; ///< Union value for copying, must be increased in size if any element of the union exceeds
    } subConfig;
};

/// Defines a set of flags for a particular gpa session.
union GpaSessionFlags
{
    struct
    {
        /// Enables timing of queue operations via Timed* functions.
        Pal::uint32 enableQueueTiming        : 1;

        /// Enables sample updates via the UpdateSampleTraceParams function.
        Pal::uint32 enableSampleUpdates      : 1;

        /// Indicates that the client will use the internal Timed*QueueSemaphore() functions for queue semaphore timing
        /// data. When not set it indicates the client will provide ETW data via the ExteralTimed* functions.
        Pal::uint32 useInternalQueueSemaphoreTiming : 1;

        /// Reserved for future use.
        Pal::uint32 reserved                 : 29;
    };

    /// Flags packed as 32-bit uint.
    Pal::uint32 u32All;
};

/// Specifies options that direct the gpa session behavior.
struct GpaSessionBeginInfo
{
    /// Gpa Session flags used to control behavior.
    GpaSessionFlags flags;
};

/// Input structure for CmdBeginGpuProfilerSample.
///
/// Defines a set of global performance counters and/or SQ thread trace data to be sampled.
struct GpaSampleConfig
{
    /// Selects what type of data should be gathered for this sample.  This can either be _cumulative_ to gather
    /// simple deltas for the specified set of perf counters over the sample period, or it can be _trace_ to generate
    /// a blob of RGP-formatted data containing SQ thread trace and/or streaming performance monitor data.
    GpaSampleType type;

    union
    {
        struct
        {
            Pal::uint32 sampleInternalOperations      :  1;  ///< Include BLTs and internal driver operations in the
                                                             ///  results.
            Pal::uint32 cacheFlushOnCounterCollection :  1;  ///< Insert cache flush and invalidate events before and
                                                             ///  after every sample.
            Pal::uint32 sqShaderMask                  :  1;  ///< If sqShaderMask is valid.
            Pal::uint32 sqWgpShaderMask               :  1;  ///< If sqWgpShaderMask is valid.
            Pal::uint32 reserved                      : 28;  ///< Reserved for future use.
        };
        Pal::uint32 u32All;                                  ///< Bit flags packed as uint32.
    } flags;                                                 ///< Bit flags controlling sample operation for all sample
                                                             ///  types.

    Pal::PerfExperimentShaderFlags sqShaderMask;    ///< Which shader stages are sampled by GpuBlock::Sq counters.
                                                    ///< Only used if flags.sqShaderMask is set to 1.
    Pal::PerfExperimentShaderFlags sqWgpShaderMask; ///< Which shader stages are sampled by GpuBlock::SqWgp counters.
                                                    ///< Only used if flags.sqWgpShaderMask is set to 1.

    struct
    {
        /// Number of entries in pIds.
        Pal::uint32 numCounters;

        /// List of performance counters to be gathered for a sample.  If the sample type is _cumulative_ this will
        /// result in "global" perf counters being sampled at the beginning of the sample period; if the sample type
        /// is _trace_ this will result in SPM data being added to the sample's resulting RGP blob.
        ///
        /// Note that it is up to the client to respect the hardware counter limit per block.  This can be
        /// determined by the maxGlobalOnlyCounters, maxGlobalSharedCounters, maxSpmCounters, and instanceGroupSize
        /// fields of @ref Pal::GpuBlockPerfProperties.
        const PerfCounterId* pIds;

        /// Period for SPM sample collection in cycles.  Only relevant for _trace_ samples.
        Pal::uint32  spmTraceSampleInterval;

        /// Maximum amount of GPU memory in bytes this sample can allocate for SPM data.  Only relevant for _trace_
        /// samples.
        Pal::gpusize gpuMemoryLimit;
    } perfCounters;  ///< Performance counter selection (valid for both _cumulative_ and _trace_ samples).

    struct
    {
        /// Number of entries in pIds.
        Pal::uint32  numCounters;

        /// Period for DF SPM sample collection in nano seconds.
        Pal::uint32  sampleInterval;

        /// Maximum amount of GPU memory in bytes this sample can allocate for DF SPM data.
        Pal::gpusize gpuMemoryLimit;

        /// List of performance counters to be gathered for a df sample. This has to be separate from the list
        /// list of normal counters because it is a completely different mechanism for gathering data.
        ///
        /// Note that it is up to the client to respect the hardware counter limit per block.  This can be
        /// determined by the maxSpmCounters fields of
        /// @ref Pal::GpuBlockPerfProperties.
        const PerfCounterId* pIds;
    } dfSpmPerfCounters;

    struct
    {
        union
        {
            struct
            {
                Pal::uint32 enable                     :  1;  ///< Include SQTT data in the trace.
                Pal::uint32 supressInstructionTokens   :  1;  ///< Prevents capturing instruction-level SQTT tokens,
                                                              ///  significantly reducing the amount of sample data.
                Pal::uint32 stallMode                  :  2;  ///< Describes behavior when buffer full
                Pal::uint32 placeholder1               :  1;
                Pal::uint32 excludeNonDetailShaderData :  1;  ///< Only emit shader tokens from the SIMD that have been
                                                              ///  selected for detail instruction tracing
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 899
                Pal::uint32 enableExecPopTokens        :  1;  ///< Output exec tokens
#else
                Pal::uint32 placeholder2               :  1;
#endif
                Pal::uint32 reserved                   : 25;  ///< Reserved for future use.
            };
            Pal::uint32 u32All;                             ///< Bit flags packed as uint32.
        } flags;                                            ///< Bit flags controlling SQTT samples.
        Pal::uint32 seMask;          ///< Mask that determines which specific SEs to run Thread trace on.
                                     ///  If 0, all SEs are enabled
        Pal::uint32 seDetailedMask;  ///< Mask that selects which specific SEs to reveal Thread trace detailed info.
                                     ///  If 0, all SEs will reveal detailed thread trace
        Pal::gpusize gpuMemoryLimit; ///< Maximum amount of GPU memory in bytes this sample can allocate for the SQTT
                                     ///  buffer.  If 0, allocate maximum size to prevent dropping tokens toward the
                                     ///  end of the sample.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 824
        Pal::uint32 tokenMask;       ///< Mask indicating which SQTT tokens are requested for capture. If a tokenMask is
                                     ///  not provided, PAL will default to collecting all tokens or tokens except
                                     ///  instruction tokens if the supressInstructionTokens flag is set. Instruction
                                     ///  tokens will always be filtered out if supressInstructionTokens = true.
#endif
    } sqtt;  ///< SQ thread trace configuration (only valid for _trace_ samples).

    struct
    {
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900
        Pal::PipelineStageFlag preSample;  ///< The pipeline stage in the GPU pipeline where the begin timestamp should
                                           ///  take place.
        Pal::PipelineStageFlag postSample; ///< The pipeline stage in the GPU pipeline where the end timestamp should
                                           ///  take place.
#else
        Pal::HwPipePoint preSample;   ///< The point in the GPU pipeline where the begin timestamp should take place.
        Pal::HwPipePoint postSample;  ///< The point in the GPU pipeline where the end timestamp should take place.
#endif
    } timing;  ///< Timestamp configuration. (only valid for timing samples)
};

/// Extra metadata about a command buffer submission
struct TimedSubmitInfo
{
    const Pal::uint64* pApiCmdBufIds;  ///< Array of api specific command buffer ids
    const Pal::uint32* pSqttCmdBufIds; ///< Array of sqtt command buffer ids
    Pal::uint64 frameIndex;            ///< The global frame index for the application.
};

/// Extra metadata about a queue semaphore operation
struct TimedQueueSemaphoreInfo
{
    Pal::uint64 semaphoreID; ///< Api specific id associated with a semaphore.
};

/// Extra metadata about a queue present operation
struct TimedQueuePresentInfo
{
    Pal::uint64 presentID; ///< Api specific id associated with a present.
};

/// Struct for storing information about gpu clock speeds.
struct GpuClocksSample
{
    Pal::uint32 gpuEngineClockSpeed; // Current speed of the gpu engine clock in MHz
    Pal::uint32 gpuMemoryClockSpeed; // Current speed of the gpu memory clock in MHz
};

/// Struct for storing CPU-side allocations of Pal::IPerfExperiment's.
struct PerfExperimentMemory
{
    void*  pMemory;     // Memory allocated for an IPerfExperiment.
    size_t memorySize;  // Size of the memory allocated in pMemory.
};

/// Struct for supplying API-dependent information about pipelines.
struct RegisterPipelineInfo
{
    Pal::uint64 apiPsoHash;  ///< Client-provided PSO hash.
};

/// Struct for supplying API-dependent information about libraries.
struct RegisterLibraryInfo
{
    Pal::uint64 apiHash;      ///< Client-provided api hash.
};

/// Struct for supplying Elf binary.
struct ElfBinaryInfo
{
    const void* pBinary;          ///< FAT Elf binary.
    Pal::uint32 binarySize;       ///< FAT Elf binary size.
    Pal::IGpuMemory* pGpuMemory;  ///< GPU Memory where the compiled ISA resides.
    Pal::gpusize offset;          ///< Offset inside GPU memory object
    Pal::uint64 originalHash;     ///< Original source/binary hash.
    Pal::uint64 compiledHash;     ///< Compiled binary hash.
};

/// Enumeration of RGP trace profiling modes
enum class TraceProfilingMode : Pal::uint32
{
    Present     = 0,    ///< Present triggered capture
    UserMarkers = 1,    ///< Capture triggered by user marker
    FrameNumber = 2,    ///< Capture based on frame number
    Tags        = 3,    ///< Tag based capture
};

/// Constant defines the maximum length for a user marker string.
static constexpr Pal::uint32 UserMarkerStringLength = 256;

/// Defines data specific to each profiling mode used to capture an RGP trace.
union TraceProfilingModeData
{
    struct
    {
        char start[UserMarkerStringLength];     ///< User marker string used to start trace capture.
        char end[UserMarkerStringLength];       ///< User marker string used to end trace capture.
    } userMarkerData;

    struct
    {
        Pal::uint32 start;                      ///< Frame number used to start the trace.
        Pal::uint32 end;                        ///< Frame number used to end the trace.
    } frameNumberData;

    struct
    {
        Pal::uint64 start;                      ///< Tag used to start the trace.
        Pal::uint64 end;                        ///< Tag used to end the trace.
    } tagData;
};

/// Enumerates the different instruction level data modes for an RGP trace
enum class InstructionTraceMode : Pal::uint32
{
    Disabled  = 0, ///< Instruction level data was disabled for trace.
    FullFrame = 1, ///< Instruction level data was enabled for the full trace.
    ApiPso    = 2, ///< Instruction level data was enabled only for a single API PSO.
};

/// Defines the data used to control enabling of instruction level data.
struct InstructionTraceModeData
{
    Pal::uint64 apiPsoHash;     ///< Hash of the API PSO targeted for instruction level data.
};

/// Struct for supplying API specific information about an RGP trace
struct SampleTraceApiInfo
{
    TraceProfilingMode       profilingMode;             ///< Profiling mode used to trigger the trace.
    TraceProfilingModeData   profilingModeData;         ///< Profiling mode specific data.
    InstructionTraceMode     instructionTraceMode;      ///< Instruction trace mode for the trace.
    InstructionTraceModeData instructionTraceModeData;  ///< Instruction trace mode data.
};

/// An enumeration of the API types.
enum class ApiType : Pal::uint32
{
    DirectX12 = 0,    ///< Represents DirectX12 API type.
    Vulkan    = 1,    ///< Represents Vulkan API type.
    Generic   = 2,    ///< Represents Generic API type.
    OpenCl    = 3,    ///< Represents OpenCL API type.
    Hip       = 5,    ///< Represents HIP API type.
};

/// Struct used for storing SQTT-specific trace information
struct SqttTraceInfo
{
    Pal::uint32 shaderEngine;    ///< Shader engine index
    Pal::uint32 computeUnit;     ///< Compute unit index
    Pal::uint32 sqttVersion;     ///< SQTT version
    Pal::uint64 bufferSize;      ///< SQTT trace buffer size
};

/// Struct used for storing SPM-specific trace information
struct SpmTraceInfo
{
    Pal::uint32 numSpmCounters;  ///< The number of SPM counters sampled in the trace
    Pal::uint32 numTimestamps;   ///< The number of timestamps that samples were taken
    Pal::uint32 sampleFrequency; ///< The SPM counter sampling frequency
};

/// Struct used for storing QueueTimings-specific trace information
struct QueueTimingsTraceInfo
{
    Pal::uint32 numQueueInfoRecords;
    Pal::uint32 numQueueEventRecords;
    Pal::uint32 queueInfoTableSize;
    Pal::uint32 queueEventTableSize;
};

/**
***********************************************************************************************************************
* @class GpaSession
* @brief Helper class providing common driver functionality required by all PAL clients that support the GPUPerfAPI
*        (GPA).  Abstracts IPerfExperiment creation, memory management, completion confirmation, and results reporting
*        at a level convenient for GPA.  Each PAL client driver will need to publish an API extension exposing this
*        support for use by GPA.
*
* A GpaSession is a container for a set of _samples_ of performance counter and/or SQ thread trace data.  Its main
* purpose is to manage resources (IPerfExperiments and their backing system/GPU memory) in an efficient manner that is
* consistent with command buffer management in modern APIs.  Consider GpaSession as a peer of DX12's command
* allocator or Vulkan's command pool objects.
*
* Basic flow of usage:
*     - Newly create sessions are in the _reset_ state.
*     - A session is moved from the _reset_ state to the _building_ state by calling Begin().
*     - Samples are added to a session by specifying desired data for each query and marking a begin and end location
*       in ICmdBuffers as they are built.  Internally required resources, like GPU memory where counters will be
*       written, are allocated from internal pools managed by the session.
*     - A session is moved from the _building_ state to the _complete_ state by calling End().
*     - The application will submit all command buffers referenced by the session.
*     - The session is confirmed as _ready_, either using standard PAL fences to confirm all assocated submission have
*       completed, or by polling IsReady() on the session.
*     - Results for all samples in the session can be queried via GetResults().
*     - Reset() should be called once results have been gathered and before building a new session.  Resources are
*       retained by the session object for use in the newly built session.  The session object must be destroyed in
*       order to fully release all resource back to the system.
*
* Cumulative-type samples may not span multiple command buffers, as other apps could interfere with the counts and
* there the final data doesn't have time-based visibility to detect that happened.
*
* @warning GpaSession is not thread safe.  Performing samples in command buffers being built simultaneously by multiple
*          threads should use multiple GpaSession objects.
***********************************************************************************************************************
*/
class GpaSession
{
    typedef Pal::IPlatform             GpaAllocator;
public:
    typedef Util::Deque<PerfExperimentMemory, GpaAllocator> PerfExpMemDeque;

    /// Constructor.
    GpaSession(
        Pal::IPlatform*      pPlatform,
        Pal::IDevice*        pDevice,
        Pal::uint16          apiMajorVer,
        Pal::uint16          apiMinorVer,
        ApiType              apiType,
        Pal::uint16          rgpInstrumentationSpecVer = 0,
        Pal::uint16          rgpInstrumentationApiVer  = 0,
        PerfExpMemDeque*     pAvailablePerfExpMem      = nullptr);

    ~GpaSession();

    /// Copy constructor creates an empty copy of a session.
    ///
    /// Newly constructed session copies the GPU memory allocations and their layout from the source session, making
    /// this a valid destination for a CopyResults command.  This new object is effectively in the _complete_ state.
    ///
    /// The purpose of such objects is to handle sampling data from bundles or nested command buffers where the same
    /// set of commands might be executed multiple times from a single root-level command buffer.  The client should
    /// note such cases, and create a copy of the bundle's session for each invocation, then call CopyResults() from
    /// the original session into the copy after the invocation.
    ///
    /// @param [in] src Session to be copied.  Must either be in the _complete_ or _ready_ state.
    explicit GpaSession(const GpaSession& src);

    /// Initialize the newly constructed GPA session.
    Pal::Result Init();

    /// Registers a queue with the GpaSession that will be submitted to using TimedSubmit. This must be called on any
    /// queues that are submitted to via the Timed* functions. For Timed* signal and wait queue semaphore events, a
    /// valid queueContext will be required (queueContext not equal to 0).
    Pal::Result RegisterTimedQueue(Pal::IQueue* pQueue,
                                   Pal::uint64 queueId,
                                   Pal::uint64 queueContext);

    /// Unregisters a queue prior to object destruction, and ensure that associated resources are destroyed. Work can
    /// no longer be submitted on the queue after this has been called.
    Pal::Result UnregisterTimedQueue(Pal::IQueue* pQueue);

    Pal::Result TimedSubmit(Pal::IQueue*                pQueue,
                            const Pal::MultiSubmitInfo& submitInfo,
                            const TimedSubmitInfo&      timedSubmitInfo);

    /// Executes a timed queue semaphore signal through the given queue.  The HW time is measured when the queue semaphore
    /// is signaled.
    Pal::Result TimedSignalQueueSemaphore(Pal::IQueue* pQueue,
                                          Pal::IQueueSemaphore* pQueueSemaphore,
                                          const TimedQueueSemaphoreInfo& timedSignalInfo,
                                          Pal::uint64  value = 0);

    /// Executes a timed queue semaphore wait through the given queue.  The HW time is measured when the queue semaphore
    /// wait finishes.
    Pal::Result TimedWaitQueueSemaphore(Pal::IQueue* pQueue,
                                        Pal::IQueueSemaphore* pQueueSemaphore,
                                        const TimedQueueSemaphoreInfo& timedWaitInfo,
                                        Pal::uint64  value = 0);

    /// Injects a timed queue present event.
    Pal::Result TimedQueuePresent(Pal::IQueue* pQueue,
                                  const TimedQueuePresentInfo& timedPresentInfo);

    /// Injects a timed wait queue semaphore event using information supplied by an external source.
    /// A valid queueContext (queueContext not equal to 0) is needed for this function.
    Pal::Result ExternalTimedWaitQueueSemaphore(Pal::uint64 queueContext,
                                                Pal::uint64 cpuSubmissionTimestamp,
                                                Pal::uint64 cpuCompletionTimestamp,
                                                const TimedQueueSemaphoreInfo& timedWaitInfo);

    /// Injects a timed signal queue semaphore event using information supplied by an external source.
    /// A valid queueContext (queueContext not equal to 0) is needed for this function.
    Pal::Result ExternalTimedSignalQueueSemaphore(Pal::uint64 queueContext,
                                                  Pal::uint64 cpuSubmissionTimestamp,
                                                  Pal::uint64 cpuCompletionTimestamp,
                                                  const TimedQueueSemaphoreInfo& timedSignalInfo);

    /// Queries the engine and memory clocks from DeviceProperties
    Pal::Result SampleGpuClocks(GpuClocksSample* pGpuClocksSample) const;

    /// Samples the timing clocks if queue timing is enabled and adds a clock sample entry to the current session.
    Pal::Result SampleTimingClocks();

    /// Moves the session from the _reset_ state to the _building_ state.
    ///
    /// Invalid to call Begin() on a session that isn't in the _reset_ state.
    ///
    /// @param [in] info Information about the gpa sessions desired behavior.
    ///
    /// @returns Success if the session was successfully moved to the _building_ state.  Otherwise, possible errors
    ///          include:
    ///          + ErrorUnavailable if the sessions isn't current in the _reset_ state.
    Pal::Result Begin(const GpaSessionBeginInfo& info);

    /// Moves the session from the _building_ state to the _complete_ state.
    ///
    /// Invalid to call End() on a session that isn't in the _building_ state.  The implementation _may_ insert GPU
    /// commands into the specified pCmdBuf - in the case of a session that spans multiple command buffers, the
    /// command buffer specified to End() _must_ be the last command buffer of the session that is submitted.
    ///
    /// @param [in] pCmdBuf Last (normally _only_) command buffer of the session.  Can be used by implementation
    ///                     to insert GPU commands required after all samples are inserted (e.g., to confirm session
    ///                     completion).
    ///
    /// @returns Success if the session was successfully moved to the _complete_ state.  Otherwise, possible errors
    ///          include:
    ///          + ErrorUnavailable if the sessions isn't current in the _building_ state.
    Pal::Result End(Pal::ICmdBuffer* pCmdBuf);

    /// Marks the beginning of a range of GPU operations to be measured and specifies what data should be recorded.
    ///
    /// It is possible the sample will not succeed due to internal memory allocation failure, etc.  In those cases,
    /// the session will be marked invalid and no sample commands will be inserted.  Reporting of this error is
    /// delayed until GetResults().
    ///
    /// A note for GpuBlock::SqWgp
    /// Client of palPerfExperiment may configure counters of GpuBlock::SqWgp based on a per-wgp granularity
    /// only if the following are disabled: GFXOFF, virtualization/SRIOV, VDDGFX (power down features), clock
    /// gating (CGCG) and power gating. PAL expose this feature to clients.
    /// If any of the conditions above cannot be met, it's the client's job to set all WGPs in the same SE to the same
    /// perf counter programming. In this case, GpuBlock::SqWgp's perf counter works on a per-SE granularity.
    /// Strictly speaking, it's not true that the counters work on a per-SE granularity when those power features
    /// are enabled. It's all still per-WGP in HW, we just can't support different counter configs within the same SE.
    /// The counter data is still reported per WGP (not aggregated for the whole SE).
    ///
    /// Check the following two documents for details:
    ///
    /// @param [in]  pCmdBuf      Command buffer to issue the begin sample commands.  All operations performed
    ///                           between executing the BeginSample() and EndSample() GPU commands will contribute to
    ///                           the sample results.
    /// @param [in]  sampleConfig Describes what data should be sampled.
    /// @param [out] pSampleId    An ID corresponding to this sample.  This ID should be recorded and passed back to
    ///                           EndSample() when the sampled command buffer range is complete.  This ID should also
    ///                           be passed to GetResults() when the session is in the _ready_ state in order to get
    ///                           the results of this sample.
    ///
    /// @returns Success if the update was successful.  Unsupported if the sample config type is not supported.
    ///          Otherwise, possible errors include:
    ///          + ErrorInvalidPointer if pCmdBuf or pSampleId is nullptr.
    Pal::Result BeginSample(
        Pal::ICmdBuffer*       pCmdBuf,
        const GpaSampleConfig& sampleConfig,
        Pal::uint32*           pSampleId);

    /// Updates the trace parameters for a specific sample.
    ///
    /// @param [in] pCmdBuf    Command buffer to issue the update commands.
    /// @param [in] sampleId   Identifies the sample to be updated, if required by the mode.  This should be a value
    ///                        returned by BeginSample(), and must correspond to a thread trace sample.
    /// @param [in] updateMode The way the sample parameters should be set. Some modes have additional restrictions.
    ///                        @see UpdateSampleTraceMode
    ///
    /// @returns Success if the update was successful.  Otherwise, possible errors
    ///          include:
    ///          + ErrorInvalidPointer if pCmdBuf is nullptr.
    ///          + ErrorInvalidObjectType if a sample is required and the sample associated with sampleId is not a
    ///                                   trace sample.
    Pal::Result UpdateSampleTraceParams(
        Pal::ICmdBuffer*          pCmdBuf,
        Pal::uint32               sampleId,
        UpdateSampleTraceMode     updateMode);

    /// Marks the end of a range of command buffer operations to be measured.
    ///
    /// @param [in] pCmdBuf  Command buffer to issue the end sample commands.  All operations performed between
    ///                      executing the BeginSample() and EndSample() GPU commands will contribute to the sample
    ///                      results.  _Cumulative_ samples (i.e., global performance counter samples) must never span
    ///                      multiple command buffers (EndSample() should be called in the same command buffer as
    ///                      BeginSample()).
    /// @param [in] sampleId Identifies the sample to be ended.  This should be the value returned by BeginSample()
    ///                      for the sample that is being ended.
    ///
    /// @note BeginSample() must be called before EndSample() _and_ the GPU commands inserted by BeginSample() must be
    ///       executed before the command inserted by EndSample().  Since a session is a single-threaded object, this
    ///       will normally happen naturally.
    void EndSample(
        Pal::ICmdBuffer* pCmdBuf,
        Pal::uint32      sampleId);

    /// Copies the DF SPM trace buffer to the GpaSession result buffer
    ///
    /// @param [in] pCmdBuf  Command buffer to issue the copy commands.
    /// @param [in] sampleId Identifies the sample to be copied.
    /// @note This must be called after a command buffer with the dfSpmTraceEnd CmdBufInfo flag
    ///       and with a separate command buffer. DF SPM traces are on a per command buffer granularity
    ///       because they are started and stopped by the KMD.
    void CopyDfSpmTraceResults(
        Pal::ICmdBuffer* pCmdBuf,
        Pal::uint32      sampleId);

    /// Provides API specific information about an RGP trace.
    ///
    /// @param [in] traceApiInfo  Const reference to the struct of API specific information.
    /// @param [in] sampleId      Sample ID (returned by BeginSample) for the RGP trace type sample info is being
    ///                           provided for.
    void SetSampleTraceApiInfo(
        const SampleTraceApiInfo& traceApiInfo,
        Pal::uint32               sampleId) const;

    /// Reports if GPU execution of this session has completed and results are _ready_ for querying from the CPU via
    /// GetResults().
    ///
    /// @returns true if all samples in the session have completed GPU execution.
    bool IsReady() const;

    /// Reports results of a particular sample.  Only valid for sessions in the _ready_ state.
    ///
    /// Results will be formatted depending on the sample type:
    /// + Cumulative: Results will be an array of uint64 values in the order of perf counter IDs specified by
    ///               BeginSample().
    /// + SqThreadTrace: Results will be a binary blob in the RGP file format.
    ///
    /// @param [in]     sampleId     Sample to be reported.  Corresponds to value returned by BeginSample().
    /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space
    ///                              available in pData, and *pSizeInBytes will be set to the amount of space written
    ///                              to pData.  If pData is null, *pSizeInBytes will be set to the amount of space
    ///                              required.
    /// @param [out]    pData        Can be null to query how much size is required (should only be necessary when
    ///                              getting RGP data).  If non-null, the sample results will be written to this
    ///                              location.
    ///
    /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required
    ///          size is successfully written to pSizeInBytes).  Otherwise, possible errors include:
    ///          + ErrorUnavailable if the session is not in the _ready_ state.
    ///          + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources.
    ///          + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results.
    Pal::Result GetResults(
        Pal::uint32 sampleId,
        size_t*     pSizeInBytes,
        void*       pData) const;

    /// Retrieves the SQTT results. Only valid for sessions in the _complete_ state.
    ///
    /// @param [in]     sampleId       Sample to be reported.  Corresponds to value returned by BeginSample().
    /// @param [in]     traceIndex     The index of the trace to get.
    /// @param [out]    pTraceInfoOut  Optional pointer to a structure which will be written with information about the trace.
    /// @param [in,out] pSizeInBytes   If pData is non-null, the input value of *pSizeInBytes is the amount of space
    ///                                available in pData, and *pSizeInBytes will be set to the amount of space written
    ///                                to pData.  If pData is null, *pSizeInBytes will be set to the amount of space
    ///                                required.
    /// @param [out]    pData          Can be null to query how much size is required.
    ///                                If non-null, the sample results will be written to this location.
    ///
    /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required
    ///          size is successfully written to pSizeInBytes).  Otherwise, possible errors include:
    ///          + ErrorUnavailable if the session is not in the _ready_ state.
    ///          + NotFound if the given index is not valid.
    ///          + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources.
    ///          + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results.
    //           + ErrorInvalidPointer if pSizeInBytes is NULL.
    Pal::Result GetSqttTraceData(
        Pal::uint32    sampleId,
        Pal::uint32    traceIndex,
        SqttTraceInfo* pTraceInfo,
        size_t*        pSizeInBytes,
        void*          pData) const;

    /// Retrieves the SPM trace results of a particular sample. Only valid for 'Trace' type samples and sessions
    /// in the _complete_ state.
    ///
    /// Results in the output buffer are a binary blob formatted according to the RGP specification.
    /// The data layout of the populated output buffer is as follows:
    ///     - Timestamps array        [size: "numTimestamps * sizeof(uint64)" bytes]
    ///     - SpmCounterInfo array    [size: "numSpmCounters * sizeof(SpmCounterInfo)" bytes]
    ///     - SPM Counter Data matrix [size: "*pSizeInBytes - (timestamps array + SpmCounterInfo array size)" bytes]
    ///
    /// The SPM Counter Data matrix is laid out linearly in a row-major format. There are "numSpmCounters" rows and
    /// "numTimestamps" columns. Each element in the matrix is either 16- or 32-bits, based on the "dataSize" field
    /// of the corresponding "SpmCounterInfo" entry.
    ///
    /// @param [in]     sampleId          Sample to be reported.  Corresponds to value returned by BeginSample().
    /// @param [out]    pTraceInfo        Optional. If non-null, this structure is populated with trace metadata.
    /// @param [in,out] pSizeInBytes      If pData is non-null, the input value of *pSizeInBytes is the amount of space
    ///                                   available in pData.
    ///                                   If pData is null, *pSizeInBytes will be set to the amount of space
    ///                                   required.
    /// @param [out]    pData             Can be null to query how much size is required.
    ///                                   If non-null, the sample results will be written to this location.
    ///
    /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required
    ///          size is successfully written to pSizeInBytes).  Otherwise, possible errors include:
    ///          + ErrorUnavailable if the session is not in the _ready_ state.
    ///          + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources.
    ///          + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results.
    Pal::Result GetSpmTraceData(
        Pal::uint32   sampleId,
        SpmTraceInfo* pTraceInfo,
        size_t*       pSizeInBytes,
        void*         pData) const;

    /// Retrieves the Queue Timings data from the active GpaSession.
    /// Only valid when the GpaSession had `enableQueueTiming` flag set.
    ///
    /// @param [out]    pTraceInfo        Optional. If non-null, this structure is populated with metadata.
    /// @param [in,out] pSizeInBytes      If pData is non-null, the input value of *pSizeInBytes is the amount of space
    ///                                   available in pData.
    ///                                   If pData is null, *pSizeInBytes will be set to the amount of space
    ///                                   required.
    /// @param [out]    pData             Can be null to query how much size is required.
    ///                                   If non-null, the sample results will be written to this location.
    ///
    /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required
    ///          size is successfully written to pSizeInBytes).  Otherwise, possible errors include:
    ///          + ErrorUnavailable if the session was not configured with `enableQueueTiming`.
    Pal::Result GetQueueTimingsData(
        QueueTimingsTraceInfo* pTraceInfo,
        size_t*                pSizeInBytes,
        void*                  pData) const;

    /// Moves the session to the _reset_ state, marking all sessions resources as unused and available for reuse when
    /// the session is re-built.
    ///
    /// @warning This function cannot be called when the session is queued for execution on the GPU.  The client must
    ///          confirm this is not the case using IsReady(), fences, etc.
    ///
    /// @returns Success if the session was successfully moved to the _reset_ state.  Otherwise, possible errors
    ///          include:
    ///          + ErrorUnknown if an internal PAL error occurs.
    Pal::Result Reset();

    /// Uses the GPU to copy results from a nested command buffer's session into a root-level command buffer's per-
    /// invocation session data.
    ///
    /// This command will implicitly wait for the source session (as specified in the copy constructor) to be complete
    /// then use the GPU to update this session's data.  This allows the client to get accurate sample data in the
    /// case where a nested command buffer is launched multiple times from the same root-level command buffer.
    ///
    /// The session remains in the _complete_ state after calling this, and the client should submit the commands
    /// and verify their completion to move to the _ready_ state.
    ///
    /// @param pCmdBuf Command buffer where the session copy should be performed.
    void CopyResults(Pal::ICmdBuffer* pCmdBuf);

    /// Register pipeline with GpaSession for obtaining shader dumps and load events in the RGP file.
    ///
    /// @param [in] pPipeline   The PAL pipeline to be tracked.
    /// @param [in] clientInfo  API-dependent information for this pipeline to also be recorded.
    ///
    /// @returns Success if the pipeline has been registered with GpaSession successfully.
    ///          + AlreadyExists if a duplicate pipeline is provided.
    Pal::Result RegisterPipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo);

    /// Unregister pipeline with GpaSession for obtaining unload events in the RGP file.
    /// This should be called immediately before destroying the PAL pipeline object.
    ///
    /// @param [in] pPipeline  The PAL pipeline to be tracked.
    ///
    /// @returns Success if the pipeline has been unregistered with GpaSession successfully.
    Pal::Result UnregisterPipeline(const Pal::IPipeline* pPipeline);

    /// Register library with GpaSession for obtaining shader dumps and load events in the RGP file.
    ///
    /// @param [in] pLibrary   The PAL library to be tracked.
    /// @param [in] clientInfo API-dependent information for this library to also be recorded.
    ///
    /// @returns Success if the library has been registered with GpaSession successfully.
    ///          + AlreadyExists if a duplicate library is provided.
    Pal::Result RegisterLibrary(const Pal::IShaderLibrary* pLibrary, const RegisterLibraryInfo& clientInfo);

    /// Unregister library with GpaSession for obtaining unload events in the RGP file.
    /// This should be called immediately before destroying the PAL library object.
    ///
    /// @param [in] pLibrary The PAL library to be tracked.
    ///
    /// @returns Success if the library has been unregistered with GpaSession successfully.
    Pal::Result UnregisterLibrary(const Pal::IShaderLibrary* pLibrary);

    /// Register ELF binary with GpaSession for obtaining kernel dumps and load events in the RGP file.
    ///
    /// @param [in] elfBinaryInfo Contains information about the Elf binary to be recorded.
    ///
    /// @returns Success if the Elf binary has been registered with GpaSession successfully.
    Pal::Result RegisterElfBinary(const ElfBinaryInfo& elfBinaryInfo);

    /// Unregister Elf binary with GpaSession for obtaining unload events in the RGP file.
    /// This should be called immediately before destroying the Elf binary.
    ///
    /// @param [in] elfBinaryInfo  Contains the elf binary info to be removed from tracking.
    ///
    /// @returns Success if the library has been unregistered with GpaSession successfully.
    Pal::Result UnregisterElfBinary(const ElfBinaryInfo& elfBinaryInfo);

    /// Given a Pal device, validate a list of perfcounters.
    ///
    /// @param [in] pDevice      a given device
    /// @param [in] pCounters    a list of perf counters.
    /// @param [in] numCounters  perf counter counts.
    ///
    /// @returns Success if counters are valid.
    Pal::Result ValidatePerfCounters(Pal::IDevice*        pDevice,
                                     const PerfCounterId* pCounters,
                                     const Pal::uint32    numCounters);

private:
    // Tracking structure for a single IGpuMemory allocation owned by a GpaSession::GpaSession. In particular, it
    // tracks the associated CPU pointer since these allocations remain mapped for CPU access for their lifetime.
    struct GpuMemoryInfo
    {
        Pal::IGpuMemory* pGpuMemory;
        void*            pCpuAddr;
    };

    // Event type for code object load events
    enum class CodeObjectLoadEventType
    {
        LoadToGpuMemory     = 0,
        UnloadFromGpuMemory
    };

    // Represents all information to be contained in one SqttCodeObjectLoaderEventRecord
    struct CodeObjectLoadEventRecord
    {
        CodeObjectLoadEventType eventType;
        Pal::uint64             baseAddress;
        Pal::ShaderHash         codeObjectHash;
        Pal::uint64             timestamp;
    };

    // Represents all information to be contained in one SqttPsoCorrelationRecord
    struct PsoCorrelationRecord
    {
        Pal::uint64        apiPsoHash;
        Pal::PipelineHash  internalPipelineHash;
    };

    // Registers a single (non-archive) pipeline with the GpaSession. Returns AlreadyExists on duplicate PAL pipeline.
    Pal::Result RegisterSinglePipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo);

    // Unregisters a single (non-archive) pipeline from the GpaSession.
    Pal::Result UnregisterSinglePipeline(const Pal::IPipeline* pPipeline);

    Pal::IDevice*const            m_pDevice;                    // Device associated with this GpaSession.
    Pal::DeviceProperties         m_deviceProps;
    Pal::SetClockModeOutput       m_peakClockFrequency;         // Output of query for stable peak, values in Mhz
    Pal::PerfExperimentProperties m_perfExperimentProps;
    Pal::uint32                   m_timestampAlignment;         // Pre-calculated timestamp data alignment.
    ApiType                       m_apiType;                    // API type, e.g. Vulkan, used in RGP dumps.
    Pal::uint16                   m_apiMajorVer;                // API major version, used in RGP dumps.
    Pal::uint16                   m_apiMinorVer;                // API minor version, used in RGP dumps.
    Pal::uint16                   m_instrumentationSpecVersion; // Spec version of RGP instrumetation.
    Pal::uint16                   m_instrumentationApiVersion;  // Api version of RGP instrumetation.

    Pal::IGpuEvent*               m_pGpuEvent;
    GpaSessionState               m_sessionState;

    const GpaSession* const       m_pSrcSession;                // source session for session created via copy c'tor

    // Tracks the current GPU memory object and offset being sub-allocated for AcquireGpuMem().
    GpuMemoryInfo                 m_curGartGpuMem;
    Pal::gpusize                  m_curGartGpuMemOffset;
    GpuMemoryInfo                 m_curLocalGpuMem;
    Pal::gpusize                  m_curLocalGpuMemOffset;
    GpuMemoryInfo                 m_curInvisGpuMem;
    Pal::gpusize                  m_curInvisGpuMemOffset;

    // Locks for the local-invisible, gart and local memory subdivision (and their pools)
    Util::Mutex m_gartGpuMemLock;
    Util::Mutex m_localGpuMemLock;
    Util::Mutex m_invisGpuMemLock;

    // Counts number of samples that are active in this GpaSession.
    Pal::uint32                   m_sampleCount;

    Pal::IPlatform*const          m_pPlatform;                  // Platform associated with this GpaSesion.

    // GartHeap / LocalHeap / InvisHeap GPU chunk pools.
    Util::Deque<GpuMemoryInfo, GpaAllocator> m_availableGartGpuMem;
    Util::Deque<GpuMemoryInfo, GpaAllocator> m_busyGartGpuMem;
    Util::Deque<GpuMemoryInfo, GpaAllocator> m_availableLocalGpuMem;
    Util::Deque<GpuMemoryInfo, GpaAllocator> m_busyLocalGpuMem;
    Util::Deque<GpuMemoryInfo, GpaAllocator> m_availableInvisGpuMem;
    Util::Deque<GpuMemoryInfo, GpaAllocator> m_busyInvisGpuMem;

    struct SampleItem;
    class PerfSample;
    class CounterSample;
    class TraceSample;
    class TimingSample;
    class QuerySample;

    Util::Vector<SampleItem*, 16, GpaAllocator> m_sampleItemArray;
    PerfExpMemDeque* m_pAvailablePerfExpMem;

    // Unique pipelines registered with this GpaSession.
    Util::HashSet<Pal::uint64, GpaAllocator, Util::JenkinsHashFunc> m_registeredPipelines;
    // Unique API PSOs registered with this GpaSession.
    Util::HashSet<Pal::uint64, GpaAllocator, Util::JenkinsHashFunc> m_registeredApiHashes;

    // List of cached pipeline code object records that will be copied to the final database at the end of a trace
    Util::Deque<SqttCodeObjectDatabaseRecord*, GpaAllocator>  m_codeObjectRecordsCache;
    // List of pipeline code object records that were registered during a trace
    Util::Deque<SqttCodeObjectDatabaseRecord*, GpaAllocator>  m_curCodeObjectRecords;

    // List of cached code object load event records that will be copied to the final database at the end of a trace
    Util::Deque<CodeObjectLoadEventRecord, GpaAllocator>  m_codeObjectLoadEventRecordsCache;
    // List of code object load event records that were registered during a trace
    Util::Deque<CodeObjectLoadEventRecord, GpaAllocator>  m_curCodeObjectLoadEventRecords;

    // List of cached PSO correlation records that will be copied to the final database at the end of a trace
    Util::Deque<PsoCorrelationRecord, GpaAllocator>  m_psoCorrelationRecordsCache;
    // List of PSO correlation records that were registered during a trace
    Util::Deque<PsoCorrelationRecord, GpaAllocator>  m_curPsoCorrelationRecords;

    Util::RWLock m_registerPipelineLock;

    // Event type for timed queue events
    enum class TimedQueueEventType : Pal::uint32
    {
        Submit,
        Signal,
        Wait,
        Present,
        ExternalSignal,
        ExternalWait
    };

    // Struct that contains information about a specific timed queue event.
    struct TimedQueueEventItem
    {
        TimedQueueEventType eventType;      // Type of event
        Pal::uint64         cpuTimestamp;   // Time when the event was processed on the cpu
        Pal::uint64         apiId;          // The api specific id for the queue event
        Pal::uint32         sqttCmdBufId;   // The sqtt command buffer id value associated with a submit event
        Pal::uint32         submitSubIndex; // The sub index of an event within a submission event.
        Pal::uint32         queueIndex;     // The index of the associated queue in the m_timedQueuesArray
        Pal::uint64         frameIndex;     // The index of the current frame being rendered
        union
        {
            struct
            {
                GpuMemoryInfo memInfo[2];   // The gpu memory for the timestamps associated with the event
                Pal::gpusize  offsets[2];   // Memory offsets for the associated timestamp gpu memory
            } gpuTimestamps;

            Pal::uint64       cpuCompletionTimestamp; // The time when the event completed on the cpu
        };
    };

    // Struct for keeping track of timed operation on a specific queue
    struct TimedQueueState
    {
        Pal::IQueue*                                 pQueue;               // Pal Queue
        Pal::uint64                                  queueId;              // Api specific queue id
        Pal::uint64                                  queueContext;         // Api specific queue context
        Pal::QueueType                               queueType;            // Queue type
        Pal::EngineType                              engineType;           // Engine type
        bool                                         valid;                // Used to track if the queue is valid
        Util::Deque<Pal::ICmdBuffer*, GpaAllocator>* pAvailableCmdBuffers; // List of available cmdbuffers
        Util::Deque<Pal::ICmdBuffer*, GpaAllocator>* pBusyCmdBuffers;      // List of busy cmdbuffers
        Pal::IFence*                                 pFence;               // Used to track queue
                                                                           // operations
    };

    // Flags for the current session.
    GpaSessionFlags m_flags;

    // Array containing all of the queues registered for timing operations
    Util::Vector<TimedQueueState*, 8, GpaAllocator> m_timedQueuesArray;
    Util::RWLock m_timedQueuesArrayLock;

    // List of timed queue events for the current session
    Util::Vector<TimedQueueEventItem, 16, GpaAllocator> m_queueEvents;
    Util::Mutex m_queueEventsLock;

    // List of timestamp calibration samples
    Util::Vector<Pal::CalibratedTimestamps, 4, GpaAllocator> m_timestampCalibrations;

    // The most recent gpu clocks sample
    GpuClocksSample m_lastGpuClocksSample;

    // Internal command allocator used for timing command buffers
    Pal::ICmdAllocator* m_pCmdAllocator;

    // Finds the TimedQueueState associated with pQueue.
    Pal::Result FindTimedQueue(Pal::IQueue* pQueue,
                               TimedQueueState** ppQueueState,
                               Pal::uint32* pQueueIndex);

    // Finds the TimedQueueState associated with queueContext.
    Pal::Result FindTimedQueueByContext(Pal::uint64 queueContext,
                                        TimedQueueState** ppQueueState,
                                        Pal::uint32* pQueueIndex);

    /// Injects an external timed queue semaphore operation event
    Pal::Result ExternalTimedQueueSemaphoreOperation(Pal::uint64 queueContext,
                                                     Pal::uint64 cpuSubmissionTimestamp,
                                                     Pal::uint64 cpuCompletionTimestamp,
                                                     const TimedQueueSemaphoreInfo& timedSemaphoreInfo,
                                                     bool isSignalOperation);

    /// Converts a CPU timestamp to a GPU timestamp using a CalibratedTimestamps struct
    Pal::uint64 ConvertCpuTimestampToGpuTimestamp(Pal::uint64                      cpuTimestamp,
                                                  const Pal::CalibratedTimestamps& calibration) const;

    /// Extracts a GPU timestamp from a queue event
    Pal::uint64 ExtractGpuTimestampFromQueueEvent(const TimedQueueEventItem& queueEvent) const;

    // Creates a new command buffer for use on pQueue
    Pal::Result CreateCmdBufferForQueue(Pal::IQueue* pQueue,
                                        Pal::ICmdBuffer** ppCmdBuffer);

    // Acquires a command buffer from the TimedQueueState's command buffer pool
    Pal::Result AcquireTimedQueueCmdBuffer(TimedQueueState* pQueueState,
                                           Pal::ICmdBuffer** ppCmdBuffer);

    // Recycles busy command buffers in pQueueState
    Pal::Result RecycleTimedQueueCmdBuffers(TimedQueueState* pQueueState);

    // Preallocates a fixed number of command buffers for pQueueState and adds them to the command buffer pool
    Pal::Result PreallocateTimedQueueCmdBuffers(TimedQueueState* pQueueState,
                                                Pal::uint32 numCmdBuffers);

    // Resets all per session state in pQueueState
    Pal::Result ResetTimedQueueState(TimedQueueState* pQueueState);

    // Destroys the memory and resources for pQueueState
    void DestroyTimedQueueState(TimedQueueState* pQueueState);

    // Helper function to import one sample item from a source session to copy session.
    Pal::Result ImportSampleItem(const SampleItem* pSrcSampleItem);

    // Acquires a range of queue-owned GPU memory for use by the next command buffer submission.
    Pal::Result AcquireGpuMem(
        Pal::gpusize            size,
        Pal::gpusize            alignment,
        Pal::GpuHeap            heapType,
        Pal::GpuMemMallPolicy   mallPolicy,
        GpuMemoryInfo*          pGpuMem,
        Pal::gpusize*           pOffset);

    // Acquires a GpaSession-owned performance experiment based on the device's active perf counter requests.
    Pal::Result AcquirePerfExperiment(
        GpaSession::SampleItem* pSampleItem,
        const GpaSampleConfig&  sampleConfig,
        GpuMemoryInfo*          pGpuMem,
        Pal::gpusize*           pOffset,
        GpuMemoryInfo*          pSecondaryGpuMem,
        Pal::gpusize*           pSecondaryOffset,
        Pal::gpusize*           pHeapSize,
        Pal::IPerfExperiment**  ppExperiment);

    // Acquires a session-owned pipeline stats query.
    Pal::Result AcquirePipeStatsQuery(
        GpuMemoryInfo*          pGpuMem,
        Pal::gpusize*           pOffset,
        Pal::gpusize*           pHeapSize,
        Pal::IQueryPool**       ppQuery);

    // Dump SQ thread trace data in rgp format
    Pal::Result DumpRgpData(const GpaSampleConfig* pTraceConfig,
                            TraceSample*           pTraceSample,
                            void*                  pRgpOutput,
                            size_t*                pTraceSize) const;

    // Dumps the spm trace data in the buffer provided.
    Pal::Result AppendSpmTraceData(TraceSample*  pTraceSample,
                                   size_t        bufferSize,
                                   void*         pData,
                                   Pal::gpusize* pSizeInBytes) const;

    // Dumps the df spm trace data in the buffer provided.
    Pal::Result AppendDfSpmTraceData(TraceSample*  pTraceSample,
                                     size_t        bufferSize,
                                     void*         pData,
                                     Pal::gpusize* pSizeInBytes) const;

    Pal::Result AddCodeObjectLoadEvent(const Pal::IPipeline* pPipeline, CodeObjectLoadEventType eventType);
    Pal::Result AddCodeObjectLoadEvent(const Pal::IShaderLibrary* pLibrary, CodeObjectLoadEventType eventType);
    Pal::Result AddCodeObjectLoadEvent(const ElfBinaryInfo& elfBinaryInfo, CodeObjectLoadEventType eventType);

    // Recycle used Gart rafts and put back to available pool
    void RecycleGartGpuMem();

    // Recycle used Local rafts and put back to available pool
    void RecycleLocalGpuMem();

    // Recycle used Invisible rafts and put back to available pool
    void RecycleInvisGpuMem();

    // Destroy and free one sample item and its sub-items.
    void FreeSampleItem(GpaSession::SampleItem* pSampleItem);

    // Destroy and free the m_sampleItemArray and associated memory allocation
    void FreeSampleItemArray();

    // Destroy the sub-items in m_sampleItemArray but keep associated memory allocations.
    void RecycleSampleItemArray();

    // Helper function to destroy the GpuMemoryInfo object
    void DestroyGpuMemoryInfo(GpuMemoryInfo* pGpuMemoryInfo);

    PAL_DISALLOW_DEFAULT_CTOR(GpaSession);
    GpaSession& operator =(const GpaSession&);
};
} // GpuUtil