mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-20 00:24:58 +08:00
feature: Add heapless mode programming in ocl
Related-To: NEO-7621
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
c35b13ccae
commit
ce7298d512
@@ -17,6 +17,7 @@
|
||||
#include "shared/source/debugger/debugger_l0.h"
|
||||
#include "shared/source/execution_environment/root_device_environment.h"
|
||||
#include "shared/source/helpers/aligned_memory.h"
|
||||
#include "shared/source/helpers/compiler_product_helper.h"
|
||||
#include "shared/source/memory_manager/allocation_properties.h"
|
||||
#include "shared/source/memory_manager/memory_manager.h"
|
||||
#include "shared/source/os_interface/os_context.h"
|
||||
@@ -98,6 +99,8 @@ ze_result_t CommandQueueImp::initialize(bool copyOnly, bool isInternal, bool imm
|
||||
this->doubleSbaWa = productHelper.isAdditionalStateBaseAddressWARequired(hwInfo);
|
||||
this->cmdListHeapAddressModel = L0GfxCoreHelper::getHeapAddressModel(rootDeviceEnvironment);
|
||||
this->dispatchCmdListBatchBufferAsPrimary = L0GfxCoreHelper::dispatchCmdListBatchBufferAsPrimary(rootDeviceEnvironment, !immediateCmdListQueue);
|
||||
auto &compilerProductHelper = rootDeviceEnvironment.getHelper<NEO::CompilerProductHelper>();
|
||||
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
|
||||
}
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
@@ -87,6 +87,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
|
||||
bool doubleSbaWa = false;
|
||||
bool dispatchCmdListBatchBufferAsPrimary = false;
|
||||
bool internalQueueForImmediateCommandList = false;
|
||||
bool heaplessModeEnabled = false;
|
||||
};
|
||||
|
||||
using CommandQueueAllocatorFn = CommandQueue *(*)(Device *device, NEO::CommandStreamReceiver *csr,
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "shared/source/helpers/aligned_memory.h"
|
||||
#include "shared/source/helpers/array_count.h"
|
||||
#include "shared/source/helpers/bit_helpers.h"
|
||||
#include "shared/source/helpers/compiler_product_helper.h"
|
||||
#include "shared/source/helpers/engine_node_helper.h"
|
||||
#include "shared/source/helpers/flush_stamp.h"
|
||||
#include "shared/source/helpers/get_info.h"
|
||||
@@ -102,6 +103,7 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
|
||||
auto &hwInfo = device->getHardwareInfo();
|
||||
auto &gfxCoreHelper = device->getGfxCoreHelper();
|
||||
auto &productHelper = device->getProductHelper();
|
||||
auto &compilerProductHelper = device->getCompilerProductHelper();
|
||||
auto &rootDeviceEnvironment = device->getRootDeviceEnvironment();
|
||||
|
||||
bcsAllowed = productHelper.isBlitterFullySupported(hwInfo) &&
|
||||
@@ -127,6 +129,8 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
|
||||
if (NEO::Debugger::isDebugEnabled(internalUsage) && device->getDevice().getL0Debugger()) {
|
||||
device->getDevice().getL0Debugger()->notifyCommandQueueCreated(&device->getDevice());
|
||||
}
|
||||
|
||||
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -384,6 +384,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
|
||||
void handlePostCompletionOperations(bool checkQueueCompletion);
|
||||
|
||||
// Returns the value of this queue's heaplessModeEnabled member; does not modify the queue.
bool getHeaplessModeEnabled() const { return this->heaplessModeEnabled; }
|
||||
|
||||
protected:
|
||||
void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
|
||||
cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest);
|
||||
@@ -477,6 +479,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
bool dcFlushRequiredOnStallingCommandsOnNextFlush = false;
|
||||
bool splitBarrierRequired = false;
|
||||
bool gpgpuCsrClientRegistered = false;
|
||||
bool heaplessModeEnabled = false;
|
||||
};
|
||||
|
||||
template <typename PtrType>
|
||||
|
||||
@@ -552,11 +552,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
|
||||
dispatchWalkerArgs.event = event;
|
||||
dispatchWalkerArgs.relaxedOrderingEnabled = relaxedOrderingEnabled;
|
||||
|
||||
HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
*this,
|
||||
multiDispatchInfo,
|
||||
csrDeps,
|
||||
dispatchWalkerArgs);
|
||||
HardwareInterface<GfxFamily>::dispatchWalkerCommon(*this, multiDispatchInfo, csrDeps, dispatchWalkerArgs);
|
||||
|
||||
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
|
||||
@@ -35,8 +35,9 @@ class GpgpuWalkerHelper {
|
||||
static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
|
||||
static size_t getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel);
|
||||
|
||||
template <typename WalkerType>
|
||||
static size_t setGpgpuWalkerThreadData(
|
||||
WALKER_TYPE *walkerCmd,
|
||||
WalkerType *walkerCmd,
|
||||
const KernelDescriptor &kernelDescriptor,
|
||||
const size_t globalOffsets[3],
|
||||
const size_t startWorkGroups[3],
|
||||
@@ -67,9 +68,10 @@ class GpgpuWalkerHelper {
|
||||
TagNodeBase &hwPerfCounter,
|
||||
LinearStream *commandStream);
|
||||
|
||||
template <typename WalkerType>
|
||||
static void setupTimestampPacket(
|
||||
LinearStream *cmdStream,
|
||||
WALKER_TYPE *walkerCmd,
|
||||
WalkerType *walkerCmd,
|
||||
TagNodeBase *timestampPacketNode,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
@@ -94,6 +96,7 @@ struct EnqueueOperation {
|
||||
static size_t getSizeForCacheFlushAfterWalkerCommands(const Kernel &kernel, const CommandQueue &commandQueue);
|
||||
|
||||
private:
|
||||
template <typename WalkerType>
|
||||
static size_t getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo);
|
||||
static size_t getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue);
|
||||
};
|
||||
|
||||
@@ -247,7 +247,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCS(uint32_t cmdType, bool res
|
||||
if (isCommandWithoutKernel(cmdType)) {
|
||||
return EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
|
||||
} else {
|
||||
return EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel, dispatchInfo);
|
||||
return EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel<typename GfxFamily::WALKER_TYPE>(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel, dispatchInfo);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -17,8 +17,9 @@
|
||||
namespace NEO {
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
WALKER_TYPE *walkerCmd,
|
||||
WalkerType *walkerCmd,
|
||||
const KernelDescriptor &kernelDescriptor,
|
||||
const size_t globalOffsets[3],
|
||||
const size_t startWorkGroups[3],
|
||||
@@ -58,9 +59,10 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
|
||||
LinearStream *cmdStream,
|
||||
WALKER_TYPE *walkerCmd,
|
||||
WalkerType *walkerCmd,
|
||||
TagNodeBase *timestampPacketNode,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
|
||||
@@ -78,6 +80,7 @@ void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo) {
|
||||
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS() +
|
||||
sizeof(PIPE_CONTROL) * (MemorySynchronizationCommands<GfxFamily>::isBarrierWaRequired(commandQueue.getDevice().getRootDeviceEnvironment()) ? 2 : 1);
|
||||
|
||||
@@ -21,8 +21,9 @@
|
||||
namespace NEO {
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
WALKER_TYPE *walkerCmd,
|
||||
WalkerType *walkerCmd,
|
||||
const KernelDescriptor &kernelDescriptor,
|
||||
const size_t globalOffsets[3],
|
||||
const size_t startWorkGroups[3],
|
||||
@@ -50,7 +51,7 @@ size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
}
|
||||
|
||||
walkerCmd->setExecutionMask(static_cast<uint32_t>(executionMask));
|
||||
walkerCmd->setSimdSize(getSimdConfig<WALKER_TYPE>(simd));
|
||||
walkerCmd->setSimdSize(getSimdConfig<WalkerType>(simd));
|
||||
walkerCmd->setMessageSimd(walkerCmd->getSimdSize());
|
||||
|
||||
if (DebugManager.flags.ForceSimdMessageSizeInWalker.get() != -1) {
|
||||
@@ -64,6 +65,7 @@ size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
// 1) cross-thread inline data will be put into R1, but if kernel uses local ids, then cross-thread should be put further back
|
||||
// so whenever local ids are driver or hw generated, reserve space by setting right values for emitLocalIds
|
||||
// 2) Auto-generation of local ids should be possible, when in fact local ids are used
|
||||
|
||||
if (!localIdsGenerationByRuntime && kernelUsesLocalIds) {
|
||||
uint32_t emitLocalIdsForDim = 0;
|
||||
if (kernelDescriptor.kernelAttributes.localId[0]) {
|
||||
@@ -77,6 +79,7 @@ size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
}
|
||||
walkerCmd->setEmitLocalId(emitLocalIdsForDim);
|
||||
}
|
||||
|
||||
if (inlineDataProgrammingRequired == true) {
|
||||
walkerCmd->setEmitInlineParameter(1);
|
||||
}
|
||||
@@ -94,20 +97,20 @@ size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(LinearStream *cmdStream,
|
||||
WALKER_TYPE *walkerCmd,
|
||||
WalkerType *walkerCmd,
|
||||
TagNodeBase *timestampPacketNode,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
|
||||
|
||||
const auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
||||
auto &postSyncData = walkerCmd->getPostSync();
|
||||
postSyncData.setDataportPipelineFlush(true);
|
||||
|
||||
EncodeDispatchKernel<GfxFamily>::setupPostSyncMocs(*walkerCmd, rootDeviceEnvironment,
|
||||
MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, rootDeviceEnvironment));
|
||||
EncodeDispatchKernel<GfxFamily>::template setupPostSyncMocs<WalkerType>(*walkerCmd, rootDeviceEnvironment,
|
||||
MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, rootDeviceEnvironment));
|
||||
|
||||
EncodeDispatchKernel<GfxFamily>::adjustTimestampPacket(*walkerCmd, hwInfo);
|
||||
EncodeDispatchKernel<GfxFamily>::template adjustTimestampPacket<WalkerType>(*walkerCmd, hwInfo);
|
||||
|
||||
if (DebugManager.flags.UseImmDataWriteModeOnPostSyncOperation.get()) {
|
||||
postSyncData.setOperation(GfxFamily::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_IMMEDIATE_DATA);
|
||||
@@ -119,8 +122,11 @@ void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(LinearStream *cmdStream,
|
||||
auto contextStartAddress = TimestampPacketHelper::getContextStartGpuAddress(*timestampPacketNode);
|
||||
postSyncData.setDestinationAddress(contextStartAddress);
|
||||
}
|
||||
if (DebugManager.flags.OverrideSystolicInComputeWalker.get() != -1) {
|
||||
walkerCmd->setSystolicModeEnable((DebugManager.flags.OverrideSystolicInComputeWalker.get()));
|
||||
|
||||
if constexpr (std::is_same_v<WalkerType, typename GfxFamily::COMPUTE_WALKER>) {
|
||||
if (DebugManager.flags.OverrideSystolicInComputeWalker.get() != -1) {
|
||||
walkerCmd->setSystolicModeEnable((DebugManager.flags.OverrideSystolicInComputeWalker.get()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -130,10 +136,11 @@ void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxF
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo) {
|
||||
size_t numBarriers = MemorySynchronizationCommands<GfxFamily>::isBarrierWaRequired(commandQueue.getDevice().getRootDeviceEnvironment()) ? 2 : 1;
|
||||
|
||||
size_t size = sizeof(typename GfxFamily::COMPUTE_WALKER) +
|
||||
size_t size = sizeof(WalkerType) +
|
||||
(MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false) * numBarriers) +
|
||||
HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS() +
|
||||
EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(pKernel->getKernelInfo().heapInfo.kernelHeapSize, commandQueue.getDevice().getRootDeviceEnvironment());
|
||||
@@ -144,7 +151,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilin
|
||||
Vec3<size_t> groupCount = dispatchInfo.getNumberOfWorkgroups();
|
||||
UNRECOVERABLE_IF(groupCount.x == 0);
|
||||
const bool staticPartitioning = commandQueue.getGpgpuCommandStreamReceiver().isStaticWorkPartitioningEnabled();
|
||||
size += static_cast<size_t>(ImplicitScalingDispatch<GfxFamily>::getSize(false, staticPartitioning, devices, groupStart, groupCount));
|
||||
size += static_cast<size_t>(ImplicitScalingDispatch<GfxFamily>::template getSize<WalkerType>(false, staticPartitioning, devices, groupStart, groupCount));
|
||||
}
|
||||
|
||||
size += PerformanceCounters::getGpuCommandsSize(commandQueue.getPerfCounters(), commandQueue.getGpgpuEngine().osContext->getEngineType(), reservePerfCounters);
|
||||
|
||||
@@ -55,12 +55,19 @@ class HardwareInterface {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
|
||||
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
|
||||
|
||||
template <typename WalkerType>
|
||||
static void dispatchWalker(
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
const CsrDependencies &csrDependencies,
|
||||
HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
|
||||
static void dispatchWalkerCommon(
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
const CsrDependencies &csrDependencies,
|
||||
HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
|
||||
static void getDefaultDshSpace(
|
||||
const size_t &offsetInterfaceDescriptorTable,
|
||||
CommandQueue &commandQueue,
|
||||
@@ -94,6 +101,7 @@ class HardwareInterface {
|
||||
DebugPauseState waitCondition,
|
||||
const HardwareInfo &hwInfo);
|
||||
|
||||
template <typename WalkerType>
|
||||
static void programWalker(
|
||||
LinearStream &commandStream,
|
||||
Kernel &kernel,
|
||||
@@ -104,12 +112,14 @@ class HardwareInterface {
|
||||
const DispatchInfo &dispatchInfo,
|
||||
HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
|
||||
static WALKER_TYPE *allocateWalkerSpace(LinearStream &commandStream,
|
||||
const Kernel &kernel);
|
||||
template <typename WalkerType>
|
||||
static WalkerType *allocateWalkerSpace(LinearStream &commandStream,
|
||||
const Kernel &kernel);
|
||||
|
||||
static void obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
|
||||
bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh);
|
||||
|
||||
template <typename WalkerType>
|
||||
static void dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
|
||||
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh,
|
||||
HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
|
||||
@@ -23,11 +23,18 @@
|
||||
namespace NEO {
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline typename GfxFamily::WALKER_TYPE *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream, const Kernel &kernel) {
|
||||
auto walkerCmd = commandStream.getSpaceForCmd<WALKER_TYPE>();
|
||||
template <typename WalkerType>
|
||||
inline WalkerType *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream, const Kernel &kernel) {
|
||||
auto walkerCmd = commandStream.getSpaceForCmd<WalkerType>();
|
||||
return walkerCmd;
|
||||
}
|
||||
|
||||
// Non-templated entry point for walker dispatch: forwards all four arguments unchanged to
// dispatchWalker instantiated with the family's WALKER_TYPE.
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::dispatchWalkerCommon(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs) {
|
||||
|
||||
dispatchWalker<typename GfxFamily::WALKER_TYPE>(commandQueue, multiDispatchInfo, csrDependencies, walkerArgs);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
|
||||
TagNodeBase *hwTimeStamps,
|
||||
@@ -61,6 +68,7 @@ inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
@@ -111,7 +119,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
walkerArgs.interfaceDescriptorIndex = 0;
|
||||
walkerArgs.offsetInterfaceDescriptorTable = dsh->getUsed();
|
||||
|
||||
size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
size_t totalInterfaceDescriptorTableSize = GfxFamily::template getInterfaceDescriptorSize<WalkerType>();
|
||||
|
||||
getDefaultDshSpace(walkerArgs.offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, dsh, commandStream);
|
||||
|
||||
@@ -143,7 +151,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment());
|
||||
walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel);
|
||||
|
||||
dispatchKernelCommands(commandQueue, dispatchInfo, *commandStream, *dsh, *ioh, *ssh, walkerArgs);
|
||||
dispatchKernelCommands<WalkerType>(commandQueue, dispatchInfo, *commandStream, *dsh, *ioh, *ssh, walkerArgs);
|
||||
|
||||
walkerArgs.currentDispatchIndex++;
|
||||
dispatchInfo.dispatchEpilogueCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment());
|
||||
@@ -164,6 +172,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
|
||||
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh,
|
||||
HardwareInterfaceWalkerArgs &walkerArgs) {
|
||||
@@ -223,7 +232,7 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
|
||||
|
||||
dispatchWorkarounds(&commandStream, commandQueue, kernel, true);
|
||||
|
||||
programWalker(commandStream, kernel, commandQueue, dsh, ioh, ssh, dispatchInfo, walkerArgs);
|
||||
programWalker<WalkerType>(commandStream, kernel, commandQueue, dsh, ioh, ssh, dispatchInfo, walkerArgs);
|
||||
|
||||
dispatchWorkarounds(&commandStream, commandQueue, kernel, false);
|
||||
}
|
||||
|
||||
@@ -47,6 +47,7 @@ inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
LinearStream &commandStream,
|
||||
Kernel &kernel,
|
||||
@@ -57,8 +58,8 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
const DispatchInfo &dispatchInfo,
|
||||
HardwareInterfaceWalkerArgs &walkerArgs) {
|
||||
|
||||
auto walkerCmdBuf = allocateWalkerSpace(commandStream, kernel);
|
||||
WALKER_TYPE walkerCmd = GfxFamily::cmdInitGpgpuWalker;
|
||||
auto walkerCmdBuf = allocateWalkerSpace<WalkerType>(commandStream, kernel);
|
||||
WalkerType walkerCmd = GfxFamily::cmdInitGpgpuWalker;
|
||||
uint32_t dim = dispatchInfo.getDim();
|
||||
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
|
||||
auto &rootDeviceEnvironment = commandQueue.getDevice().getRootDeviceEnvironment();
|
||||
@@ -82,7 +83,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
|
||||
false, false, 0u);
|
||||
|
||||
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
HardwareCommandsHelper<GfxFamily>::template sendIndirectState<WalkerType, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -98,6 +99,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
&walkerCmd,
|
||||
nullptr,
|
||||
kernelUsesLocalIds,
|
||||
0,
|
||||
commandQueue.getDevice());
|
||||
|
||||
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), false, kernel.getKernelInfo().kernelDescriptor};
|
||||
|
||||
@@ -8,7 +8,9 @@
|
||||
#pragma once
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/source/command_container/implicit_scaling.h"
|
||||
#include "shared/source/command_stream/command_stream_receiver.h"
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/device/device.h"
|
||||
#include "shared/source/helpers/engine_node_helper.h"
|
||||
#include "shared/source/os_interface/os_context.h"
|
||||
#include "shared/source/os_interface/os_interface.h"
|
||||
@@ -37,6 +39,7 @@ inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
LinearStream &commandStream,
|
||||
Kernel &kernel,
|
||||
@@ -47,9 +50,9 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
const DispatchInfo &dispatchInfo,
|
||||
HardwareInterfaceWalkerArgs &walkerArgs) {
|
||||
|
||||
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
|
||||
using InterfaceDescriptorType = typename WalkerType::InterfaceDescriptorType;
|
||||
WalkerType walkerCmd = GfxFamily::template getInitGpuWalker<WalkerType>();
|
||||
|
||||
COMPUTE_WALKER walkerCmd = GfxFamily::cmdInitGpgpuWalker;
|
||||
auto &kernelInfo = kernel.getKernelInfo();
|
||||
|
||||
uint32_t dim = dispatchInfo.getDim();
|
||||
@@ -75,7 +78,6 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
simd);
|
||||
|
||||
bool inlineDataProgrammingRequired = EncodeDispatchKernel<GfxFamily>::inlineDataProgrammingRequired(kernel.getKernelInfo().kernelDescriptor);
|
||||
auto idd = &walkerCmd.getInterfaceDescriptor();
|
||||
auto &queueCsr = commandQueue.getGpgpuCommandStreamReceiver();
|
||||
|
||||
auto &rootDeviceEnvironment = commandQueue.getDevice().getRootDeviceEnvironment();
|
||||
@@ -86,27 +88,40 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
}
|
||||
|
||||
if (timestampPacketNode) {
|
||||
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacketNode, rootDeviceEnvironment);
|
||||
GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacket<WalkerType>(&commandStream, &walkerCmd, timestampPacketNode, rootDeviceEnvironment);
|
||||
}
|
||||
|
||||
auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
|
||||
|
||||
const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
|
||||
if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {
|
||||
EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.kernelHeapSize, 0, rootDeviceEnvironment);
|
||||
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
|
||||
|
||||
if constexpr (heaplessModeEnabled == false) {
|
||||
if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {
|
||||
EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.kernelHeapSize, 0, rootDeviceEnvironment);
|
||||
}
|
||||
}
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,
|
||||
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
|
||||
localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);
|
||||
GpgpuWalkerHelper<GfxFamily>::template setGpgpuWalkerThreadData<WalkerType>(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,
|
||||
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
|
||||
localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);
|
||||
|
||||
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
auto interfaceDescriptor = &walkerCmd.getInterfaceDescriptor();
|
||||
uint64_t scratchAddress = 0;
|
||||
|
||||
if constexpr (heaplessModeEnabled) {
|
||||
auto scratchAllocation = queueCsr.getScratchAllocation();
|
||||
if (scratchAllocation) {
|
||||
scratchAddress = scratchAllocation->getGpuAddress();
|
||||
}
|
||||
}
|
||||
|
||||
HardwareCommandsHelper<GfxFamily>::template sendIndirectState<WalkerType, InterfaceDescriptorType>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
ssh,
|
||||
kernel,
|
||||
kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, false),
|
||||
kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, heaplessModeEnabled),
|
||||
simd,
|
||||
walkerArgs.localWorkSizes,
|
||||
threadGroupCount,
|
||||
@@ -114,8 +129,9 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
walkerArgs.interfaceDescriptorIndex,
|
||||
walkerArgs.preemptionMode,
|
||||
&walkerCmd,
|
||||
idd,
|
||||
interfaceDescriptor,
|
||||
localIdsGenerationByRuntime,
|
||||
scratchAddress,
|
||||
commandQueue.getDevice());
|
||||
|
||||
bool kernelSystemAllocation = false;
|
||||
@@ -126,7 +142,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
}
|
||||
bool requiredSystemFence = kernelSystemAllocation && walkerArgs.event != nullptr;
|
||||
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), requiredSystemFence, kernelInfo.kernelDescriptor};
|
||||
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, encodeWalkerArgs);
|
||||
EncodeDispatchKernel<GfxFamily>::template encodeAdditionalWalkerFields<WalkerType>(rootDeviceEnvironment, walkerCmd, encodeWalkerArgs);
|
||||
|
||||
auto devices = queueCsr.getOsContext().getDeviceBitfield();
|
||||
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
|
||||
@@ -139,18 +155,18 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
if (partitionWalker) {
|
||||
const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
|
||||
uint32_t partitionCount = 0u;
|
||||
ImplicitScalingDispatch<GfxFamily>::dispatchCommands(commandStream,
|
||||
walkerCmd,
|
||||
nullptr,
|
||||
devices,
|
||||
partitionCount,
|
||||
false,
|
||||
false,
|
||||
kernel.usesImages(),
|
||||
queueCsr.getDcFlushSupport(),
|
||||
kernel.isSingleSubdevicePreferred(),
|
||||
workPartitionAllocationGpuVa,
|
||||
hwInfo);
|
||||
ImplicitScalingDispatch<GfxFamily>::template dispatchCommands<WalkerType>(commandStream,
|
||||
walkerCmd,
|
||||
nullptr,
|
||||
devices,
|
||||
partitionCount,
|
||||
false,
|
||||
false,
|
||||
kernel.usesImages(),
|
||||
queueCsr.getDcFlushSupport(),
|
||||
kernel.isSingleSubdevicePreferred(),
|
||||
workPartitionAllocationGpuVa,
|
||||
hwInfo);
|
||||
if (queueCsr.isStaticWorkPartitioningEnabled()) {
|
||||
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
|
||||
}
|
||||
@@ -159,7 +175,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
timestampPacketNode->setPacketsUsed(partitionCount);
|
||||
}
|
||||
} else {
|
||||
auto computeWalkerOnStream = commandStream.getSpaceForCmd<typename GfxFamily::COMPUTE_WALKER>();
|
||||
auto computeWalkerOnStream = commandStream.getSpaceForCmd<WalkerType>();
|
||||
*computeWalkerOnStream = walkerCmd;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2022 Intel Corporation
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -13,10 +13,20 @@
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template class HardwareInterface<Gen11Family>;
|
||||
using Family = Gen11Family;
|
||||
|
||||
template class GpgpuWalkerHelper<Gen11Family>;
|
||||
template class HardwareInterface<Family>;
|
||||
|
||||
template struct EnqueueOperation<Gen11Family>;
|
||||
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
|
||||
|
||||
template class GpgpuWalkerHelper<Family>;
|
||||
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
|
||||
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
|
||||
|
||||
template struct EnqueueOperation<Family>;
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2022 Intel Corporation
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -16,4 +16,53 @@ namespace NEO {
|
||||
using FamilyType = Gen11Family;
|
||||
|
||||
template struct HardwareCommandsHelper<FamilyType>;
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
LinearStream &commandStream,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
Kernel &kernel,
|
||||
uint64_t kernelStartOffset,
|
||||
uint32_t simd,
|
||||
const size_t localWorkSize[3],
|
||||
const uint32_t threadGroupCount,
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
bool localIdsGenerationByRuntime,
|
||||
uint64_t scratchAddress,
|
||||
const Device &device);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
|
||||
IndirectHeap &indirectHeap,
|
||||
Kernel &kernel,
|
||||
bool inlineDataProgrammingRequired,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData,
|
||||
uint64_t scratchAddress);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
const IndirectHeap &indirectHeap,
|
||||
uint64_t offsetInterfaceDescriptor,
|
||||
uint64_t kernelStartOffset,
|
||||
size_t sizeCrossThreadData,
|
||||
size_t sizePerThreadData,
|
||||
size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t offsetSamplerState,
|
||||
uint32_t numSamplers,
|
||||
const uint32_t threadGroupCount,
|
||||
uint32_t numThreadsPerThreadGroup,
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
PreemptionMode preemptionMode,
|
||||
const Device &device,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
|
||||
|
||||
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
|
||||
Kernel &kernel,
|
||||
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2022 Intel Corporation
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -12,48 +12,58 @@
|
||||
|
||||
namespace NEO {
|
||||
|
||||
using Family = Gen12LpFamily;
|
||||
|
||||
template <>
|
||||
void GpgpuWalkerHelper<Gen12LpFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<Gen12LpFamily> *storeCmd) {
|
||||
void GpgpuWalkerHelper<Family>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<Family> *storeCmd) {
|
||||
storeCmd->setMmioRemapEnable(true);
|
||||
}
|
||||
|
||||
template <>
|
||||
void HardwareInterface<Gen12LpFamily>::dispatchWorkarounds(
|
||||
void HardwareInterface<Family>::dispatchWorkarounds(
|
||||
LinearStream *commandStream,
|
||||
CommandQueue &commandQueue,
|
||||
Kernel &kernel,
|
||||
const bool &enable) {
|
||||
|
||||
using MI_LOAD_REGISTER_IMM = typename Gen12LpFamily::MI_LOAD_REGISTER_IMM;
|
||||
using PIPE_CONTROL = typename Gen12LpFamily::PIPE_CONTROL;
|
||||
using MI_LOAD_REGISTER_IMM = typename Family::MI_LOAD_REGISTER_IMM;
|
||||
using PIPE_CONTROL = typename Family::PIPE_CONTROL;
|
||||
|
||||
if (kernel.requiresWaDisableRccRhwoOptimization()) {
|
||||
|
||||
PIPE_CONTROL cmdPipeControl = Gen12LpFamily::cmdInitPipeControl;
|
||||
PIPE_CONTROL cmdPipeControl = Family::cmdInitPipeControl;
|
||||
cmdPipeControl.setCommandStreamerStallEnable(true);
|
||||
auto pCmdPipeControl = commandStream->getSpaceForCmd<PIPE_CONTROL>();
|
||||
*pCmdPipeControl = cmdPipeControl;
|
||||
|
||||
uint32_t value = enable ? 0x40004000 : 0x40000000;
|
||||
NEO::LriHelper<Gen12LpFamily>::program(commandStream,
|
||||
0x7010,
|
||||
value,
|
||||
false);
|
||||
NEO::LriHelper<Family>::program(commandStream,
|
||||
0x7010,
|
||||
value,
|
||||
false);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
size_t GpgpuWalkerHelper<Gen12LpFamily>::getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel) {
|
||||
size_t GpgpuWalkerHelper<Family>::getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel) {
|
||||
if (pKernel->requiresWaDisableRccRhwoOptimization()) {
|
||||
return (2 * (sizeof(Gen12LpFamily::PIPE_CONTROL) + sizeof(Gen12LpFamily::MI_LOAD_REGISTER_IMM)));
|
||||
return (2 * (sizeof(Gen12LpFamily::PIPE_CONTROL) + sizeof(Family::MI_LOAD_REGISTER_IMM)));
|
||||
}
|
||||
return 0u;
|
||||
}
|
||||
|
||||
template class HardwareInterface<Gen12LpFamily>;
|
||||
template class HardwareInterface<Family>;
|
||||
|
||||
template class GpgpuWalkerHelper<Gen12LpFamily>;
|
||||
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
|
||||
|
||||
template struct EnqueueOperation<Gen12LpFamily>;
|
||||
template class GpgpuWalkerHelper<Family>;
|
||||
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
|
||||
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
|
||||
|
||||
template struct EnqueueOperation<Family>;
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -23,4 +23,54 @@ size_t HardwareCommandsHelper<FamilyType>::getSizeRequiredCS() {
|
||||
}
|
||||
|
||||
template struct HardwareCommandsHelper<FamilyType>;
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
LinearStream &commandStream,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
Kernel &kernel,
|
||||
uint64_t kernelStartOffset,
|
||||
uint32_t simd,
|
||||
const size_t localWorkSize[3],
|
||||
const uint32_t threadGroupCount,
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
bool localIdsGenerationByRuntime,
|
||||
uint64_t scratchAddress,
|
||||
const Device &device);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
|
||||
IndirectHeap &indirectHeap,
|
||||
Kernel &kernel,
|
||||
bool inlineDataProgrammingRequired,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData,
|
||||
uint64_t scratchAddress);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
const IndirectHeap &indirectHeap,
|
||||
uint64_t offsetInterfaceDescriptor,
|
||||
uint64_t kernelStartOffset,
|
||||
size_t sizeCrossThreadData,
|
||||
size_t sizePerThreadData,
|
||||
size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t offsetSamplerState,
|
||||
uint32_t numSamplers,
|
||||
const uint32_t threadGroupCount,
|
||||
uint32_t numThreadsPerThreadGroup,
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
PreemptionMode preemptionMode,
|
||||
const Device &device,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
|
||||
|
||||
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
|
||||
Kernel &kernel,
|
||||
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
* Copyright (C) 2018-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -13,34 +13,36 @@
|
||||
|
||||
namespace NEO {
|
||||
|
||||
using Family = Gen8Family;
|
||||
|
||||
template <>
|
||||
void GpgpuWalkerHelper<Gen8Family>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
void GpgpuWalkerHelper<Family>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
if (disablePerfMode) {
|
||||
if (kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
|
||||
// Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
GpgpuWalkerHelper<Gen8Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
GpgpuWalkerHelper<Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
} else {
|
||||
if (kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
|
||||
// Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
|
||||
typedef typename Gen8Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
auto pipeControlSpace = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
auto pipeControl = Gen8Family::cmdInitPipeControl;
|
||||
auto pipeControl = Family::cmdInitPipeControl;
|
||||
pipeControl.setCommandStreamerStallEnable(true);
|
||||
*pipeControlSpace = pipeControl;
|
||||
// Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
GpgpuWalkerHelper<Gen8Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
GpgpuWalkerHelper<Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
size_t GpgpuWalkerHelper<Gen8Family>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
|
||||
typedef typename Gen8Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename Gen8Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
typedef typename Gen8Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename Gen8Family::MI_MATH MI_MATH;
|
||||
typedef typename Gen8Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
size_t GpgpuWalkerHelper<Family>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
|
||||
typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename Family::MI_MATH MI_MATH;
|
||||
typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
size_t n = 0;
|
||||
if (pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
|
||||
n += sizeof(PIPE_CONTROL) +
|
||||
@@ -54,10 +56,18 @@ size_t GpgpuWalkerHelper<Gen8Family>::getSizeForWADisableLSQCROPERFforOCL(const
|
||||
return n;
|
||||
}
|
||||
|
||||
template class HardwareInterface<Gen8Family>;
|
||||
template class HardwareInterface<Family>;
|
||||
|
||||
template class GpgpuWalkerHelper<Gen8Family>;
|
||||
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
|
||||
|
||||
template struct EnqueueOperation<Gen8Family>;
|
||||
template class GpgpuWalkerHelper<Family>;
|
||||
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
|
||||
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
|
||||
|
||||
template struct EnqueueOperation<Family>;
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2022 Intel Corporation
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -15,4 +15,53 @@ namespace NEO {
|
||||
using FamilyType = Gen8Family;
|
||||
|
||||
template struct HardwareCommandsHelper<FamilyType>;
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
LinearStream &commandStream,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
Kernel &kernel,
|
||||
uint64_t kernelStartOffset,
|
||||
uint32_t simd,
|
||||
const size_t localWorkSize[3],
|
||||
const uint32_t threadGroupCount,
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
bool localIdsGenerationByRuntime,
|
||||
uint64_t scratchAddress,
|
||||
const Device &device);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
|
||||
IndirectHeap &indirectHeap,
|
||||
Kernel &kernel,
|
||||
bool inlineDataProgrammingRequired,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData,
|
||||
uint64_t scratchAddress);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
const IndirectHeap &indirectHeap,
|
||||
uint64_t offsetInterfaceDescriptor,
|
||||
uint64_t kernelStartOffset,
|
||||
size_t sizeCrossThreadData,
|
||||
size_t sizePerThreadData,
|
||||
size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t offsetSamplerState,
|
||||
uint32_t numSamplers,
|
||||
const uint32_t threadGroupCount,
|
||||
uint32_t numThreadsPerThreadGroup,
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
PreemptionMode preemptionMode,
|
||||
const Device &device,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
|
||||
|
||||
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
|
||||
Kernel &kernel,
|
||||
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
* Copyright (C) 2018-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -12,35 +12,37 @@
|
||||
|
||||
namespace NEO {
|
||||
|
||||
using Family = Gen9Family;
|
||||
|
||||
template <>
|
||||
void GpgpuWalkerHelper<Gen9Family>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
void GpgpuWalkerHelper<Family>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
if (disablePerfMode) {
|
||||
if (kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
|
||||
// Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
GpgpuWalkerHelper<Gen9Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
GpgpuWalkerHelper<Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
} else {
|
||||
if (kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
|
||||
// Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
|
||||
typedef typename Gen9Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
auto pipeControlSpace = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
auto pipeControl = Gen9Family::cmdInitPipeControl;
|
||||
auto pipeControl = Family::cmdInitPipeControl;
|
||||
pipeControl.setCommandStreamerStallEnable(true);
|
||||
*pipeControlSpace = pipeControl;
|
||||
|
||||
// Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
GpgpuWalkerHelper<Gen9Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
GpgpuWalkerHelper<Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
size_t GpgpuWalkerHelper<Gen9Family>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
|
||||
typedef typename Gen9Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename Gen9Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
typedef typename Gen9Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename Gen9Family::MI_MATH MI_MATH;
|
||||
typedef typename Gen9Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
size_t GpgpuWalkerHelper<Family>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
|
||||
typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename Family::MI_MATH MI_MATH;
|
||||
typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
size_t n = 0;
|
||||
if (pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
|
||||
n += sizeof(PIPE_CONTROL) +
|
||||
@@ -54,10 +56,18 @@ size_t GpgpuWalkerHelper<Gen9Family>::getSizeForWADisableLSQCROPERFforOCL(const
|
||||
return n;
|
||||
}
|
||||
|
||||
template class HardwareInterface<Gen9Family>;
|
||||
template class HardwareInterface<Family>;
|
||||
|
||||
template class GpgpuWalkerHelper<Gen9Family>;
|
||||
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
|
||||
|
||||
template struct EnqueueOperation<Gen9Family>;
|
||||
template class GpgpuWalkerHelper<Family>;
|
||||
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
|
||||
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
|
||||
|
||||
template struct EnqueueOperation<Family>;
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2022 Intel Corporation
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -17,4 +17,53 @@ namespace NEO {
|
||||
using FamilyType = Gen9Family;
|
||||
|
||||
template struct HardwareCommandsHelper<FamilyType>;
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
LinearStream &commandStream,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
Kernel &kernel,
|
||||
uint64_t kernelStartOffset,
|
||||
uint32_t simd,
|
||||
const size_t localWorkSize[3],
|
||||
const uint32_t threadGroupCount,
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
bool localIdsGenerationByRuntime,
|
||||
uint64_t scratchAddress,
|
||||
const Device &device);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
|
||||
IndirectHeap &indirectHeap,
|
||||
Kernel &kernel,
|
||||
bool inlineDataProgrammingRequired,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData,
|
||||
uint64_t scratchAddress);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
const IndirectHeap &indirectHeap,
|
||||
uint64_t offsetInterfaceDescriptor,
|
||||
uint64_t kernelStartOffset,
|
||||
size_t sizeCrossThreadData,
|
||||
size_t sizePerThreadData,
|
||||
size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t offsetSamplerState,
|
||||
uint32_t numSamplers,
|
||||
const uint32_t threadGroupCount,
|
||||
uint32_t numThreadsPerThreadGroup,
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
PreemptionMode preemptionMode,
|
||||
const Device &device,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
|
||||
|
||||
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
|
||||
Kernel &kernel,
|
||||
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -37,6 +37,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
|
||||
|
||||
inline static uint32_t additionalSizeRequiredDsh();
|
||||
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
static size_t sendInterfaceDescriptorData(
|
||||
const IndirectHeap &indirectHeap,
|
||||
uint64_t offsetInterfaceDescriptor,
|
||||
@@ -51,9 +52,9 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
PreemptionMode preemptionMode,
|
||||
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
const Device &device,
|
||||
WALKER_TYPE *walkerCmd);
|
||||
WalkerType *walkerCmd,
|
||||
InterfaceDescriptorType *inlineInterfaceDescriptor);
|
||||
|
||||
static void sendMediaStateFlush(
|
||||
LinearStream &commandStream,
|
||||
@@ -64,13 +65,16 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
|
||||
size_t offsetInterfaceDescriptorData,
|
||||
size_t sizeInterfaceDescriptorData);
|
||||
|
||||
template <typename WalkerType>
|
||||
static size_t sendCrossThreadData(
|
||||
IndirectHeap &indirectHeap,
|
||||
Kernel &kernel,
|
||||
bool inlineDataProgrammingRequired,
|
||||
WALKER_TYPE *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData);
|
||||
WalkerType *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData,
|
||||
uint64_t scratchAddress);
|
||||
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
static size_t sendIndirectState(
|
||||
LinearStream &commandStream,
|
||||
IndirectHeap &dsh,
|
||||
@@ -84,11 +88,15 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
WALKER_TYPE *walkerCmd,
|
||||
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
WalkerType *walkerCmd,
|
||||
InterfaceDescriptorType *inlineInterfaceDescriptor,
|
||||
bool localIdsGenerationByRuntime,
|
||||
uint64_t scratchAddress,
|
||||
const Device &device);
|
||||
|
||||
template <typename WalkerType>
|
||||
static void programInlineData(Kernel &kernel, WalkerType *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
|
||||
|
||||
static void programPerThreadData(
|
||||
bool localIdsGenerationByRuntime,
|
||||
size_t &sizePerThreadData,
|
||||
|
||||
@@ -19,12 +19,15 @@
|
||||
#include "shared/source/indirect_heap/indirect_heap.h"
|
||||
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
|
||||
#include "shared/source/kernel/implicit_args.h"
|
||||
#include "shared/source/memory_manager/memory_manager.h"
|
||||
|
||||
#include "opencl/source/cl_device/cl_device.h"
|
||||
#include "opencl/source/context/context.h"
|
||||
#include "opencl/source/helpers/dispatch_info.h"
|
||||
#include "opencl/source/kernel/kernel.h"
|
||||
|
||||
#include "hardware_commands_helper.h"
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template <typename GfxFamily>
|
||||
@@ -117,66 +120,68 @@ size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
||||
const IndirectHeap &indirectHeap,
|
||||
uint64_t offsetInterfaceDescriptor,
|
||||
uint64_t kernelStartOffset,
|
||||
size_t sizeCrossThreadData,
|
||||
size_t sizePerThreadData,
|
||||
size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t offsetSamplerState,
|
||||
uint32_t numSamplers,
|
||||
[[maybe_unused]] uint32_t numSamplers,
|
||||
const uint32_t threadGroupCount,
|
||||
uint32_t threadsPerThreadGroup,
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
[[maybe_unused]] uint32_t bindingTablePrefetchSize,
|
||||
PreemptionMode preemptionMode,
|
||||
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
const Device &device,
|
||||
WALKER_TYPE *walkerCmd) {
|
||||
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
|
||||
using SHARED_LOCAL_MEMORY_SIZE = typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
||||
WalkerType *walkerCmd,
|
||||
InterfaceDescriptorType *inlineInterfaceDescriptor) {
|
||||
|
||||
const auto &hardwareInfo = device.getHardwareInfo();
|
||||
const auto &kernelDescriptor = kernel.getKernelInfo().kernelDescriptor;
|
||||
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
|
||||
|
||||
// Allocate some memory for the interface descriptor
|
||||
auto pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
|
||||
auto interfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData;
|
||||
InterfaceDescriptorType *pInterfaceDescriptor = nullptr;
|
||||
auto interfaceDescriptor = GfxFamily::template getInitInterfaceDescriptor<InterfaceDescriptorType>();
|
||||
|
||||
// Program the kernel start pointer
|
||||
interfaceDescriptor.setKernelStartPointer(static_cast<uint32_t>(kernelStartOffset & std::numeric_limits<uint32_t>::max()));
|
||||
if constexpr (heaplessModeEnabled) {
|
||||
pInterfaceDescriptor = inlineInterfaceDescriptor;
|
||||
interfaceDescriptor.setKernelStartPointer(kernelStartOffset);
|
||||
} else {
|
||||
pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
|
||||
interfaceDescriptor.setKernelStartPointer(static_cast<uint32_t>(kernelStartOffset));
|
||||
}
|
||||
|
||||
// # of threads in thread group should be based on LWS.
|
||||
// # of threads in thread group should be based on LWS.
|
||||
interfaceDescriptor.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
|
||||
|
||||
auto slmTotalSize = kernel.getSlmTotalSize();
|
||||
const auto &kernelDescriptor = kernel.getKernelInfo().kernelDescriptor;
|
||||
|
||||
EncodeDispatchKernel<GfxFamily>::setGrfInfo(&interfaceDescriptor, kernelDescriptor.kernelAttributes.numGrfRequired,
|
||||
sizeCrossThreadData, sizePerThreadData, device.getRootDeviceEnvironment());
|
||||
auto &productHelper = device.getProductHelper();
|
||||
productHelper.updateIddCommand(&interfaceDescriptor, kernelDescriptor.kernelAttributes.numGrfRequired,
|
||||
kernelDescriptor.kernelAttributes.threadArbitrationPolicy);
|
||||
|
||||
EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, device.getRootDeviceEnvironment(), threadsPerThreadGroup,
|
||||
slmTotalSize, SlmPolicy::SlmPolicyNone);
|
||||
EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, device.getRootDeviceEnvironment(),
|
||||
threadsPerThreadGroup, slmTotalSize, SlmPolicy::SlmPolicyNone);
|
||||
|
||||
interfaceDescriptor.setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
|
||||
if constexpr (heaplessModeEnabled == false) {
|
||||
interfaceDescriptor.setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
|
||||
|
||||
if constexpr (GfxFamily::supportsSampler) {
|
||||
if (device.getDeviceInfo().imageSupport) {
|
||||
interfaceDescriptor.setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState));
|
||||
if constexpr (GfxFamily::supportsSampler) {
|
||||
if (device.getDeviceInfo().imageSupport) {
|
||||
interfaceDescriptor.setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState));
|
||||
}
|
||||
}
|
||||
EncodeDispatchKernel<GfxFamily>::adjustBindingTablePrefetch(interfaceDescriptor, numSamplers, bindingTablePrefetchSize);
|
||||
}
|
||||
|
||||
EncodeDispatchKernel<GfxFamily>::adjustBindingTablePrefetch(interfaceDescriptor, numSamplers, bindingTablePrefetchSize);
|
||||
|
||||
const auto &hardwareInfo = device.getHardwareInfo();
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
auto programmableIDSLMSize =
|
||||
static_cast<SHARED_LOCAL_MEMORY_SIZE>(gfxCoreHelper.computeSlmValues(hardwareInfo, slmTotalSize));
|
||||
auto programmableIDSLMSize = static_cast<uint32_t>(gfxCoreHelper.computeSlmValues(hardwareInfo, slmTotalSize));
|
||||
|
||||
if (DebugManager.flags.OverrideSlmAllocationSize.get() != -1) {
|
||||
programmableIDSLMSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(DebugManager.flags.OverrideSlmAllocationSize.get());
|
||||
programmableIDSLMSize = static_cast<uint32_t>(DebugManager.flags.OverrideSlmAllocationSize.get());
|
||||
}
|
||||
|
||||
interfaceDescriptor.setSharedLocalMemorySize(programmableIDSLMSize);
|
||||
@@ -212,6 +217,7 @@ void HardwareCommandsHelper<GfxFamily>::programPerThreadData(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
LinearStream &commandStream,
|
||||
IndirectHeap &dsh,
|
||||
@@ -225,34 +231,42 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
WALKER_TYPE *walkerCmd,
|
||||
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
WalkerType *walkerCmd,
|
||||
InterfaceDescriptorType *inlineInterfaceDescriptor,
|
||||
bool localIdsGenerationByRuntime,
|
||||
uint64_t scratchAddress,
|
||||
const Device &device) {
|
||||
|
||||
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
|
||||
|
||||
DEBUG_BREAK_IF(simd != 1 && simd != 8 && simd != 16 && simd != 32);
|
||||
auto inlineDataProgrammingRequired = EncodeDispatchKernel<GfxFamily>::inlineDataProgrammingRequired(kernel.getKernelInfo().kernelDescriptor);
|
||||
|
||||
// Copy the kernel over to the ISH
|
||||
const auto &kernelInfo = kernel.getKernelInfo();
|
||||
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
|
||||
|
||||
ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
|
||||
|
||||
size_t dstBindingTablePointer = HardwareCommandsHelper<GfxFamily>::checkForAdditionalBTAndSetBTPointer(ssh, kernel);
|
||||
|
||||
// Copy our sampler state if it exists
|
||||
const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
|
||||
size_t dstBindingTablePointer = 0;
|
||||
uint32_t samplerCount = 0;
|
||||
uint32_t samplerStateOffset = 0;
|
||||
if (isValidOffset(samplerTable.tableOffset) && isValidOffset(samplerTable.borderColor)) {
|
||||
samplerCount = samplerTable.numSamplers;
|
||||
samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, samplerTable.tableOffset,
|
||||
samplerCount, samplerTable.borderColor,
|
||||
kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
|
||||
device.getRootDeviceEnvironment());
|
||||
uint32_t bindingTablePrefetchSize = 0;
|
||||
|
||||
if constexpr (heaplessModeEnabled == false) {
|
||||
|
||||
ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
|
||||
dstBindingTablePointer = HardwareCommandsHelper<GfxFamily>::checkForAdditionalBTAndSetBTPointer(ssh, kernel);
|
||||
|
||||
const auto &kernelInfo = kernel.getKernelInfo();
|
||||
// Copy our sampler state if it exists
|
||||
const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
|
||||
if (isValidOffset(samplerTable.tableOffset) && isValidOffset(samplerTable.borderColor)) {
|
||||
samplerCount = samplerTable.numSamplers;
|
||||
samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, samplerTable.tableOffset,
|
||||
samplerCount, samplerTable.borderColor,
|
||||
kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
|
||||
device.getRootDeviceEnvironment());
|
||||
}
|
||||
|
||||
if (EncodeSurfaceState<GfxFamily>::doBindingTablePrefetch()) {
|
||||
bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
|
||||
}
|
||||
}
|
||||
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
|
||||
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
|
||||
@@ -260,9 +274,10 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
|
||||
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||
|
||||
size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||
auto inlineDataProgrammingRequired = EncodeDispatchKernel<GfxFamily>::inlineDataProgrammingRequired(kernel.getKernelInfo().kernelDescriptor);
|
||||
size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData<WalkerType>(
|
||||
ioh, kernel, inlineDataProgrammingRequired,
|
||||
walkerCmd, sizeCrossThreadData);
|
||||
walkerCmd, sizeCrossThreadData, scratchAddress);
|
||||
|
||||
size_t sizePerThreadDataTotal = 0;
|
||||
size_t sizePerThreadData = 0;
|
||||
@@ -275,14 +290,9 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
kernel,
|
||||
localWorkSize);
|
||||
|
||||
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * GfxFamily::template getInterfaceDescriptorSize<WalkerType>();
|
||||
|
||||
auto bindingTablePrefetchSize = 0;
|
||||
if (EncodeSurfaceState<GfxFamily>::doBindingTablePrefetch()) {
|
||||
bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
|
||||
}
|
||||
|
||||
HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
||||
HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData<WalkerType, InterfaceDescriptorType>(
|
||||
dsh,
|
||||
offsetInterfaceDescriptor,
|
||||
kernelStartOffset,
|
||||
@@ -296,31 +306,36 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
kernel,
|
||||
bindingTablePrefetchSize,
|
||||
preemptionMode,
|
||||
inlineInterfaceDescriptor,
|
||||
device,
|
||||
walkerCmd);
|
||||
walkerCmd,
|
||||
inlineInterfaceDescriptor);
|
||||
|
||||
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
|
||||
PatchInfoData patchInfoData(kernelStartOffset, 0, PatchInfoAllocationType::InstructionHeap, dsh.getGraphicsAllocation()->getGpuAddress(), offsetInterfaceDescriptor, PatchInfoAllocationType::DynamicStateHeap);
|
||||
kernel.getPatchInfoDataList().push_back(patchInfoData);
|
||||
}
|
||||
|
||||
// Program media state flush to set interface descriptor offset
|
||||
sendMediaStateFlush(
|
||||
commandStream,
|
||||
interfaceDescriptorIndex);
|
||||
if constexpr (heaplessModeEnabled == false) {
|
||||
// Program media state flush to set interface descriptor offset
|
||||
sendMediaStateFlush(
|
||||
commandStream,
|
||||
interfaceDescriptorIndex);
|
||||
|
||||
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
|
||||
walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
|
||||
setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex);
|
||||
|
||||
auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
|
||||
WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
walkerCmd->setIndirectDataLength(indirectDataLength);
|
||||
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
|
||||
walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
|
||||
setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex);
|
||||
auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
|
||||
WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
walkerCmd->setIndirectDataLength(indirectDataLength);
|
||||
}
|
||||
|
||||
return offsetCrossThreadData;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
void HardwareCommandsHelper<GfxFamily>::programInlineData(Kernel &kernel, WalkerType *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress) {}
|
||||
|
||||
template <typename GfxFamily>
|
||||
bool HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(const Kernel &kernel) {
|
||||
return kernel.getKernelInfo().kernelDescriptor.kernelAttributes.numLocalIdChannels > 0;
|
||||
|
||||
@@ -71,13 +71,15 @@ void HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||
IndirectHeap &indirectHeap,
|
||||
Kernel &kernel,
|
||||
bool inlineDataProgrammingRequired,
|
||||
WALKER_TYPE *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData) {
|
||||
indirectHeap.align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
WalkerType *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData,
|
||||
uint64_t scratchAddress) {
|
||||
indirectHeap.align(WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
|
||||
auto pImplicitArgs = kernel.getImplicitArgs();
|
||||
if (pImplicitArgs) {
|
||||
|
||||
@@ -52,14 +52,20 @@ void HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||
IndirectHeap &indirectHeap,
|
||||
Kernel &kernel,
|
||||
bool inlineDataProgrammingRequired,
|
||||
WALKER_TYPE *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData) {
|
||||
WalkerType *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData,
|
||||
[[maybe_unused]] uint64_t scratchAddress) {
|
||||
|
||||
indirectHeap.align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
|
||||
|
||||
if constexpr (heaplessModeEnabled == false) {
|
||||
indirectHeap.align(WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
}
|
||||
|
||||
auto offsetCrossThreadData = indirectHeap.getUsed();
|
||||
char *dest = nullptr;
|
||||
@@ -96,11 +102,16 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), gfxCoreHelper);
|
||||
}
|
||||
|
||||
using InlineData = typename GfxFamily::INLINE_DATA;
|
||||
using GRF = typename GfxFamily::GRF;
|
||||
uint32_t inlineDataSize = sizeof(InlineData);
|
||||
uint32_t sizeToCopy = sizeCrossThreadData;
|
||||
if (inlineDataProgrammingRequired == true) {
|
||||
|
||||
using InlineData = typename GfxFamily::INLINE_DATA;
|
||||
uint32_t inlineDataSize = sizeof(InlineData);
|
||||
|
||||
if constexpr (heaplessModeEnabled) {
|
||||
inlineDataSize = 64;
|
||||
}
|
||||
|
||||
sizeToCopy = std::min(inlineDataSize, sizeCrossThreadData);
|
||||
dest = reinterpret_cast<char *>(walkerCmd->getInlineDataPointer());
|
||||
memcpy_s(dest, sizeToCopy, kernel.getCrossThreadData(), sizeToCopy);
|
||||
@@ -114,6 +125,14 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||
memcpy_s(dest, sizeCrossThreadData, src, sizeCrossThreadData);
|
||||
}
|
||||
|
||||
if constexpr (heaplessModeEnabled) {
|
||||
auto device = kernel.getContext().getDevice(0);
|
||||
uint64_t indirectDataAddress = device->getMemoryManager()->getInternalHeapBaseAddress(device->getRootDeviceIndex(), indirectHeap.getGraphicsAllocation()->isAllocatedInLocalMemoryPool());
|
||||
indirectDataAddress += indirectHeap.getHeapGpuStartOffset();
|
||||
|
||||
HardwareCommandsHelper<GfxFamily>::programInlineData<WalkerType>(kernel, walkerCmd, indirectDataAddress, scratchAddress);
|
||||
}
|
||||
|
||||
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
|
||||
FlatBatchBufferHelper::fixCrossThreadDataInfo(kernel.getPatchInfoDataList(), offsetCrossThreadData, indirectHeap.getGraphicsAllocation()->getGpuAddress());
|
||||
}
|
||||
|
||||
@@ -109,6 +109,7 @@ std::string Program::getInternalOptions() const {
|
||||
auto isDebuggerActive = pClDevice->getDevice().getDebugger() != nullptr;
|
||||
CompilerOptions::concatenateAppend(internalOptions, compilerProductHelper.getCachingPolicyOptions(isDebuggerActive));
|
||||
CompilerOptions::applyExtraInternalOptions(internalOptions, compilerProductHelper);
|
||||
|
||||
return internalOptions;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2021-2022 Intel Corporation
|
||||
* Copyright (C) 2021-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -12,10 +12,20 @@
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template class GpgpuWalkerHelper<XeHpcCoreFamily>;
|
||||
using Family = XeHpcCoreFamily;
|
||||
|
||||
template class HardwareInterface<XeHpcCoreFamily>;
|
||||
template class GpgpuWalkerHelper<Family>;
|
||||
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
|
||||
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
|
||||
|
||||
template struct EnqueueOperation<XeHpcCoreFamily>;
|
||||
template class HardwareInterface<Family>;
|
||||
|
||||
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
|
||||
|
||||
template struct EnqueueOperation<Family>;
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -16,4 +16,53 @@ namespace NEO {
|
||||
using FamilyType = XeHpcCoreFamily;
|
||||
|
||||
template struct HardwareCommandsHelper<FamilyType>;
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
LinearStream &commandStream,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
Kernel &kernel,
|
||||
uint64_t kernelStartOffset,
|
||||
uint32_t simd,
|
||||
const size_t localWorkSize[3],
|
||||
const uint32_t threadGroupCount,
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
bool localIdsGenerationByRuntime,
|
||||
uint64_t scratchAddress,
|
||||
const Device &device);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
|
||||
IndirectHeap &indirectHeap,
|
||||
Kernel &kernel,
|
||||
bool inlineDataProgrammingRequired,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData,
|
||||
uint64_t scratchAddress);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
const IndirectHeap &indirectHeap,
|
||||
uint64_t offsetInterfaceDescriptor,
|
||||
uint64_t kernelStartOffset,
|
||||
size_t sizeCrossThreadData,
|
||||
size_t sizePerThreadData,
|
||||
size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t offsetSamplerState,
|
||||
uint32_t numSamplers,
|
||||
const uint32_t threadGroupCount,
|
||||
uint32_t numThreadsPerThreadGroup,
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
PreemptionMode preemptionMode,
|
||||
const Device &device,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
|
||||
|
||||
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
|
||||
Kernel &kernel,
|
||||
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2021-2022 Intel Corporation
|
||||
* Copyright (C) 2021-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -12,10 +12,20 @@
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template class GpgpuWalkerHelper<XeHpgCoreFamily>;
|
||||
using Family = XeHpgCoreFamily;
|
||||
|
||||
template class HardwareInterface<XeHpgCoreFamily>;
|
||||
template class GpgpuWalkerHelper<Family>;
|
||||
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
|
||||
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
|
||||
|
||||
template struct EnqueueOperation<XeHpgCoreFamily>;
|
||||
template class HardwareInterface<Family>;
|
||||
|
||||
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
|
||||
|
||||
template struct EnqueueOperation<Family>;
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -16,4 +16,52 @@ namespace NEO {
|
||||
using FamilyType = XeHpgCoreFamily;
|
||||
|
||||
template struct HardwareCommandsHelper<FamilyType>;
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
LinearStream &commandStream,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
Kernel &kernel,
|
||||
uint64_t kernelStartOffset,
|
||||
uint32_t simd,
|
||||
const size_t localWorkSize[3],
|
||||
const uint32_t threadGroupCount,
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
|
||||
bool localIdsGenerationByRuntime,
|
||||
uint64_t scratchAddress,
|
||||
const Device &device);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
|
||||
IndirectHeap &indirectHeap,
|
||||
Kernel &kernel,
|
||||
bool inlineDataProgrammingRequired,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
uint32_t &sizeCrossThreadData,
|
||||
uint64_t scratchAddress);
|
||||
|
||||
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
|
||||
const IndirectHeap &indirectHeap,
|
||||
uint64_t offsetInterfaceDescriptor,
|
||||
uint64_t kernelStartOffset,
|
||||
size_t sizeCrossThreadData,
|
||||
size_t sizePerThreadData,
|
||||
size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t offsetSamplerState,
|
||||
uint32_t numSamplers,
|
||||
const uint32_t threadGroupCount,
|
||||
uint32_t numThreadsPerThreadGroup,
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
PreemptionMode preemptionMode,
|
||||
const Device &device,
|
||||
FamilyType::WALKER_TYPE *walkerCmd,
|
||||
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
|
||||
|
||||
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
|
||||
Kernel &kernel,
|
||||
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
|
||||
} // namespace NEO
|
||||
|
||||
@@ -169,7 +169,7 @@ HWTEST_F(DispatchWalkerTest, WhenDispatchingWalkerThenCommandStreamMemoryIsntCha
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -212,7 +212,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalIdsWhenDispatchingWalkerThenWalkerIsDis
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -239,7 +239,7 @@ HWTEST_F(DispatchWalkerTest, GivenDefaultLwsAlgorithmWhenDispatchingWalkerThenDi
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -267,7 +267,7 @@ HWTEST_F(DispatchWalkerTest, GivenSquaredLwsAlgorithmWhenDispatchingWalkerThenDi
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -293,7 +293,7 @@ HWTEST_F(DispatchWalkerTest, GivenNdLwsAlgorithmWhenDispatchingWalkerThenDimensi
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -320,7 +320,7 @@ HWTEST_F(DispatchWalkerTest, GivenOldLwsAlgorithmWhenDispatchingWalkerThenDimens
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -347,7 +347,7 @@ HWTEST_F(DispatchWalkerTest, GivenNumWorkGroupsWhenDispatchingWalkerThenNumWorkG
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -377,7 +377,7 @@ HWTEST_F(DispatchWalkerTest, GivenGlobalWorkOffsetWhenDispatchingWalkerThenGloba
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -407,7 +407,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatch
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -437,7 +437,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThe
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -468,7 +468,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatch
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -499,7 +499,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffW
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -528,7 +528,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsC
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -560,7 +560,7 @@ HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLw
|
||||
MultiDispatchInfo multiDispatchInfo;
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -598,7 +598,7 @@ HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorre
|
||||
|
||||
MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -650,7 +650,7 @@ HWTEST_F(DispatchWalkerTest, GivenSplitWalkerWhenDispatchingWalkerThenLwsIsCorre
|
||||
multiDispatchInfo.push(di2);
|
||||
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -703,7 +703,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenCommandSt
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.blockedCommandsData = blockedCommandsData.get();
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -735,7 +735,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH
|
||||
multiDispatchInfo.push(dispatchInfo);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.blockedCommandsData = blockedCommandsData.get();
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -782,7 +782,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH
|
||||
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.blockedCommandsData = blockedCommandsData.get();
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -805,7 +805,7 @@ HWTEST_F(DispatchWalkerTest, givenBlockedQueueWhenDispatchWalkerIsCalledThenComm
|
||||
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.blockedCommandsData = blockedCommandsData.get();
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -831,7 +831,7 @@ HWTEST_F(DispatchWalkerTest, givenThereAreAllocationsForReuseWhenDispatchWalkerI
|
||||
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.blockedCommandsData = blockedCommandsData.get();
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -851,7 +851,7 @@ HWTEST_F(DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenWorkDi
|
||||
|
||||
MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector<Kernel *>({&kernel1, &kernel2}));
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -888,7 +888,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleKernelsWhenDispatch
|
||||
auto dshBeforeMultiDisptach = indirectHeap.getUsed();
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -971,7 +971,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleKernelsWhenDispatch
|
||||
// create commandStream
|
||||
auto &cmdStream = pCmdQ->getCS(0);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -1011,7 +1011,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleKernelsWhenDispatch
|
||||
// create commandStream
|
||||
auto &cmdStream = pCmdQ->getCS(0);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -1056,7 +1056,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleDispatchInfoAndSame
|
||||
// create commandStream
|
||||
auto &cmdStream = pCmdQ->getCS(0);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -1129,7 +1129,7 @@ HWTEST_P(DispatchWalkerTestForAuxTranslation, givenKernelWhenAuxToNonAuxWhenTran
|
||||
|
||||
builder.buildDispatchInfosForAuxTranslation<FamilyType>(multiDispatchInfo, builtinOpsParams);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -1178,7 +1178,7 @@ HWTEST_P(DispatchWalkerTestForAuxTranslation, givenKernelWhenNonAuxToAuxWhenTran
|
||||
|
||||
builder.buildDispatchInfosForAuxTranslation<FamilyType>(multiDispatchInfo, builtinOpsParams);
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -1349,7 +1349,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
||||
multiDispatchInfoWithoutImplicitArgs.push(dispatchInfoWithoutImplicitArgs);
|
||||
HardwareInterfaceWalkerArgs walkerArgsWithoutImplicitArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgsWithoutImplicitArgs.blockedCommandsData = blockedCommandsData.get();
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfoWithoutImplicitArgs,
|
||||
CsrDependencies(),
|
||||
@@ -1364,7 +1364,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
||||
multiDispatchInfoWithImplicitArgs.push(dispatchInfoWithImplicitArgs);
|
||||
HardwareInterfaceWalkerArgs walkerArgsWithImplicitArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgsWithImplicitArgs.blockedCommandsData = blockedCommandsData.get();
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*pCmdQ,
|
||||
multiDispatchInfoWithImplicitArgs,
|
||||
CsrDependencies(),
|
||||
|
||||
@@ -111,7 +111,7 @@ HWTEST2_F(Dg2AndLaterDispatchWalkerBasicTest, givenTimestampPacketWhenDispatchin
|
||||
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.currentTimestampPacketNodes = &timestampPacketContainer;
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
cmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
|
||||
@@ -464,7 +464,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenTimestamp
|
||||
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.currentTimestampPacketNodes = &timestampPacketContainer;
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<COMPUTE_WALKER>(
|
||||
cmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
|
||||
@@ -60,8 +60,8 @@ void HardwareCommandsTest::addSpaceForSingleKernelArg() {
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenProgramInterfaceDescriptorDataIsCreatedThenOnlyRequiredSpaceOnIndirectHeapIsAllocated) {
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
WALKER_TYPE walkerCmd{};
|
||||
using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
|
||||
GPGPU_WALKER walkerCmd{};
|
||||
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
|
||||
|
||||
std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
|
||||
@@ -87,15 +87,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenProgramInterfaceDescriptor
|
||||
auto kernel = multiDispatchInfo.begin()->getKernel();
|
||||
ASSERT_NE(nullptr, kernel);
|
||||
|
||||
typedef typename FamilyType::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA;
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
|
||||
auto usedIndirectHeapBefore = indirectHeap.getUsed();
|
||||
indirectHeap.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
|
||||
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
size_t crossThreadDataSize = kernel->getCrossThreadDataSize();
|
||||
HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
|
||||
indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, threadGroupCount, 1, *kernel, 0, pDevice->getPreemptionMode(), nullptr, *pDevice, &walkerCmd);
|
||||
HardwareCommandsHelper<FamilyType>::template sendInterfaceDescriptorData<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, threadGroupCount, 1, *kernel, 0, pDevice->getPreemptionMode(), *pDevice, &walkerCmd, nullptr);
|
||||
|
||||
auto usedIndirectHeapAfter = indirectHeap.getUsed();
|
||||
EXPECT_EQ(sizeof(INTERFACE_DESCRIPTOR_DATA), usedIndirectHeapAfter - usedIndirectHeapBefore);
|
||||
@@ -137,7 +137,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenMediaStateFlushIsCreatedTh
|
||||
|
||||
HWTEST_F(HardwareCommandsTest, WhenCrossThreadDataIsCreatedThenOnlyRequiredSpaceOnIndirectHeapIsAllocated) {
|
||||
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
|
||||
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
|
||||
|
||||
std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
|
||||
@@ -166,12 +166,12 @@ HWTEST_F(HardwareCommandsTest, WhenCrossThreadDataIsCreatedThenOnlyRequiredSpace
|
||||
auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
|
||||
auto usedBefore = indirectHeap.getUsed();
|
||||
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
||||
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||
HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
|
||||
indirectHeap,
|
||||
*kernel,
|
||||
false,
|
||||
nullptr,
|
||||
sizeCrossThreadData);
|
||||
sizeCrossThreadData, 0);
|
||||
|
||||
auto usedAfter = indirectHeap.getUsed();
|
||||
EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore);
|
||||
@@ -179,6 +179,7 @@ HWTEST_F(HardwareCommandsTest, WhenCrossThreadDataIsCreatedThenOnlyRequiredSpace
|
||||
|
||||
HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsNotSetThenAddPatchInfoDataOffsetsAreNotMoved) {
|
||||
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
|
||||
MockContext context;
|
||||
|
||||
@@ -192,12 +193,13 @@ HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComme
|
||||
PatchInfoData patchInfoData = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
|
||||
kernel->getPatchInfoDataList().push_back(patchInfoData);
|
||||
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
||||
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||
HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
|
||||
indirectHeap,
|
||||
*kernel,
|
||||
false,
|
||||
nullptr,
|
||||
sizeCrossThreadData);
|
||||
sizeCrossThreadData,
|
||||
0);
|
||||
|
||||
ASSERT_EQ(1u, kernel->getPatchInfoDataList().size());
|
||||
EXPECT_EQ(0xaaaaaaaa, kernel->getPatchInfoDataList()[0].sourceAllocation);
|
||||
@@ -209,32 +211,36 @@ HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComme
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) {
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
auto nonInternalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
|
||||
IndirectHeap indirectHeap(nonInternalAllocation, false);
|
||||
|
||||
auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
|
||||
auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||
auto offset = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
|
||||
indirectHeap,
|
||||
*mockKernelWithInternal->mockKernel,
|
||||
false,
|
||||
nullptr,
|
||||
sizeCrossThreadData);
|
||||
sizeCrossThreadData,
|
||||
0);
|
||||
EXPECT_EQ(0u, offset);
|
||||
pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenHeapBaseOffsetIsReturned) {
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
auto internalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties(pDevice->getRootDeviceIndex(), true, MemoryConstants::pageSize, AllocationType::INTERNAL_HEAP, pDevice->getDeviceBitfield()));
|
||||
IndirectHeap indirectHeap(internalAllocation, true);
|
||||
auto expectedOffset = internalAllocation->getGpuAddressToPatch();
|
||||
|
||||
auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
|
||||
auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||
auto offset = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
|
||||
indirectHeap,
|
||||
*mockKernelWithInternal->mockKernel,
|
||||
false,
|
||||
nullptr,
|
||||
sizeCrossThreadData);
|
||||
sizeCrossThreadData,
|
||||
0);
|
||||
EXPECT_EQ(expectedOffset, offset);
|
||||
|
||||
pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
|
||||
@@ -243,6 +249,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapAllocatedFrom
|
||||
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsSetThenAddPatchInfoDataOffsetsAreMoved) {
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.AddPatchInfoCommentsForAUBDump.set(true);
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
|
||||
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
|
||||
|
||||
@@ -262,12 +269,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenSendCrossThreadDataWhenWh
|
||||
kernel->getPatchInfoDataList().push_back(patchInfoData1);
|
||||
kernel->getPatchInfoDataList().push_back(patchInfoData2);
|
||||
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
||||
auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||
auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
|
||||
indirectHeap,
|
||||
*kernel,
|
||||
false,
|
||||
nullptr,
|
||||
sizeCrossThreadData);
|
||||
sizeCrossThreadData,
|
||||
0);
|
||||
|
||||
ASSERT_NE(0u, offsetCrossThreadData);
|
||||
EXPECT_EQ(128u, offsetCrossThreadData);
|
||||
@@ -340,7 +348,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernel);
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -356,6 +364,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
|
||||
pWalkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
0,
|
||||
*pDevice);
|
||||
|
||||
// It's okay these are EXPECT_GE as they're only going to be used for
|
||||
@@ -397,7 +406,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -413,6 +422,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
|
||||
pWalkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
0,
|
||||
*pDevice);
|
||||
|
||||
auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
|
||||
@@ -445,7 +455,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -461,6 +471,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
|
||||
pWalkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
0,
|
||||
*pDevice);
|
||||
|
||||
auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
|
||||
@@ -526,7 +537,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(mockKernel);
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -542,6 +553,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
|
||||
pWalkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
0,
|
||||
*pDevice);
|
||||
|
||||
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
||||
@@ -567,6 +579,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
|
||||
typedef typename FamilyType::BINDING_TABLE_STATE BINDING_TABLE_STATE;
|
||||
typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
|
||||
using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
|
||||
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
|
||||
std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
|
||||
@@ -619,7 +632,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernel);
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -635,6 +648,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
|
||||
pWalkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
0,
|
||||
*pDevice);
|
||||
|
||||
EXPECT_EQ(sshUsed + 0x00000000u, *(&bindingTableStatesPointers[0]));
|
||||
@@ -730,7 +744,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*pKernel);
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -746,6 +760,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
|
||||
pWalkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
0,
|
||||
*pDevice);
|
||||
|
||||
bti = reinterpret_cast<typename FamilyType::BINDING_TABLE_STATE *>(reinterpret_cast<unsigned char *>(ssh.getCpuBase()) + localSshOffset + btiOffset);
|
||||
@@ -901,7 +916,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
|
||||
// Undefined Offset, Defined BorderColorOffset
|
||||
mockKernelWithInternal->kernelInfo.setSamplerTable(0, 2, undefined<uint16_t>);
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -917,6 +932,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
|
||||
pWalkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
0,
|
||||
*pDevice);
|
||||
|
||||
auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
|
||||
@@ -926,7 +942,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
|
||||
// Defined Offset, Undefined BorderColorOffset
|
||||
mockKernelWithInternal->kernelInfo.setSamplerTable(undefined<uint16_t>, 2, 0);
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -942,6 +958,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
|
||||
pWalkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
0,
|
||||
*pDevice);
|
||||
|
||||
interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
|
||||
@@ -998,7 +1015,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
@@ -1014,6 +1031,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
|
||||
pWalkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
0,
|
||||
*pDevice);
|
||||
|
||||
bool isMemorySame = memcmp(borderColorPointer, mockDsh, samplerTableOffset) == 0;
|
||||
@@ -1136,12 +1154,14 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
|
||||
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor(), false, gfxCoreHelper);
|
||||
|
||||
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
|
||||
indirectHeap,
|
||||
kernel,
|
||||
false,
|
||||
nullptr,
|
||||
sizeCrossThreadData);
|
||||
sizeCrossThreadData,
|
||||
0);
|
||||
|
||||
EXPECT_LE(implicitArgsProgrammingSize, indirectHeap.getUsed());
|
||||
|
||||
@@ -1272,40 +1292,46 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
||||
using HardwareCommandsTestXeHpAndLater = HardwareCommandsTest;
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) {
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
auto nonInternalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
|
||||
IndirectHeap indirectHeap(nonInternalAllocation, false);
|
||||
|
||||
auto expectedOffset = is64bit ? 0u : indirectHeap.getHeapGpuBase();
|
||||
|
||||
auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
|
||||
auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||
auto offset = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
|
||||
indirectHeap,
|
||||
*mockKernelWithInternal->mockKernel,
|
||||
false,
|
||||
nullptr,
|
||||
sizeCrossThreadData);
|
||||
sizeCrossThreadData,
|
||||
0);
|
||||
EXPECT_EQ(expectedOffset, offset);
|
||||
pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeapAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenHeapBaseOffsetIsReturned) {
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
auto internalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties(pDevice->getRootDeviceIndex(), true, MemoryConstants::pageSize, AllocationType::INTERNAL_HEAP, pDevice->getDeviceBitfield()));
|
||||
IndirectHeap indirectHeap(internalAllocation, true);
|
||||
auto expectedOffset = is64bit ? internalAllocation->getGpuAddressToPatch() : 0u;
|
||||
|
||||
auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
|
||||
auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||
auto offset = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
|
||||
indirectHeap,
|
||||
*mockKernelWithInternal->mockKernel,
|
||||
false,
|
||||
nullptr,
|
||||
sizeCrossThreadData);
|
||||
sizeCrossThreadData,
|
||||
0);
|
||||
EXPECT_EQ(expectedOffset, offset);
|
||||
|
||||
pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsSetThenAddPatchInfoDataOffsetsAreMoved) {
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.AddPatchInfoCommentsForAUBDump.set(true);
|
||||
|
||||
@@ -1327,12 +1353,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenSendCrossThr
|
||||
kernel->getPatchInfoDataList().push_back(patchInfoData1);
|
||||
kernel->getPatchInfoDataList().push_back(patchInfoData2);
|
||||
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
||||
auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
|
||||
auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
|
||||
indirectHeap,
|
||||
*kernel,
|
||||
false,
|
||||
nullptr,
|
||||
sizeCrossThreadData);
|
||||
sizeCrossThreadData,
|
||||
0);
|
||||
|
||||
auto expectedOffsetRelativeToIohBase = 128u;
|
||||
auto iohBaseAddress = is64bit ? 0u : indirectHeap.getHeapGpuBase();
|
||||
|
||||
@@ -267,7 +267,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWhenDispat
|
||||
device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.currentTimestampPacketNodes = &timestampPacket;
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<GPGPU_WALKER>(
|
||||
*mockCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -306,7 +306,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketDisabledWh
|
||||
device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.currentTimestampPacketNodes = &timestampPacket;
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
|
||||
*mockCmdQ,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
@@ -1401,7 +1401,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenDispatchingTh
|
||||
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.currentTimestampPacketNodes = &timestamp7;
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<WALKER>(
|
||||
*mockCmdQ,
|
||||
multiDispatchInfo,
|
||||
csrDeps,
|
||||
@@ -1475,7 +1475,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledOnDifferentCSRsFr
|
||||
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.currentTimestampPacketNodes = &timestamp7;
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<WALKER>(
|
||||
*mockCmdQ,
|
||||
multiDispatchInfo,
|
||||
csrDeps,
|
||||
@@ -1534,7 +1534,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndDependenciesRe
|
||||
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
|
||||
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<WALKER>(
|
||||
*mockCmdQ,
|
||||
multiDispatchInfo,
|
||||
csrDeps,
|
||||
|
||||
@@ -58,8 +58,8 @@ static uint32_t slmSizeInKb[] = {1, 4, 8, 16, 32, 64};
|
||||
HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgrammingSlmThenProgrammingIsCorrect) {
|
||||
ASSERT_NE(nullptr, pClDevice);
|
||||
CommandQueueHw<FamilyType> cmdQ(nullptr, pClDevice, 0, false);
|
||||
typedef typename FamilyType::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA;
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
WALKER_TYPE walkerCmd{};
|
||||
// define kernel info
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.barrierCount = 1;
|
||||
@@ -74,7 +74,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgr
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
uint64_t interfaceDescriptorOffset = indirectHeap.getUsed();
|
||||
|
||||
size_t offsetInterfaceDescriptorData = HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
|
||||
size_t offsetInterfaceDescriptorData = HardwareCommandsHelper<FamilyType>::template sendInterfaceDescriptorData<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>(
|
||||
indirectHeap,
|
||||
interfaceDescriptorOffset,
|
||||
0,
|
||||
@@ -88,9 +88,9 @@ HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgr
|
||||
kernel,
|
||||
4u,
|
||||
pDevice->getPreemptionMode(),
|
||||
nullptr,
|
||||
*pDevice,
|
||||
&walkerCmd);
|
||||
&walkerCmd,
|
||||
nullptr);
|
||||
|
||||
// add the heap base + offset
|
||||
uint32_t *pIdData = (uint32_t *)indirectHeap.getCpuBase() + offsetInterfaceDescriptorData;
|
||||
@@ -149,6 +149,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverrideSlmAllocationSizeIsSetThenSlmSizeIsOverwritten) {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
|
||||
WALKER_TYPE walkerCmd{};
|
||||
uint32_t expectedSlmSize = 5;
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
@@ -166,7 +167,7 @@ HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverride
|
||||
uint64_t interfaceDescriptorOffset = indirectHeap.getUsed();
|
||||
INTERFACE_DESCRIPTOR_DATA interfaceDescriptorData;
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
|
||||
HardwareCommandsHelper<FamilyType>::template sendInterfaceDescriptorData<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>(
|
||||
indirectHeap,
|
||||
interfaceDescriptorOffset,
|
||||
0,
|
||||
@@ -180,9 +181,9 @@ HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverride
|
||||
kernel,
|
||||
4u,
|
||||
pDevice->getPreemptionMode(),
|
||||
&interfaceDescriptorData,
|
||||
*pDevice,
|
||||
&walkerCmd);
|
||||
&walkerCmd,
|
||||
&interfaceDescriptorData);
|
||||
|
||||
auto pInterfaceDescriptor = HardwareCommandsHelper<FamilyType>::getInterfaceDescriptor(indirectHeap, interfaceDescriptorOffset, &interfaceDescriptorData);
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ class MockCommandQueue : public CommandQueue {
|
||||
using CommandQueue::device;
|
||||
using CommandQueue::gpgpuEngine;
|
||||
using CommandQueue::h2dEngines;
|
||||
using CommandQueue::heaplessModeEnabled;
|
||||
using CommandQueue::isCopyOnly;
|
||||
using CommandQueue::isTextureCacheFlushNeeded;
|
||||
using CommandQueue::migrateMultiGraphicsAllocationsIfRequired;
|
||||
|
||||
@@ -195,7 +195,7 @@ XE_HPC_CORETEST_F(SystemMemoryFenceViaComputeWalkerTest, givenSystemMemoryFenceG
|
||||
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
|
||||
walkerArgs.currentTimestampPacketNodes = &timestampPacket;
|
||||
HardwareInterface<FamilyType>::dispatchWalker(
|
||||
HardwareInterface<FamilyType>::template dispatchWalker<COMPUTE_WALKER>(
|
||||
commandQueue,
|
||||
multiDispatchInfo,
|
||||
CsrDependencies(),
|
||||
|
||||
@@ -70,8 +70,8 @@ XE_HPC_CORETEST_F(MemoryPrefetchTestsXeHpcCore, givenKernelWhenWalkerIsProgramme
|
||||
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(workSize, wgInfo, PreemptionMode::Disabled);
|
||||
|
||||
mockKernel->kernelInfo.heapInfo.kernelHeapSize = 1;
|
||||
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
|
||||
HardwareParse hwParse;
|
||||
hwParse.parseCommands<FamilyType>(commandStream, 0);
|
||||
@@ -121,8 +121,8 @@ XE_HPC_CORETEST_F(ProgramWalkerTestsXeHpcCore, givenProperThreadGroupSizesWhenWa
|
||||
hwInfo->platform.usRevId = productHelper.getHwRevIdFromStepping(REVISION_A0, *hwInfo);
|
||||
|
||||
{
|
||||
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
HardwareParse hwParse;
|
||||
hwParse.parseCommands<FamilyType>(commandStream, 0);
|
||||
auto itorWalker = find<COMPUTE_WALKER *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
|
||||
@@ -152,8 +152,8 @@ XE_HPC_CORETEST_F(ProgramWalkerTestsXeHpcCore, givenDebugVariableSetWhenProgramm
|
||||
{
|
||||
// default
|
||||
|
||||
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
|
||||
HardwareParse hwParse;
|
||||
hwParse.parseCommands<FamilyType>(commandStream, 0);
|
||||
@@ -171,8 +171,8 @@ XE_HPC_CORETEST_F(ProgramWalkerTestsXeHpcCore, givenDebugVariableSetWhenProgramm
|
||||
commandsOffset = commandStream.getUsed();
|
||||
DebugManager.flags.ForceL3PrefetchForComputeWalker.set(1);
|
||||
|
||||
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
|
||||
HardwareParse hwParse;
|
||||
hwParse.parseCommands<FamilyType>(commandStream, commandsOffset);
|
||||
@@ -190,8 +190,8 @@ XE_HPC_CORETEST_F(ProgramWalkerTestsXeHpcCore, givenDebugVariableSetWhenProgramm
|
||||
commandsOffset = commandStream.getUsed();
|
||||
DebugManager.flags.ForceL3PrefetchForComputeWalker.set(0);
|
||||
|
||||
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
|
||||
heap, heap, heap, dispatchInfo, walkerArgs);
|
||||
|
||||
HardwareParse hwParse;
|
||||
hwParse.parseCommands<FamilyType>(commandStream, commandsOffset);
|
||||
|
||||
@@ -96,12 +96,15 @@ struct EncodeDispatchKernel {
|
||||
|
||||
static void encode(CommandContainer &container, EncodeDispatchKernelArgs &args);
|
||||
|
||||
static void encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
|
||||
template <typename WalkerType>
|
||||
static void encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
|
||||
|
||||
static void appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment,
|
||||
template <typename InterfaceDescriptorType>
|
||||
static void appendAdditionalIDDFields(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment,
|
||||
const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
|
||||
|
||||
static void setGrfInfo(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData,
|
||||
template <typename InterfaceDescriptorType>
|
||||
static void setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData,
|
||||
const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
static void *getInterfaceDescriptor(CommandContainer &container, IndirectHeap *childDsh, uint32_t &iddOffset);
|
||||
@@ -129,15 +132,19 @@ struct EncodeDispatchKernel {
|
||||
uint32_t requiredWorkGroupOrder,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
static void programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
template <typename InterfaceDescriptorType>
|
||||
static void programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
|
||||
static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd);
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
|
||||
|
||||
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
|
||||
|
||||
static void adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
|
||||
template <typename WalkerType>
|
||||
static void adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo);
|
||||
|
||||
static void setupPostSyncMocs(WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
template <typename WalkerType>
|
||||
static void setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
|
||||
static void adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
|
||||
@@ -326,6 +333,7 @@ struct EncodeStateBaseAddressArgs {
|
||||
bool multiOsContextCapable = false;
|
||||
bool isRcs = false;
|
||||
bool doubleSbaWa = false;
|
||||
bool heaplessModeEnabled = false;
|
||||
};
|
||||
|
||||
template <typename GfxFamily>
|
||||
|
||||
@@ -574,7 +574,8 @@ bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDes
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo) {}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr) {
|
||||
@@ -716,7 +717,8 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd) {}
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
|
||||
|
||||
@@ -29,7 +29,8 @@
|
||||
namespace NEO {
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf,
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
|
||||
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
auto grfSize = sizeof(typename Family::GRF);
|
||||
@@ -92,7 +93,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
EncodeDispatchKernel<Family>::programBarrierEnable(idd,
|
||||
kernelDescriptor.kernelAttributes.barrierCount,
|
||||
hwInfo);
|
||||
auto slmSize = static_cast<typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE>(
|
||||
auto slmSize = static_cast<uint32_t>(
|
||||
gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()));
|
||||
idd.setSharedLocalMemorySize(slmSize);
|
||||
|
||||
@@ -239,7 +240,9 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
false, // useGlobalAtomics
|
||||
false, // multiOsContextCapable
|
||||
args.isRcs, // isRcs
|
||||
container.doubleSbaWaRef()}; // doubleSbaWa
|
||||
container.doubleSbaWaRef(), // doubleSbaWa
|
||||
false, // heaplessModeEnabled
|
||||
};
|
||||
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
|
||||
container.setDirtyStateForAllHeaps(false);
|
||||
args.requiresUncachedMocs = false;
|
||||
@@ -394,17 +397,20 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor,
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor,
|
||||
uint32_t value,
|
||||
const HardwareInfo &hwInfo) {
|
||||
interfaceDescriptor.setBarrierEnable(value);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
|
||||
template <typename WalkerType>
|
||||
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}
|
||||
|
||||
template <typename Family>
|
||||
inline bool EncodeDispatchKernel<Family>::isDshNeeded(const DeviceInfo &deviceInfo) {
|
||||
@@ -592,7 +598,8 @@ inline void EncodeMiArbCheck<Family>::adjust(MI_ARB_CHECK &miArbCheck, std::opti
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::setupPostSyncMocs(WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {}
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment) {}
|
||||
|
||||
@@ -37,7 +37,8 @@ constexpr size_t TimestampDestinationAddressAlignment = 16;
|
||||
constexpr size_t ImmWriteDestinationAddressAlignment = 8;
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf,
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
|
||||
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
}
|
||||
@@ -77,9 +78,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
|
||||
EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData,
|
||||
sizePerThreadData, rootDeviceEnvironment);
|
||||
auto &productHelper = args.device->getProductHelper();
|
||||
productHelper.updateIddCommand(&idd, kernelDescriptor.kernelAttributes.numGrfRequired,
|
||||
kernelDescriptor.kernelAttributes.threadArbitrationPolicy);
|
||||
|
||||
bool localIdsGenerationByRuntime = args.dispatchInterface->requiresGenerationOfLocalIdsByRuntime();
|
||||
auto requiredWorkgroupOrder = args.dispatchInterface->getRequiredWorkgroupOrder();
|
||||
@@ -105,17 +103,18 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
hwInfo);
|
||||
|
||||
auto &gfxCoreHelper = args.device->getGfxCoreHelper();
|
||||
auto slmSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(
|
||||
auto slmSize = static_cast<uint32_t>(
|
||||
gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()));
|
||||
|
||||
if (DebugManager.flags.OverrideSlmAllocationSize.get() != -1) {
|
||||
slmSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(DebugManager.flags.OverrideSlmAllocationSize.get());
|
||||
slmSize = static_cast<uint32_t>(DebugManager.flags.OverrideSlmAllocationSize.get());
|
||||
}
|
||||
idd.setSharedLocalMemorySize(slmSize);
|
||||
|
||||
auto bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
|
||||
bool skipSshProgramming = false;
|
||||
|
||||
auto &productHelper = args.device->getProductHelper();
|
||||
if (productHelper.isSkippingStatefulInformationRequired(kernelDescriptor)) {
|
||||
bindingTableStateCount = 0u;
|
||||
skipSshProgramming = true;
|
||||
@@ -272,7 +271,9 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
args.useGlobalAtomics, // useGlobalAtomics
|
||||
args.partitionCount > 1, // multiOsContextCapable
|
||||
args.isRcs, // isRcs
|
||||
container.doubleSbaWaRef()}; // doubleSbaWa
|
||||
container.doubleSbaWaRef(), // doubleSbaWa
|
||||
false, // heaplessModeEnabled
|
||||
};
|
||||
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
|
||||
container.setDirtyStateForAllHeaps(false);
|
||||
}
|
||||
@@ -392,7 +393,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeDispatchKernel<Family>::setupPostSyncMocs(WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {
|
||||
template <typename WalkerType>
|
||||
inline void EncodeDispatchKernel<Family>::setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {
|
||||
auto &postSyncData = walkerCmd.getPostSync();
|
||||
auto gmmHelper = rootDeviceEnvironment.getGmmHelper();
|
||||
|
||||
|
||||
@@ -40,14 +40,16 @@ template <typename GfxFamily>
|
||||
struct ImplicitScalingDispatch {
|
||||
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
|
||||
|
||||
template <typename WalkerType>
|
||||
static size_t getSize(bool apiSelfCleanup,
|
||||
bool preferStaticPartitioning,
|
||||
const DeviceBitfield &devices,
|
||||
const Vec3<size_t> &groupStart,
|
||||
const Vec3<size_t> &groupCount);
|
||||
|
||||
template <typename WalkerType>
|
||||
static void dispatchCommands(LinearStream &commandStream,
|
||||
WALKER_TYPE &walkerCmd,
|
||||
WalkerType &walkerCmd,
|
||||
void **outWalkerPtr,
|
||||
const DeviceBitfield &devices,
|
||||
uint32_t &partitionCount,
|
||||
|
||||
@@ -56,22 +56,23 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPar
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool apiSelfCleanup,
|
||||
bool preferStaticPartitioning,
|
||||
const DeviceBitfield &devices,
|
||||
const Vec3<size_t> &groupStart,
|
||||
const Vec3<size_t> &groupCount) {
|
||||
typename GfxFamily::COMPUTE_WALKER::PARTITION_TYPE partitionType{};
|
||||
typename WalkerType::PARTITION_TYPE partitionType{};
|
||||
bool staticPartitioning = false;
|
||||
const uint32_t tileCount = static_cast<uint32_t>(devices.count());
|
||||
|
||||
const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily>(tileCount,
|
||||
preferStaticPartitioning,
|
||||
groupStart,
|
||||
groupCount,
|
||||
{},
|
||||
&partitionType,
|
||||
&staticPartitioning);
|
||||
const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily, WalkerType>(tileCount,
|
||||
preferStaticPartitioning,
|
||||
groupStart,
|
||||
groupCount,
|
||||
{},
|
||||
&partitionType,
|
||||
&staticPartitioning);
|
||||
UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));
|
||||
WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(0u,
|
||||
tileCount,
|
||||
@@ -87,8 +88,9 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool apiSelfCleanup,
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType>
|
||||
void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandStream,
|
||||
WALKER_TYPE &walkerCmd,
|
||||
WalkerType &walkerCmd,
|
||||
void **outWalkerPtr,
|
||||
const DeviceBitfield &devices,
|
||||
uint32_t &partitionCount,
|
||||
@@ -104,7 +106,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
|
||||
const bool preferStaticPartitioning = workPartitionAllocationGpuVa != 0u;
|
||||
|
||||
bool staticPartitioning = false;
|
||||
partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
|
||||
partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily, WalkerType>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
|
||||
|
||||
WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(workPartitionAllocationGpuVa,
|
||||
tileCount,
|
||||
@@ -116,35 +118,35 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
|
||||
dcFlush,
|
||||
forceExecutionOnSingleTile);
|
||||
|
||||
auto dispatchCommandsSize = getSize(apiSelfCleanup, preferStaticPartitioning, devices, {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()}, {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
|
||||
auto dispatchCommandsSize = getSize<WalkerType>(apiSelfCleanup, preferStaticPartitioning, devices, {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()}, {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
|
||||
void *commandBuffer = commandStream.getSpace(dispatchCommandsSize);
|
||||
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - dispatchCommandsSize;
|
||||
|
||||
if (staticPartitioning) {
|
||||
UNRECOVERABLE_IF(tileCount != partitionCount);
|
||||
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
|
||||
outWalkerPtr,
|
||||
cmdBufferGpuAddress,
|
||||
&walkerCmd,
|
||||
totalProgrammedSize,
|
||||
args,
|
||||
hwInfo);
|
||||
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily, WalkerType>(commandBuffer,
|
||||
outWalkerPtr,
|
||||
cmdBufferGpuAddress,
|
||||
&walkerCmd,
|
||||
totalProgrammedSize,
|
||||
args,
|
||||
hwInfo);
|
||||
} else {
|
||||
if (DebugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
|
||||
partitionCount = DebugManager.flags.ExperimentalSetWalkerPartitionCount.get();
|
||||
if (partitionCount == 1u) {
|
||||
walkerCmd.setPartitionType(GfxFamily::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
|
||||
walkerCmd.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
|
||||
}
|
||||
args.partitionCount = partitionCount;
|
||||
}
|
||||
|
||||
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
|
||||
outWalkerPtr,
|
||||
cmdBufferGpuAddress,
|
||||
&walkerCmd,
|
||||
totalProgrammedSize,
|
||||
args,
|
||||
hwInfo);
|
||||
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily, WalkerType>(commandBuffer,
|
||||
outWalkerPtr,
|
||||
cmdBufferGpuAddress,
|
||||
&walkerCmd,
|
||||
totalProgrammedSize,
|
||||
args,
|
||||
hwInfo);
|
||||
}
|
||||
UNRECOVERABLE_IF(totalProgrammedSize != dispatchCommandsSize);
|
||||
}
|
||||
|
||||
@@ -71,17 +71,19 @@ inline void *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed, siz
|
||||
return commandToReturn;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename GfxFamily, typename WalkerType>
|
||||
uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
|
||||
bool preferStaticPartitioning,
|
||||
const Vec3<size_t> &groupStart,
|
||||
const Vec3<size_t> &groupCount,
|
||||
std::optional<typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE> requestedPartitionType,
|
||||
typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE *outSelectedPartitionType,
|
||||
std::optional<typename WalkerType::PARTITION_TYPE> requestedPartitionType,
|
||||
typename WalkerType::PARTITION_TYPE *outSelectedPartitionType,
|
||||
bool *outSelectStaticPartitioning) {
|
||||
|
||||
using PARTITION_TYPE = typename WalkerType::PARTITION_TYPE;
|
||||
// For non uniform starting point, there is no support for partition in Hardware. Disable partitioning and select dynamic algorithm
|
||||
if (groupStart.x || groupStart.y || groupStart.z) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_DISABLED;
|
||||
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
|
||||
*outSelectStaticPartitioning = false;
|
||||
return 1u;
|
||||
}
|
||||
@@ -90,18 +92,18 @@ uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitio
|
||||
bool disablePartitionForPartitionCountOne{};
|
||||
|
||||
if (NEO::DebugManager.flags.ExperimentalSetWalkerPartitionType.get() != -1) {
|
||||
requestedPartitionType = static_cast<typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE>(NEO::DebugManager.flags.ExperimentalSetWalkerPartitionType.get());
|
||||
requestedPartitionType = static_cast<PARTITION_TYPE>(NEO::DebugManager.flags.ExperimentalSetWalkerPartitionType.get());
|
||||
}
|
||||
|
||||
if (requestedPartitionType.has_value()) {
|
||||
switch (requestedPartitionType.value()) {
|
||||
case COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X:
|
||||
case PARTITION_TYPE::PARTITION_TYPE_X:
|
||||
workgroupCount = groupCount.x;
|
||||
break;
|
||||
case COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y:
|
||||
case PARTITION_TYPE::PARTITION_TYPE_Y:
|
||||
workgroupCount = groupCount.y;
|
||||
break;
|
||||
case COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Z:
|
||||
case PARTITION_TYPE::PARTITION_TYPE_Z:
|
||||
workgroupCount = groupCount.z;
|
||||
break;
|
||||
default:
|
||||
@@ -124,11 +126,11 @@ uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitio
|
||||
|
||||
// we first try with deepest dimension to see if we can partition there
|
||||
if (groupCount.z > 1 && (zImbalance <= minimalThreshold)) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Z;
|
||||
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
|
||||
} else if (groupCount.y > 1 && (yImbalance < minimalThreshold)) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y;
|
||||
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
|
||||
} else if (groupCount.x % preferredMinimalPartitionCount == 0) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X;
|
||||
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
|
||||
}
|
||||
// if we are here then there is no dimension that results in even distribution, choose max dimension to minimize impact
|
||||
else {
|
||||
@@ -138,11 +140,11 @@ uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitio
|
||||
if (goWithMaxAlgorithm) {
|
||||
// default mode, select greatest dimension
|
||||
if (maxDimension == groupCount.x) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X;
|
||||
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
|
||||
} else if (maxDimension == groupCount.y) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y;
|
||||
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
|
||||
} else {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Z;
|
||||
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -175,32 +177,35 @@ uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitio
|
||||
}
|
||||
|
||||
if (partitionCount == 1u && disablePartitionForPartitionCountOne) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_DISABLED;
|
||||
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
|
||||
}
|
||||
|
||||
return static_cast<uint32_t>(partitionCount);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
uint32_t computePartitionCountAndSetPartitionType(COMPUTE_WALKER<GfxFamily> *walker,
|
||||
template <typename GfxFamily, typename WalkerType>
|
||||
uint32_t computePartitionCountAndSetPartitionType(WalkerType *walker,
|
||||
uint32_t preferredMinimalPartitionCount,
|
||||
bool preferStaticPartitioning,
|
||||
bool usesImages,
|
||||
bool *outSelectStaticPartitioning) {
|
||||
|
||||
using PARTITION_TYPE = typename WalkerType::PARTITION_TYPE;
|
||||
|
||||
const Vec3<size_t> groupStart = {walker->getThreadGroupIdStartingX(), walker->getThreadGroupIdStartingY(), walker->getThreadGroupIdStartingZ()};
|
||||
const Vec3<size_t> groupCount = {walker->getThreadGroupIdXDimension(), walker->getThreadGroupIdYDimension(), walker->getThreadGroupIdZDimension()};
|
||||
std::optional<typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE> requestedPartitionType{};
|
||||
std::optional<PARTITION_TYPE> requestedPartitionType{};
|
||||
if (usesImages) {
|
||||
requestedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X;
|
||||
requestedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
|
||||
}
|
||||
typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE partitionType{};
|
||||
const auto partitionCount = computePartitionCountAndPartitionType<GfxFamily>(preferredMinimalPartitionCount,
|
||||
preferStaticPartitioning,
|
||||
groupStart,
|
||||
groupCount,
|
||||
requestedPartitionType,
|
||||
&partitionType,
|
||||
outSelectStaticPartitioning);
|
||||
PARTITION_TYPE partitionType{};
|
||||
const auto partitionCount = computePartitionCountAndPartitionType<GfxFamily, WalkerType>(preferredMinimalPartitionCount,
|
||||
preferStaticPartitioning,
|
||||
groupStart,
|
||||
groupCount,
|
||||
requestedPartitionType,
|
||||
&partitionType,
|
||||
outSelectStaticPartitioning);
|
||||
walker->setPartitionType(partitionType);
|
||||
return partitionCount;
|
||||
}
|
||||
@@ -426,10 +431,10 @@ void programSelfCleanupEndSection(void *&inputAddress,
|
||||
programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * args.tileCount);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename GfxFamily, typename WalkerType>
|
||||
void programTilesSynchronizationWithPostSyncs(void *&currentBatchBufferPointer,
|
||||
uint32_t &totalBytesProgrammed,
|
||||
COMPUTE_WALKER<GfxFamily> *inputWalker,
|
||||
WalkerType *inputWalker,
|
||||
uint32_t partitionCount) {
|
||||
const auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
|
||||
for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
|
||||
@@ -472,13 +477,13 @@ uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
|
||||
computeWalkerSectionSize<GfxFamily>();
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename GfxFamily, typename WalkerType>
|
||||
void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed,
|
||||
COMPUTE_WALKER<GfxFamily> *inputWalker,
|
||||
WalkerType *inputWalker,
|
||||
uint32_t partitionCount,
|
||||
bool forceExecutionOnSingleTile) {
|
||||
auto computeWalker = putCommand<COMPUTE_WALKER<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
||||
COMPUTE_WALKER<GfxFamily> cmd = *inputWalker;
|
||||
auto computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);
|
||||
WalkerType cmd = *inputWalker;
|
||||
|
||||
if (partitionCount > 1) {
|
||||
auto partitionType = inputWalker->getPartitionType();
|
||||
@@ -486,14 +491,14 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
|
||||
assert(inputWalker->getThreadGroupIdStartingX() == 0u);
|
||||
assert(inputWalker->getThreadGroupIdStartingY() == 0u);
|
||||
assert(inputWalker->getThreadGroupIdStartingZ() == 0u);
|
||||
assert(partitionType != COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
|
||||
assert(partitionType != WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
|
||||
|
||||
cmd.setWorkloadPartitionEnable(true);
|
||||
|
||||
auto workgroupCount = 0u;
|
||||
if (partitionType == COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X) {
|
||||
if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_X) {
|
||||
workgroupCount = inputWalker->getThreadGroupIdXDimension();
|
||||
} else if (partitionType == COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y) {
|
||||
} else if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y) {
|
||||
workgroupCount = inputWalker->getThreadGroupIdYDimension();
|
||||
} else {
|
||||
workgroupCount = inputWalker->getThreadGroupIdZDimension();
|
||||
@@ -540,11 +545,11 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
|
||||
32. BATCH_BUFFER_END ( optional )
|
||||
*/
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename GfxFamily, typename WalkerType>
|
||||
void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
|
||||
void **outWalkerPtr,
|
||||
uint64_t gpuAddressOfAllocation,
|
||||
COMPUTE_WALKER<GfxFamily> *inputWalker,
|
||||
WalkerType *inputWalker,
|
||||
uint32_t &totalBytesProgrammed,
|
||||
WalkerPartitionArgs &args,
|
||||
const NEO::HardwareInfo &hwInfo) {
|
||||
@@ -617,7 +622,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
|
||||
args.secondaryBatchBuffer);
|
||||
|
||||
// Walker section
|
||||
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.forceExecutionOnSingleTile);
|
||||
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.forceExecutionOnSingleTile);
|
||||
if (outWalkerPtr) {
|
||||
*outWalkerPtr = walkerPtr;
|
||||
}
|
||||
@@ -686,11 +691,11 @@ uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args
|
||||
bbStartSize;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename GfxFamily, typename WalkerType>
|
||||
void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
|
||||
void **outWalkerPtr,
|
||||
uint64_t gpuAddressOfAllocation,
|
||||
COMPUTE_WALKER<GfxFamily> *inputWalker,
|
||||
WalkerType *inputWalker,
|
||||
uint32_t &totalBytesProgrammed,
|
||||
WalkerPartitionArgs &args,
|
||||
const NEO::HardwareInfo &hwInfo) {
|
||||
@@ -730,7 +735,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
|
||||
|
||||
// Synchronize tiles after walker
|
||||
if (args.semaphoreProgrammingRequired) {
|
||||
programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
|
||||
programTilesSynchronizationWithPostSyncs<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
|
||||
}
|
||||
|
||||
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "shared/source/gmm_helper/page_table_mngr.h"
|
||||
#include "shared/source/helpers/api_specific_config.h"
|
||||
#include "shared/source/helpers/array_count.h"
|
||||
#include "shared/source/helpers/compiler_product_helper.h"
|
||||
#include "shared/source/helpers/flat_batch_buffer_helper.h"
|
||||
#include "shared/source/helpers/flush_stamp.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
@@ -96,6 +97,9 @@ CommandStreamReceiver::CommandStreamReceiver(ExecutionEnvironment &executionEnvi
|
||||
this->l1CachePolicyData.init(productHelper);
|
||||
|
||||
registeredClients.reserve(16);
|
||||
|
||||
auto &compilerProductHelper = rootDeviceEnvironment.getHelper<CompilerProductHelper>();
|
||||
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
|
||||
}
|
||||
|
||||
CommandStreamReceiver::~CommandStreamReceiver() {
|
||||
|
||||
@@ -573,6 +573,7 @@ class CommandStreamReceiver {
|
||||
volatile bool resourcesInitialized = false;
|
||||
bool doubleSbaWa = false;
|
||||
bool dshSupported = false;
|
||||
bool heaplessModeEnabled = false;
|
||||
};
|
||||
|
||||
typedef CommandStreamReceiver *(*CommandStreamReceiverCreateFunc)(bool withAubDump,
|
||||
|
||||
@@ -20,8 +20,9 @@ struct PipeControlArgs;
|
||||
|
||||
template <typename GfxFamily>
|
||||
class CommandStreamReceiverHw : public CommandStreamReceiver {
|
||||
typedef typename GfxFamily::MI_BATCH_BUFFER_START MI_BATCH_BUFFER_START;
|
||||
typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
|
||||
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
|
||||
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
||||
using STATE_BASE_ADDRESS = typename GfxFamily::STATE_BASE_ADDRESS;
|
||||
|
||||
struct ImmediateFlushData {
|
||||
PipelineSelectArgs pipelineSelectArgs{};
|
||||
@@ -176,6 +177,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
|
||||
}
|
||||
|
||||
void dispatchRayTracingStateCommand(LinearStream &cmdStream, Device &device);
|
||||
uint64_t getScratchPatchAddress();
|
||||
|
||||
protected:
|
||||
void programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags);
|
||||
@@ -202,7 +204,6 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
|
||||
void addPipeControlBefore3dState(LinearStream &commandStream, DispatchFlags &dispatchFlags);
|
||||
bool are4GbHeapsAvailable() const;
|
||||
|
||||
uint64_t getScratchPatchAddress();
|
||||
void createScratchSpaceController();
|
||||
|
||||
bool detectInitProgrammingFlagsRequired(const DispatchFlags &dispatchFlags) const;
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "shared/source/gmm_helper/page_table_mngr.h"
|
||||
#include "shared/source/helpers/blit_commands_helper.h"
|
||||
#include "shared/source/helpers/blit_properties.h"
|
||||
#include "shared/source/helpers/compiler_product_helper.h"
|
||||
#include "shared/source/helpers/definitions/command_encoder_args.h"
|
||||
#include "shared/source/helpers/engine_node_helper.h"
|
||||
#include "shared/source/helpers/flat_batch_buffer_helper_hw.h"
|
||||
@@ -505,7 +506,9 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
|
||||
dispatchRayTracingStateCommand(commandStreamCSR, device);
|
||||
}
|
||||
|
||||
programVFEState(commandStreamCSR, dispatchFlags, device.getDeviceInfo().maxFrontEndThreads);
|
||||
if (this->heaplessModeEnabled == false) {
|
||||
programVFEState(commandStreamCSR, dispatchFlags, device.getDeviceInfo().maxFrontEndThreads);
|
||||
}
|
||||
|
||||
programPreemption(commandStreamCSR, dispatchFlags);
|
||||
|
||||
|
||||
@@ -78,8 +78,8 @@ class PreemptionHelper {
|
||||
|
||||
static PreemptionMode getDefaultPreemptionMode(const HardwareInfo &hwInfo);
|
||||
|
||||
template <typename GfxFamily>
|
||||
static void programInterfaceDescriptorDataPreemption(INTERFACE_DESCRIPTOR_DATA<GfxFamily> *idd, PreemptionMode preemptionMode);
|
||||
template <typename GfxFamily, typename InterfaceDescriptorType>
|
||||
static void programInterfaceDescriptorDataPreemption(InterfaceDescriptorType *idd, PreemptionMode preemptionMode);
|
||||
|
||||
protected:
|
||||
template <typename GfxFamily>
|
||||
|
||||
@@ -118,8 +118,8 @@ template <typename GfxFamily>
|
||||
void PreemptionHelper::applyPreemptionWaCmdsEnd(LinearStream *pCommandStream, const Device &device) {
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void PreemptionHelper::programInterfaceDescriptorDataPreemption(INTERFACE_DESCRIPTOR_DATA<GfxFamily> *idd, PreemptionMode preemptionMode) {
|
||||
template <typename GfxFamily, typename InterfaceDescriptorType>
|
||||
void PreemptionHelper::programInterfaceDescriptorDataPreemption(InterfaceDescriptorType *idd, PreemptionMode preemptionMode) {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
|
||||
if (preemptionMode == PreemptionMode::MidThread) {
|
||||
idd->setThreadPreemptionDisable(INTERFACE_DESCRIPTOR_DATA::THREAD_PREEMPTION_DISABLE_DISABLE);
|
||||
|
||||
@@ -72,6 +72,14 @@ void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, Sta
|
||||
}
|
||||
|
||||
template struct EncodeDispatchKernel<Family>;
|
||||
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
|
||||
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
|
||||
template void EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
|
||||
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
|
||||
template struct EncodeStates<Family>;
|
||||
template struct EncodeMath<Family>;
|
||||
template struct EncodeMathMMIO<Family>;
|
||||
|
||||
@@ -136,5 +136,25 @@ struct Gen11Family : public Gen11 {
|
||||
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
|
||||
return cmdSetBaseFamily == IGFX_GEN8_CORE;
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static constexpr size_t getInterfaceDescriptorSize() {
|
||||
return sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static WalkerType getInitGpuWalker() {
|
||||
return cmdInitGpgpuWalker;
|
||||
}
|
||||
|
||||
template <typename InterfaceDescriptorType>
|
||||
static InterfaceDescriptorType getInitInterfaceDescriptor() {
|
||||
return cmdInitInterfaceDescriptorData;
|
||||
}
|
||||
|
||||
template <typename WalkerType>
|
||||
static constexpr bool isHeaplessMode() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
} // namespace NEO
|
||||
|
||||
@@ -112,6 +112,14 @@ void EncodeComputeMode<Family>::adjustPipelineSelect(CommandContainer &container
|
||||
}
|
||||
|
||||
template struct EncodeDispatchKernel<Family>;
|
||||
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
|
||||
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
|
||||
template void EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
|
||||
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
|
||||
template struct EncodeStates<Family>;
|
||||
template struct EncodeMath<Family>;
|
||||
template struct EncodeMathMMIO<Family>;
|
||||
|
||||
@@ -137,6 +137,21 @@ struct Gen12LpFamily : public Gen12Lp {
|
||||
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
|
||||
return cmdSetBaseFamily == IGFX_GEN8_CORE;
|
||||
}
|
||||
|
||||
template <typename WalkerType>
|
||||
static constexpr size_t getInterfaceDescriptorSize() {
|
||||
return sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
}
|
||||
|
||||
template <typename InterfaceDescriptorType>
|
||||
static InterfaceDescriptorType getInitInterfaceDescriptor() {
|
||||
return cmdInitInterfaceDescriptorData;
|
||||
}
|
||||
|
||||
template <typename WalkerType>
|
||||
static constexpr bool isHeaplessMode() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -52,6 +52,14 @@ void EncodeStateBaseAddress<Family>::setSbaAddressesForDebugger(NEO::Debugger::S
|
||||
}
|
||||
|
||||
template struct EncodeDispatchKernel<Family>;
|
||||
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
|
||||
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
|
||||
template void EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
|
||||
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
|
||||
template struct EncodeStates<Family>;
|
||||
template struct EncodeMath<Family>;
|
||||
template struct EncodeMathMMIO<Family>;
|
||||
|
||||
@@ -136,6 +136,26 @@ struct Gen8Family : public Gen8 {
|
||||
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
|
||||
return cmdSetBaseFamily == IGFX_GEN8_CORE;
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static constexpr size_t getInterfaceDescriptorSize() {
|
||||
return sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static WalkerType getInitGpuWalker() {
|
||||
return cmdInitGpgpuWalker;
|
||||
}
|
||||
|
||||
template <typename InterfaceDescriptorType>
|
||||
static InterfaceDescriptorType getInitInterfaceDescriptor() {
|
||||
return cmdInitInterfaceDescriptorData;
|
||||
}
|
||||
|
||||
template <typename WalkerType>
|
||||
static constexpr bool isHeaplessMode() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -57,6 +57,14 @@ void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, Sta
|
||||
}
|
||||
|
||||
template struct EncodeDispatchKernel<Family>;
|
||||
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
|
||||
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
|
||||
template void EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
|
||||
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
|
||||
template struct EncodeStates<Family>;
|
||||
template struct EncodeMath<Family>;
|
||||
template struct EncodeMathMMIO<Family>;
|
||||
|
||||
@@ -136,6 +136,26 @@ struct Gen9Family : public Gen9 {
|
||||
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
|
||||
return cmdSetBaseFamily == IGFX_GEN8_CORE;
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static constexpr size_t getInterfaceDescriptorSize() {
|
||||
return sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static WalkerType getInitGpuWalker() {
|
||||
return cmdInitGpgpuWalker;
|
||||
}
|
||||
|
||||
template <typename InterfaceDescriptorType>
|
||||
static InterfaceDescriptorType getInitInterfaceDescriptor() {
|
||||
return cmdInitInterfaceDescriptorData;
|
||||
}
|
||||
|
||||
template <typename WalkerType>
|
||||
static constexpr bool isHeaplessMode() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2022 Intel Corporation
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -47,194 +47,6 @@ typedef struct tagBINDING_TABLE_STATE {
|
||||
}
|
||||
} BINDING_TABLE_STATE;
|
||||
STATIC_ASSERT(4 == sizeof(BINDING_TABLE_STATE));
|
||||
typedef struct tagGPGPU_WALKER {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
|
||||
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
|
||||
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
|
||||
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
|
||||
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
|
||||
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
|
||||
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
|
||||
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
|
||||
uint32_t CommandType : BITFIELD_RANGE(29, 31);
|
||||
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
|
||||
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
|
||||
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
|
||||
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
|
||||
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
|
||||
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
|
||||
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
|
||||
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
|
||||
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
|
||||
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
|
||||
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
|
||||
uint32_t ThreadGroupIdStartingX;
|
||||
uint32_t Reserved_192;
|
||||
uint32_t ThreadGroupIdXDimension;
|
||||
uint32_t ThreadGroupIdStartingY;
|
||||
uint32_t Reserved_288;
|
||||
uint32_t ThreadGroupIdYDimension;
|
||||
uint32_t ThreadGroupIdStartingResumeZ;
|
||||
uint32_t ThreadGroupIdZDimension;
|
||||
uint32_t RightExecutionMask;
|
||||
uint32_t BottomExecutionMask;
|
||||
} Common;
|
||||
uint32_t RawData[15];
|
||||
} TheStructure;
|
||||
typedef enum tagDWORD_LENGTH {
|
||||
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
|
||||
} DWORD_LENGTH;
|
||||
typedef enum tagSUBOPCODE {
|
||||
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
|
||||
} SUBOPCODE;
|
||||
typedef enum tagMEDIA_COMMAND_OPCODE {
|
||||
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
|
||||
} MEDIA_COMMAND_OPCODE;
|
||||
typedef enum tagPIPELINE {
|
||||
PIPELINE_MEDIA = 0x2,
|
||||
} PIPELINE;
|
||||
typedef enum tagCOMMAND_TYPE {
|
||||
COMMAND_TYPE_GFXPIPE = 0x3,
|
||||
} COMMAND_TYPE;
|
||||
typedef enum tagSIMD_SIZE {
|
||||
SIMD_SIZE_SIMD8 = 0x0,
|
||||
SIMD_SIZE_SIMD16 = 0x1,
|
||||
SIMD_SIZE_SIMD32 = 0x2,
|
||||
} SIMD_SIZE;
|
||||
typedef enum tagPATCH_CONSTANTS {
|
||||
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
|
||||
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
|
||||
} PATCH_CONSTANTS;
|
||||
inline void init() {
|
||||
memset(&TheStructure, 0, sizeof(TheStructure));
|
||||
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
|
||||
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
|
||||
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
|
||||
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
|
||||
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
|
||||
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
|
||||
}
|
||||
static tagGPGPU_WALKER sInit() {
|
||||
GPGPU_WALKER state;
|
||||
state.init();
|
||||
return state;
|
||||
}
|
||||
inline uint32_t &getRawData(const uint32_t index) {
|
||||
DEBUG_BREAK_IF(index >= 15);
|
||||
return TheStructure.RawData[index];
|
||||
}
|
||||
inline void setPredicateEnable(const bool value) {
|
||||
TheStructure.Common.PredicateEnable = value;
|
||||
}
|
||||
inline bool getPredicateEnable() const {
|
||||
return (TheStructure.Common.PredicateEnable);
|
||||
}
|
||||
inline void setIndirectParameterEnable(const bool value) {
|
||||
TheStructure.Common.IndirectParameterEnable = value;
|
||||
}
|
||||
inline bool getIndirectParameterEnable() const {
|
||||
return (TheStructure.Common.IndirectParameterEnable);
|
||||
}
|
||||
inline void setInterfaceDescriptorOffset(const uint32_t value) {
|
||||
TheStructure.Common.InterfaceDescriptorOffset = value;
|
||||
}
|
||||
inline uint32_t getInterfaceDescriptorOffset() const {
|
||||
return (TheStructure.Common.InterfaceDescriptorOffset);
|
||||
}
|
||||
inline void setIndirectDataLength(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataLength = value;
|
||||
}
|
||||
inline uint32_t getIndirectDataLength() const {
|
||||
return (TheStructure.Common.IndirectDataLength);
|
||||
}
|
||||
typedef enum tagINDIRECTDATASTARTADDRESS {
|
||||
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
|
||||
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
|
||||
} INDIRECTDATASTARTADDRESS;
|
||||
inline void setIndirectDataStartAddress(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline uint32_t getIndirectDataStartAddress() const {
|
||||
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
|
||||
}
|
||||
inline void setThreadWidthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadWidthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadHeightCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadHeightCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadDepthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadDepthCounterMaximum = value;
|
||||
}
|
||||
inline uint32_t getThreadDepthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadDepthCounterMaximum);
|
||||
}
|
||||
inline void setSimdSize(const SIMD_SIZE value) {
|
||||
TheStructure.Common.SimdSize = value;
|
||||
}
|
||||
inline SIMD_SIZE getSimdSize() const {
|
||||
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
|
||||
}
|
||||
inline void setThreadGroupIdStartingX(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingX = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingX() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingX);
|
||||
}
|
||||
inline void setThreadGroupIdXDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdXDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdXDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdXDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingY(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingY = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingY() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingY);
|
||||
}
|
||||
inline void setThreadGroupIdYDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdYDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdYDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdYDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingResumeZ() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
|
||||
}
|
||||
inline void setThreadGroupIdZDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdZDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdZDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdZDimension);
|
||||
}
|
||||
inline void setRightExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.RightExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getRightExecutionMask() const {
|
||||
return (TheStructure.Common.RightExecutionMask);
|
||||
}
|
||||
inline void setBottomExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.BottomExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getBottomExecutionMask() const {
|
||||
return (TheStructure.Common.BottomExecutionMask);
|
||||
}
|
||||
} GPGPU_WALKER;
|
||||
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
|
||||
|
||||
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
union tagTheStructure {
|
||||
@@ -465,11 +277,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
inline uint32_t getNumberOfThreadsInGpgpuThreadGroup() const {
|
||||
return (TheStructure.Common.NumberOfThreadsInGpgpuThreadGroup);
|
||||
}
|
||||
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
|
||||
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
|
||||
TheStructure.Common.SharedLocalMemorySize = value;
|
||||
}
|
||||
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
|
||||
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
|
||||
inline uint32_t getSharedLocalMemorySize() const { // patched
|
||||
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
|
||||
}
|
||||
inline void setBarrierEnable(const uint32_t value) {
|
||||
TheStructure.Common.BarrierEnable = (value > 0u) ? 1u : 0u;
|
||||
@@ -492,6 +304,197 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
} INTERFACE_DESCRIPTOR_DATA;
|
||||
STATIC_ASSERT(32 == sizeof(INTERFACE_DESCRIPTOR_DATA));
|
||||
|
||||
typedef struct tagGPGPU_WALKER {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
|
||||
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
|
||||
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
|
||||
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
|
||||
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
|
||||
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
|
||||
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
|
||||
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
|
||||
uint32_t CommandType : BITFIELD_RANGE(29, 31);
|
||||
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
|
||||
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
|
||||
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
|
||||
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
|
||||
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
|
||||
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
|
||||
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
|
||||
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
|
||||
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
|
||||
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
|
||||
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
|
||||
uint32_t ThreadGroupIdStartingX;
|
||||
uint32_t Reserved_192;
|
||||
uint32_t ThreadGroupIdXDimension;
|
||||
uint32_t ThreadGroupIdStartingY;
|
||||
uint32_t Reserved_288;
|
||||
uint32_t ThreadGroupIdYDimension;
|
||||
uint32_t ThreadGroupIdStartingResumeZ;
|
||||
uint32_t ThreadGroupIdZDimension;
|
||||
uint32_t RightExecutionMask;
|
||||
uint32_t BottomExecutionMask;
|
||||
} Common;
|
||||
uint32_t RawData[15];
|
||||
} TheStructure;
|
||||
typedef enum tagDWORD_LENGTH {
|
||||
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
|
||||
} DWORD_LENGTH;
|
||||
typedef enum tagSUBOPCODE {
|
||||
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
|
||||
} SUBOPCODE;
|
||||
typedef enum tagMEDIA_COMMAND_OPCODE {
|
||||
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
|
||||
} MEDIA_COMMAND_OPCODE;
|
||||
typedef enum tagPIPELINE {
|
||||
PIPELINE_MEDIA = 0x2,
|
||||
} PIPELINE;
|
||||
typedef enum tagCOMMAND_TYPE {
|
||||
COMMAND_TYPE_GFXPIPE = 0x3,
|
||||
} COMMAND_TYPE;
|
||||
typedef enum tagSIMD_SIZE {
|
||||
SIMD_SIZE_SIMD8 = 0x0,
|
||||
SIMD_SIZE_SIMD16 = 0x1,
|
||||
SIMD_SIZE_SIMD32 = 0x2,
|
||||
} SIMD_SIZE;
|
||||
typedef enum tagPATCH_CONSTANTS {
|
||||
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
|
||||
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
|
||||
} PATCH_CONSTANTS;
|
||||
inline void init() {
|
||||
memset(&TheStructure, 0, sizeof(TheStructure));
|
||||
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
|
||||
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
|
||||
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
|
||||
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
|
||||
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
|
||||
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
|
||||
}
|
||||
static tagGPGPU_WALKER sInit() {
|
||||
GPGPU_WALKER state;
|
||||
state.init();
|
||||
return state;
|
||||
}
|
||||
inline uint32_t &getRawData(const uint32_t index) {
|
||||
DEBUG_BREAK_IF(index >= 15);
|
||||
return TheStructure.RawData[index];
|
||||
}
|
||||
inline void setPredicateEnable(const bool value) {
|
||||
TheStructure.Common.PredicateEnable = value;
|
||||
}
|
||||
inline bool getPredicateEnable() const {
|
||||
return (TheStructure.Common.PredicateEnable);
|
||||
}
|
||||
inline void setIndirectParameterEnable(const bool value) {
|
||||
TheStructure.Common.IndirectParameterEnable = value;
|
||||
}
|
||||
inline bool getIndirectParameterEnable() const {
|
||||
return (TheStructure.Common.IndirectParameterEnable);
|
||||
}
|
||||
inline void setInterfaceDescriptorOffset(const uint32_t value) {
|
||||
TheStructure.Common.InterfaceDescriptorOffset = value;
|
||||
}
|
||||
inline uint32_t getInterfaceDescriptorOffset() const {
|
||||
return (TheStructure.Common.InterfaceDescriptorOffset);
|
||||
}
|
||||
inline void setIndirectDataLength(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataLength = value;
|
||||
}
|
||||
inline uint32_t getIndirectDataLength() const {
|
||||
return (TheStructure.Common.IndirectDataLength);
|
||||
}
|
||||
typedef enum tagINDIRECTDATASTARTADDRESS {
|
||||
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
|
||||
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
|
||||
} INDIRECTDATASTARTADDRESS;
|
||||
inline void setIndirectDataStartAddress(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline uint32_t getIndirectDataStartAddress() const {
|
||||
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
|
||||
}
|
||||
inline void setThreadWidthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadWidthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadHeightCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadHeightCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadDepthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadDepthCounterMaximum = value;
|
||||
}
|
||||
inline uint32_t getThreadDepthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadDepthCounterMaximum);
|
||||
}
|
||||
inline void setSimdSize(const SIMD_SIZE value) {
|
||||
TheStructure.Common.SimdSize = value;
|
||||
}
|
||||
inline SIMD_SIZE getSimdSize() const {
|
||||
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
|
||||
}
|
||||
inline void setThreadGroupIdStartingX(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingX = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingX() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingX);
|
||||
}
|
||||
inline void setThreadGroupIdXDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdXDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdXDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdXDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingY(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingY = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingY() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingY);
|
||||
}
|
||||
inline void setThreadGroupIdYDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdYDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdYDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdYDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingResumeZ() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
|
||||
}
|
||||
inline void setThreadGroupIdZDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdZDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdZDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdZDimension);
|
||||
}
|
||||
inline void setRightExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.RightExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getRightExecutionMask() const {
|
||||
return (TheStructure.Common.RightExecutionMask);
|
||||
}
|
||||
inline void setBottomExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.BottomExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getBottomExecutionMask() const {
|
||||
return (TheStructure.Common.BottomExecutionMask);
|
||||
}
|
||||
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
|
||||
|
||||
} GPGPU_WALKER;
|
||||
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
|
||||
|
||||
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
|
||||
@@ -48,205 +48,6 @@ typedef struct tagBINDING_TABLE_STATE {
|
||||
} BINDING_TABLE_STATE;
|
||||
STATIC_ASSERT(4 == sizeof(BINDING_TABLE_STATE));
|
||||
|
||||
typedef struct tagGPGPU_WALKER {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
// DWORD 0
|
||||
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
|
||||
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
|
||||
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
|
||||
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
|
||||
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
|
||||
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
|
||||
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
|
||||
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
|
||||
uint32_t CommandType : BITFIELD_RANGE(29, 31);
|
||||
// DWORD 1
|
||||
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
|
||||
// DWORD 2
|
||||
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
|
||||
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
|
||||
// DWORD 3
|
||||
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
|
||||
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
|
||||
// DWORD 4
|
||||
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
|
||||
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
|
||||
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
|
||||
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
|
||||
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
|
||||
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
|
||||
// DWORD 5
|
||||
uint32_t ThreadGroupIdStartingX;
|
||||
// DWORD 6
|
||||
uint32_t Reserved_192;
|
||||
// DWORD 7
|
||||
uint32_t ThreadGroupIdXDimension;
|
||||
// DWORD 8
|
||||
uint32_t ThreadGroupIdStartingY;
|
||||
// DWORD 9
|
||||
uint32_t Reserved_288;
|
||||
// DWORD 10
|
||||
uint32_t ThreadGroupIdYDimension;
|
||||
// DWORD 11
|
||||
uint32_t ThreadGroupIdStartingResumeZ;
|
||||
// DWORD 12
|
||||
uint32_t ThreadGroupIdZDimension;
|
||||
// DWORD 13
|
||||
uint32_t RightExecutionMask;
|
||||
// DWORD 14
|
||||
uint32_t BottomExecutionMask;
|
||||
} Common;
|
||||
uint32_t RawData[15];
|
||||
} TheStructure;
|
||||
typedef enum tagDWORD_LENGTH {
|
||||
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
|
||||
} DWORD_LENGTH;
|
||||
typedef enum tagSUBOPCODE {
|
||||
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
|
||||
} SUBOPCODE;
|
||||
typedef enum tagMEDIA_COMMAND_OPCODE {
|
||||
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
|
||||
} MEDIA_COMMAND_OPCODE;
|
||||
typedef enum tagPIPELINE {
|
||||
PIPELINE_MEDIA = 0x2,
|
||||
} PIPELINE;
|
||||
typedef enum tagCOMMAND_TYPE {
|
||||
COMMAND_TYPE_GFXPIPE = 0x3,
|
||||
} COMMAND_TYPE;
|
||||
typedef enum tagSIMD_SIZE {
|
||||
SIMD_SIZE_SIMD8 = 0x0,
|
||||
SIMD_SIZE_SIMD16 = 0x1,
|
||||
SIMD_SIZE_SIMD32 = 0x2,
|
||||
} SIMD_SIZE;
|
||||
inline void init() {
|
||||
memset(&TheStructure, 0, sizeof(TheStructure));
|
||||
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
|
||||
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
|
||||
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
|
||||
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
|
||||
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
|
||||
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
|
||||
}
|
||||
static tagGPGPU_WALKER sInit() {
|
||||
GPGPU_WALKER state;
|
||||
state.init();
|
||||
return state;
|
||||
}
|
||||
inline uint32_t &getRawData(const uint32_t index) {
|
||||
return TheStructure.RawData[index];
|
||||
}
|
||||
inline void setPredicateEnable(const bool value) {
|
||||
TheStructure.Common.PredicateEnable = value;
|
||||
}
|
||||
inline bool getPredicateEnable() const {
|
||||
return TheStructure.Common.PredicateEnable;
|
||||
}
|
||||
inline void setIndirectParameterEnable(const bool value) {
|
||||
TheStructure.Common.IndirectParameterEnable = value;
|
||||
}
|
||||
inline bool getIndirectParameterEnable() const {
|
||||
return TheStructure.Common.IndirectParameterEnable;
|
||||
}
|
||||
inline void setInterfaceDescriptorOffset(const uint32_t value) {
|
||||
TheStructure.Common.InterfaceDescriptorOffset = value;
|
||||
}
|
||||
inline uint32_t getInterfaceDescriptorOffset() const {
|
||||
return TheStructure.Common.InterfaceDescriptorOffset;
|
||||
}
|
||||
inline void setIndirectDataLength(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataLength = value;
|
||||
}
|
||||
inline uint32_t getIndirectDataLength() const {
|
||||
return TheStructure.Common.IndirectDataLength;
|
||||
}
|
||||
typedef enum tagINDIRECTDATASTARTADDRESS {
|
||||
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
|
||||
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
|
||||
} INDIRECTDATASTARTADDRESS;
|
||||
inline void setIndirectDataStartAddress(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline uint32_t getIndirectDataStartAddress() const {
|
||||
return TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline void setThreadWidthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadWidthCounterMaximum() const {
|
||||
return TheStructure.Common.ThreadWidthCounterMaximum + 1;
|
||||
}
|
||||
inline void setThreadHeightCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadHeightCounterMaximum() const {
|
||||
return TheStructure.Common.ThreadHeightCounterMaximum + 1;
|
||||
}
|
||||
inline void setThreadDepthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadDepthCounterMaximum = value;
|
||||
}
|
||||
inline uint32_t getThreadDepthCounterMaximum() const {
|
||||
return TheStructure.Common.ThreadDepthCounterMaximum;
|
||||
}
|
||||
inline void setSimdSize(const SIMD_SIZE value) {
|
||||
TheStructure.Common.SimdSize = value;
|
||||
}
|
||||
inline SIMD_SIZE getSimdSize() const {
|
||||
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
|
||||
}
|
||||
inline void setThreadGroupIdStartingX(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingX = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingX() const {
|
||||
return TheStructure.Common.ThreadGroupIdStartingX;
|
||||
}
|
||||
inline void setThreadGroupIdXDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdXDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdXDimension() const {
|
||||
return TheStructure.Common.ThreadGroupIdXDimension;
|
||||
}
|
||||
inline void setThreadGroupIdStartingY(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingY = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingY() const {
|
||||
return TheStructure.Common.ThreadGroupIdStartingY;
|
||||
}
|
||||
inline void setThreadGroupIdYDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdYDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdYDimension() const {
|
||||
return TheStructure.Common.ThreadGroupIdYDimension;
|
||||
}
|
||||
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingResumeZ() const {
|
||||
return TheStructure.Common.ThreadGroupIdStartingResumeZ;
|
||||
}
|
||||
inline void setThreadGroupIdZDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdZDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdZDimension() const {
|
||||
return TheStructure.Common.ThreadGroupIdZDimension;
|
||||
}
|
||||
inline void setRightExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.RightExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getRightExecutionMask() const {
|
||||
return TheStructure.Common.RightExecutionMask;
|
||||
}
|
||||
inline void setBottomExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.BottomExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getBottomExecutionMask() const {
|
||||
return TheStructure.Common.BottomExecutionMask;
|
||||
}
|
||||
} GPGPU_WALKER;
|
||||
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
|
||||
|
||||
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
@@ -486,11 +287,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
inline OVER_DISPATCH_CONTROL getOverDispatchControl() const {
|
||||
return static_cast<OVER_DISPATCH_CONTROL>(TheStructure.Common.OverDispatchControl);
|
||||
}
|
||||
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
|
||||
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
|
||||
TheStructure.Common.SharedLocalMemorySize = value;
|
||||
}
|
||||
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
|
||||
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
|
||||
inline uint32_t getSharedLocalMemorySize() const { // patched
|
||||
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
|
||||
}
|
||||
inline void setBarrierEnable(const uint32_t value) {
|
||||
TheStructure.Common.BarrierEnable = (value > 0u) ? 1u : 0u;
|
||||
@@ -513,6 +314,208 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
} INTERFACE_DESCRIPTOR_DATA;
|
||||
STATIC_ASSERT(32 == sizeof(INTERFACE_DESCRIPTOR_DATA));
|
||||
|
||||
typedef struct tagGPGPU_WALKER {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
// DWORD 0
|
||||
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
|
||||
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
|
||||
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
|
||||
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
|
||||
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
|
||||
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
|
||||
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
|
||||
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
|
||||
uint32_t CommandType : BITFIELD_RANGE(29, 31);
|
||||
// DWORD 1
|
||||
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
|
||||
// DWORD 2
|
||||
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
|
||||
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
|
||||
// DWORD 3
|
||||
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
|
||||
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
|
||||
// DWORD 4
|
||||
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
|
||||
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
|
||||
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
|
||||
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
|
||||
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
|
||||
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
|
||||
// DWORD 5
|
||||
uint32_t ThreadGroupIdStartingX;
|
||||
// DWORD 6
|
||||
uint32_t Reserved_192;
|
||||
// DWORD 7
|
||||
uint32_t ThreadGroupIdXDimension;
|
||||
// DWORD 8
|
||||
uint32_t ThreadGroupIdStartingY;
|
||||
// DWORD 9
|
||||
uint32_t Reserved_288;
|
||||
// DWORD 10
|
||||
uint32_t ThreadGroupIdYDimension;
|
||||
// DWORD 11
|
||||
uint32_t ThreadGroupIdStartingResumeZ;
|
||||
// DWORD 12
|
||||
uint32_t ThreadGroupIdZDimension;
|
||||
// DWORD 13
|
||||
uint32_t RightExecutionMask;
|
||||
// DWORD 14
|
||||
uint32_t BottomExecutionMask;
|
||||
} Common;
|
||||
uint32_t RawData[15];
|
||||
} TheStructure;
|
||||
typedef enum tagDWORD_LENGTH {
|
||||
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
|
||||
} DWORD_LENGTH;
|
||||
typedef enum tagSUBOPCODE {
|
||||
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
|
||||
} SUBOPCODE;
|
||||
typedef enum tagMEDIA_COMMAND_OPCODE {
|
||||
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
|
||||
} MEDIA_COMMAND_OPCODE;
|
||||
typedef enum tagPIPELINE {
|
||||
PIPELINE_MEDIA = 0x2,
|
||||
} PIPELINE;
|
||||
typedef enum tagCOMMAND_TYPE {
|
||||
COMMAND_TYPE_GFXPIPE = 0x3,
|
||||
} COMMAND_TYPE;
|
||||
typedef enum tagSIMD_SIZE {
|
||||
SIMD_SIZE_SIMD8 = 0x0,
|
||||
SIMD_SIZE_SIMD16 = 0x1,
|
||||
SIMD_SIZE_SIMD32 = 0x2,
|
||||
} SIMD_SIZE;
|
||||
inline void init() {
|
||||
memset(&TheStructure, 0, sizeof(TheStructure));
|
||||
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
|
||||
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
|
||||
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
|
||||
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
|
||||
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
|
||||
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
|
||||
}
|
||||
static tagGPGPU_WALKER sInit() {
|
||||
GPGPU_WALKER state;
|
||||
state.init();
|
||||
return state;
|
||||
}
|
||||
inline uint32_t &getRawData(const uint32_t index) {
|
||||
return TheStructure.RawData[index];
|
||||
}
|
||||
inline void setPredicateEnable(const bool value) {
|
||||
TheStructure.Common.PredicateEnable = value;
|
||||
}
|
||||
inline bool getPredicateEnable() const {
|
||||
return TheStructure.Common.PredicateEnable;
|
||||
}
|
||||
inline void setIndirectParameterEnable(const bool value) {
|
||||
TheStructure.Common.IndirectParameterEnable = value;
|
||||
}
|
||||
inline bool getIndirectParameterEnable() const {
|
||||
return TheStructure.Common.IndirectParameterEnable;
|
||||
}
|
||||
inline void setInterfaceDescriptorOffset(const uint32_t value) {
|
||||
TheStructure.Common.InterfaceDescriptorOffset = value;
|
||||
}
|
||||
inline uint32_t getInterfaceDescriptorOffset() const {
|
||||
return TheStructure.Common.InterfaceDescriptorOffset;
|
||||
}
|
||||
inline void setIndirectDataLength(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataLength = value;
|
||||
}
|
||||
inline uint32_t getIndirectDataLength() const {
|
||||
return TheStructure.Common.IndirectDataLength;
|
||||
}
|
||||
typedef enum tagINDIRECTDATASTARTADDRESS {
|
||||
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
|
||||
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
|
||||
} INDIRECTDATASTARTADDRESS;
|
||||
inline void setIndirectDataStartAddress(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline uint32_t getIndirectDataStartAddress() const {
|
||||
return TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline void setThreadWidthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadWidthCounterMaximum() const {
|
||||
return TheStructure.Common.ThreadWidthCounterMaximum + 1;
|
||||
}
|
||||
inline void setThreadHeightCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadHeightCounterMaximum() const {
|
||||
return TheStructure.Common.ThreadHeightCounterMaximum + 1;
|
||||
}
|
||||
inline void setThreadDepthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadDepthCounterMaximum = value;
|
||||
}
|
||||
inline uint32_t getThreadDepthCounterMaximum() const {
|
||||
return TheStructure.Common.ThreadDepthCounterMaximum;
|
||||
}
|
||||
inline void setSimdSize(const SIMD_SIZE value) {
|
||||
TheStructure.Common.SimdSize = value;
|
||||
}
|
||||
inline SIMD_SIZE getSimdSize() const {
|
||||
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
|
||||
}
|
||||
inline void setThreadGroupIdStartingX(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingX = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingX() const {
|
||||
return TheStructure.Common.ThreadGroupIdStartingX;
|
||||
}
|
||||
inline void setThreadGroupIdXDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdXDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdXDimension() const {
|
||||
return TheStructure.Common.ThreadGroupIdXDimension;
|
||||
}
|
||||
inline void setThreadGroupIdStartingY(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingY = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingY() const {
|
||||
return TheStructure.Common.ThreadGroupIdStartingY;
|
||||
}
|
||||
inline void setThreadGroupIdYDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdYDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdYDimension() const {
|
||||
return TheStructure.Common.ThreadGroupIdYDimension;
|
||||
}
|
||||
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingResumeZ() const {
|
||||
return TheStructure.Common.ThreadGroupIdStartingResumeZ;
|
||||
}
|
||||
inline void setThreadGroupIdZDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdZDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdZDimension() const {
|
||||
return TheStructure.Common.ThreadGroupIdZDimension;
|
||||
}
|
||||
inline void setRightExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.RightExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getRightExecutionMask() const {
|
||||
return TheStructure.Common.RightExecutionMask;
|
||||
}
|
||||
inline void setBottomExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.BottomExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getBottomExecutionMask() const {
|
||||
return TheStructure.Common.BottomExecutionMask;
|
||||
}
|
||||
|
||||
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
|
||||
|
||||
} GPGPU_WALKER;
|
||||
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
|
||||
|
||||
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2022 Intel Corporation
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -48,195 +48,6 @@ typedef struct tagBINDING_TABLE_STATE {
|
||||
} BINDING_TABLE_STATE;
|
||||
STATIC_ASSERT(4 == sizeof(BINDING_TABLE_STATE));
|
||||
|
||||
typedef struct tagGPGPU_WALKER {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
|
||||
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
|
||||
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
|
||||
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
|
||||
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
|
||||
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
|
||||
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
|
||||
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
|
||||
uint32_t CommandType : BITFIELD_RANGE(29, 31);
|
||||
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
|
||||
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
|
||||
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
|
||||
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
|
||||
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
|
||||
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
|
||||
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
|
||||
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
|
||||
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
|
||||
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
|
||||
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
|
||||
uint32_t ThreadGroupIdStartingX;
|
||||
uint32_t Reserved_192;
|
||||
uint32_t ThreadGroupIdXDimension;
|
||||
uint32_t ThreadGroupIdStartingY;
|
||||
uint32_t Reserved_288;
|
||||
uint32_t ThreadGroupIdYDimension;
|
||||
uint32_t ThreadGroupIdStartingResumeZ;
|
||||
uint32_t ThreadGroupIdZDimension;
|
||||
uint32_t RightExecutionMask;
|
||||
uint32_t BottomExecutionMask;
|
||||
} Common;
|
||||
uint32_t RawData[15];
|
||||
} TheStructure;
|
||||
typedef enum tagDWORD_LENGTH {
|
||||
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
|
||||
} DWORD_LENGTH;
|
||||
typedef enum tagSUBOPCODE {
|
||||
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
|
||||
} SUBOPCODE;
|
||||
typedef enum tagMEDIA_COMMAND_OPCODE {
|
||||
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
|
||||
} MEDIA_COMMAND_OPCODE;
|
||||
typedef enum tagPIPELINE {
|
||||
PIPELINE_MEDIA = 0x2,
|
||||
} PIPELINE;
|
||||
typedef enum tagCOMMAND_TYPE {
|
||||
COMMAND_TYPE_GFXPIPE = 0x3,
|
||||
} COMMAND_TYPE;
|
||||
typedef enum tagSIMD_SIZE {
|
||||
SIMD_SIZE_SIMD8 = 0x0,
|
||||
SIMD_SIZE_SIMD16 = 0x1,
|
||||
SIMD_SIZE_SIMD32 = 0x2,
|
||||
} SIMD_SIZE;
|
||||
typedef enum tagPATCH_CONSTANTS {
|
||||
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
|
||||
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
|
||||
} PATCH_CONSTANTS;
|
||||
inline void init() {
|
||||
memset(&TheStructure, 0, sizeof(TheStructure));
|
||||
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
|
||||
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
|
||||
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
|
||||
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
|
||||
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
|
||||
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
|
||||
}
|
||||
static tagGPGPU_WALKER sInit() {
|
||||
GPGPU_WALKER state;
|
||||
state.init();
|
||||
return state;
|
||||
}
|
||||
inline uint32_t &getRawData(const uint32_t index) {
|
||||
DEBUG_BREAK_IF(index >= 15);
|
||||
return TheStructure.RawData[index];
|
||||
}
|
||||
inline void setPredicateEnable(const bool value) {
|
||||
TheStructure.Common.PredicateEnable = value;
|
||||
}
|
||||
inline bool getPredicateEnable() const {
|
||||
return (TheStructure.Common.PredicateEnable);
|
||||
}
|
||||
inline void setIndirectParameterEnable(const bool value) {
|
||||
TheStructure.Common.IndirectParameterEnable = value;
|
||||
}
|
||||
inline bool getIndirectParameterEnable() const {
|
||||
return (TheStructure.Common.IndirectParameterEnable);
|
||||
}
|
||||
inline void setInterfaceDescriptorOffset(const uint32_t value) {
|
||||
TheStructure.Common.InterfaceDescriptorOffset = value;
|
||||
}
|
||||
inline uint32_t getInterfaceDescriptorOffset() const {
|
||||
return (TheStructure.Common.InterfaceDescriptorOffset);
|
||||
}
|
||||
inline void setIndirectDataLength(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataLength = value;
|
||||
}
|
||||
inline uint32_t getIndirectDataLength() const {
|
||||
return (TheStructure.Common.IndirectDataLength);
|
||||
}
|
||||
typedef enum tagINDIRECTDATASTARTADDRESS {
|
||||
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
|
||||
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
|
||||
} INDIRECTDATASTARTADDRESS;
|
||||
inline void setIndirectDataStartAddress(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline uint32_t getIndirectDataStartAddress() const {
|
||||
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
|
||||
}
|
||||
inline void setThreadWidthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadWidthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadHeightCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadHeightCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadDepthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadDepthCounterMaximum = value;
|
||||
}
|
||||
inline uint32_t getThreadDepthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadDepthCounterMaximum);
|
||||
}
|
||||
inline void setSimdSize(const SIMD_SIZE value) {
|
||||
TheStructure.Common.SimdSize = value;
|
||||
}
|
||||
inline SIMD_SIZE getSimdSize() const {
|
||||
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
|
||||
}
|
||||
inline void setThreadGroupIdStartingX(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingX = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingX() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingX);
|
||||
}
|
||||
inline void setThreadGroupIdXDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdXDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdXDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdXDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingY(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingY = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingY() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingY);
|
||||
}
|
||||
inline void setThreadGroupIdYDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdYDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdYDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdYDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingResumeZ() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
|
||||
}
|
||||
inline void setThreadGroupIdZDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdZDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdZDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdZDimension);
|
||||
}
|
||||
inline void setRightExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.RightExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getRightExecutionMask() const {
|
||||
return (TheStructure.Common.RightExecutionMask);
|
||||
}
|
||||
inline void setBottomExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.BottomExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getBottomExecutionMask() const {
|
||||
return (TheStructure.Common.BottomExecutionMask);
|
||||
}
|
||||
} GPGPU_WALKER;
|
||||
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
|
||||
|
||||
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
@@ -452,11 +263,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
inline uint32_t getNumberOfThreadsInGpgpuThreadGroup() const {
|
||||
return (TheStructure.Common.NumberOfThreadsInGpgpuThreadGroup);
|
||||
}
|
||||
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
|
||||
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
|
||||
TheStructure.Common.SharedLocalMemorySize = value;
|
||||
}
|
||||
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
|
||||
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
|
||||
inline uint32_t getSharedLocalMemorySize() const { // patched
|
||||
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
|
||||
}
|
||||
inline void setBarrierEnable(const uint32_t value) {
|
||||
TheStructure.Common.BarrierEnable = (value > 0u) ? 1u : 0u;
|
||||
@@ -479,6 +290,196 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
} INTERFACE_DESCRIPTOR_DATA;
|
||||
STATIC_ASSERT(32 == sizeof(INTERFACE_DESCRIPTOR_DATA));
|
||||
|
||||
typedef struct tagGPGPU_WALKER {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
|
||||
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
|
||||
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
|
||||
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
|
||||
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
|
||||
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
|
||||
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
|
||||
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
|
||||
uint32_t CommandType : BITFIELD_RANGE(29, 31);
|
||||
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
|
||||
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
|
||||
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
|
||||
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
|
||||
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
|
||||
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
|
||||
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
|
||||
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
|
||||
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
|
||||
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
|
||||
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
|
||||
uint32_t ThreadGroupIdStartingX;
|
||||
uint32_t Reserved_192;
|
||||
uint32_t ThreadGroupIdXDimension;
|
||||
uint32_t ThreadGroupIdStartingY;
|
||||
uint32_t Reserved_288;
|
||||
uint32_t ThreadGroupIdYDimension;
|
||||
uint32_t ThreadGroupIdStartingResumeZ;
|
||||
uint32_t ThreadGroupIdZDimension;
|
||||
uint32_t RightExecutionMask;
|
||||
uint32_t BottomExecutionMask;
|
||||
} Common;
|
||||
uint32_t RawData[15];
|
||||
} TheStructure;
|
||||
typedef enum tagDWORD_LENGTH {
|
||||
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
|
||||
} DWORD_LENGTH;
|
||||
typedef enum tagSUBOPCODE {
|
||||
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
|
||||
} SUBOPCODE;
|
||||
typedef enum tagMEDIA_COMMAND_OPCODE {
|
||||
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
|
||||
} MEDIA_COMMAND_OPCODE;
|
||||
typedef enum tagPIPELINE {
|
||||
PIPELINE_MEDIA = 0x2,
|
||||
} PIPELINE;
|
||||
typedef enum tagCOMMAND_TYPE {
|
||||
COMMAND_TYPE_GFXPIPE = 0x3,
|
||||
} COMMAND_TYPE;
|
||||
typedef enum tagSIMD_SIZE {
|
||||
SIMD_SIZE_SIMD8 = 0x0,
|
||||
SIMD_SIZE_SIMD16 = 0x1,
|
||||
SIMD_SIZE_SIMD32 = 0x2,
|
||||
} SIMD_SIZE;
|
||||
typedef enum tagPATCH_CONSTANTS {
|
||||
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
|
||||
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
|
||||
} PATCH_CONSTANTS;
|
||||
inline void init() {
|
||||
memset(&TheStructure, 0, sizeof(TheStructure));
|
||||
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
|
||||
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
|
||||
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
|
||||
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
|
||||
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
|
||||
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
|
||||
}
|
||||
static tagGPGPU_WALKER sInit() {
|
||||
GPGPU_WALKER state;
|
||||
state.init();
|
||||
return state;
|
||||
}
|
||||
inline uint32_t &getRawData(const uint32_t index) {
|
||||
DEBUG_BREAK_IF(index >= 15);
|
||||
return TheStructure.RawData[index];
|
||||
}
|
||||
inline void setPredicateEnable(const bool value) {
|
||||
TheStructure.Common.PredicateEnable = value;
|
||||
}
|
||||
inline bool getPredicateEnable() const {
|
||||
return (TheStructure.Common.PredicateEnable);
|
||||
}
|
||||
inline void setIndirectParameterEnable(const bool value) {
|
||||
TheStructure.Common.IndirectParameterEnable = value;
|
||||
}
|
||||
inline bool getIndirectParameterEnable() const {
|
||||
return (TheStructure.Common.IndirectParameterEnable);
|
||||
}
|
||||
inline void setInterfaceDescriptorOffset(const uint32_t value) {
|
||||
TheStructure.Common.InterfaceDescriptorOffset = value;
|
||||
}
|
||||
inline uint32_t getInterfaceDescriptorOffset() const {
|
||||
return (TheStructure.Common.InterfaceDescriptorOffset);
|
||||
}
|
||||
inline void setIndirectDataLength(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataLength = value;
|
||||
}
|
||||
inline uint32_t getIndirectDataLength() const {
|
||||
return (TheStructure.Common.IndirectDataLength);
|
||||
}
|
||||
typedef enum tagINDIRECTDATASTARTADDRESS {
|
||||
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
|
||||
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
|
||||
} INDIRECTDATASTARTADDRESS;
|
||||
inline void setIndirectDataStartAddress(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline uint32_t getIndirectDataStartAddress() const {
|
||||
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
|
||||
}
|
||||
inline void setThreadWidthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadWidthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadHeightCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadHeightCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadDepthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadDepthCounterMaximum = value;
|
||||
}
|
||||
inline uint32_t getThreadDepthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadDepthCounterMaximum);
|
||||
}
|
||||
inline void setSimdSize(const SIMD_SIZE value) {
|
||||
TheStructure.Common.SimdSize = value;
|
||||
}
|
||||
inline SIMD_SIZE getSimdSize() const {
|
||||
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
|
||||
}
|
||||
inline void setThreadGroupIdStartingX(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingX = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingX() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingX);
|
||||
}
|
||||
inline void setThreadGroupIdXDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdXDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdXDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdXDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingY(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingY = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingY() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingY);
|
||||
}
|
||||
inline void setThreadGroupIdYDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdYDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdYDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdYDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingResumeZ() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
|
||||
}
|
||||
inline void setThreadGroupIdZDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdZDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdZDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdZDimension);
|
||||
}
|
||||
inline void setRightExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.RightExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getRightExecutionMask() const {
|
||||
return (TheStructure.Common.RightExecutionMask);
|
||||
}
|
||||
inline void setBottomExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.BottomExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getBottomExecutionMask() const {
|
||||
return (TheStructure.Common.BottomExecutionMask);
|
||||
}
|
||||
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
|
||||
} GPGPU_WALKER;
|
||||
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
|
||||
|
||||
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2022 Intel Corporation
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -48,195 +48,6 @@ typedef struct tagBINDING_TABLE_STATE {
|
||||
} BINDING_TABLE_STATE;
|
||||
STATIC_ASSERT(4 == sizeof(BINDING_TABLE_STATE));
|
||||
|
||||
typedef struct tagGPGPU_WALKER {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
|
||||
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
|
||||
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
|
||||
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
|
||||
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
|
||||
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
|
||||
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
|
||||
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
|
||||
uint32_t CommandType : BITFIELD_RANGE(29, 31);
|
||||
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
|
||||
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
|
||||
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
|
||||
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
|
||||
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
|
||||
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
|
||||
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
|
||||
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
|
||||
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
|
||||
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
|
||||
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
|
||||
uint32_t ThreadGroupIdStartingX;
|
||||
uint32_t Reserved_192;
|
||||
uint32_t ThreadGroupIdXDimension;
|
||||
uint32_t ThreadGroupIdStartingY;
|
||||
uint32_t Reserved_288;
|
||||
uint32_t ThreadGroupIdYDimension;
|
||||
uint32_t ThreadGroupIdStartingResumeZ;
|
||||
uint32_t ThreadGroupIdZDimension;
|
||||
uint32_t RightExecutionMask;
|
||||
uint32_t BottomExecutionMask;
|
||||
} Common;
|
||||
uint32_t RawData[15];
|
||||
} TheStructure;
|
||||
typedef enum tagDWORD_LENGTH {
|
||||
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
|
||||
} DWORD_LENGTH;
|
||||
typedef enum tagSUBOPCODE {
|
||||
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
|
||||
} SUBOPCODE;
|
||||
typedef enum tagMEDIA_COMMAND_OPCODE {
|
||||
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
|
||||
} MEDIA_COMMAND_OPCODE;
|
||||
typedef enum tagPIPELINE {
|
||||
PIPELINE_MEDIA = 0x2,
|
||||
} PIPELINE;
|
||||
typedef enum tagCOMMAND_TYPE {
|
||||
COMMAND_TYPE_GFXPIPE = 0x3,
|
||||
} COMMAND_TYPE;
|
||||
typedef enum tagSIMD_SIZE {
|
||||
SIMD_SIZE_SIMD8 = 0x0,
|
||||
SIMD_SIZE_SIMD16 = 0x1,
|
||||
SIMD_SIZE_SIMD32 = 0x2,
|
||||
} SIMD_SIZE;
|
||||
typedef enum tagPATCH_CONSTANTS {
|
||||
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
|
||||
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
|
||||
} PATCH_CONSTANTS;
|
||||
inline void init() {
|
||||
memset(&TheStructure, 0, sizeof(TheStructure));
|
||||
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
|
||||
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
|
||||
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
|
||||
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
|
||||
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
|
||||
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
|
||||
}
|
||||
static tagGPGPU_WALKER sInit() {
|
||||
GPGPU_WALKER state;
|
||||
state.init();
|
||||
return state;
|
||||
}
|
||||
inline uint32_t &getRawData(const uint32_t index) {
|
||||
DEBUG_BREAK_IF(index >= 15);
|
||||
return TheStructure.RawData[index];
|
||||
}
|
||||
inline void setPredicateEnable(const bool value) {
|
||||
TheStructure.Common.PredicateEnable = value;
|
||||
}
|
||||
inline bool getPredicateEnable() const {
|
||||
return (TheStructure.Common.PredicateEnable);
|
||||
}
|
||||
inline void setIndirectParameterEnable(const bool value) {
|
||||
TheStructure.Common.IndirectParameterEnable = value;
|
||||
}
|
||||
inline bool getIndirectParameterEnable() const {
|
||||
return (TheStructure.Common.IndirectParameterEnable);
|
||||
}
|
||||
inline void setInterfaceDescriptorOffset(const uint32_t value) {
|
||||
TheStructure.Common.InterfaceDescriptorOffset = value;
|
||||
}
|
||||
inline uint32_t getInterfaceDescriptorOffset() const {
|
||||
return (TheStructure.Common.InterfaceDescriptorOffset);
|
||||
}
|
||||
inline void setIndirectDataLength(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataLength = value;
|
||||
}
|
||||
inline uint32_t getIndirectDataLength() const {
|
||||
return (TheStructure.Common.IndirectDataLength);
|
||||
}
|
||||
typedef enum tagINDIRECTDATASTARTADDRESS {
|
||||
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
|
||||
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
|
||||
} INDIRECTDATASTARTADDRESS;
|
||||
inline void setIndirectDataStartAddress(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline uint32_t getIndirectDataStartAddress() const {
|
||||
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
|
||||
}
|
||||
inline void setThreadWidthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadWidthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadHeightCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadHeightCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadDepthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadDepthCounterMaximum = value;
|
||||
}
|
||||
inline uint32_t getThreadDepthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadDepthCounterMaximum);
|
||||
}
|
||||
inline void setSimdSize(const SIMD_SIZE value) {
|
||||
TheStructure.Common.SimdSize = value;
|
||||
}
|
||||
inline SIMD_SIZE getSimdSize() const {
|
||||
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
|
||||
}
|
||||
inline void setThreadGroupIdStartingX(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingX = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingX() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingX);
|
||||
}
|
||||
inline void setThreadGroupIdXDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdXDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdXDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdXDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingY(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingY = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingY() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingY);
|
||||
}
|
||||
inline void setThreadGroupIdYDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdYDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdYDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdYDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingResumeZ() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
|
||||
}
|
||||
inline void setThreadGroupIdZDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdZDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdZDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdZDimension);
|
||||
}
|
||||
inline void setRightExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.RightExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getRightExecutionMask() const {
|
||||
return (TheStructure.Common.RightExecutionMask);
|
||||
}
|
||||
inline void setBottomExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.BottomExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getBottomExecutionMask() const {
|
||||
return (TheStructure.Common.BottomExecutionMask);
|
||||
}
|
||||
} GPGPU_WALKER;
|
||||
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
|
||||
|
||||
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
@@ -461,11 +272,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
inline bool getGlobalBarrierEnable() const {
|
||||
return (TheStructure.Common.GlobalBarrierEnable);
|
||||
}
|
||||
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
|
||||
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
|
||||
TheStructure.Common.SharedLocalMemorySize = value;
|
||||
}
|
||||
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
|
||||
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
|
||||
inline uint32_t getSharedLocalMemorySize() const { // patched
|
||||
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
|
||||
}
|
||||
inline void setBarrierEnable(const uint32_t value) {
|
||||
TheStructure.Common.BarrierEnable = (value > 0u) ? 1u : 0u;
|
||||
@@ -488,6 +299,196 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
} INTERFACE_DESCRIPTOR_DATA;
|
||||
STATIC_ASSERT(32 == sizeof(INTERFACE_DESCRIPTOR_DATA));
|
||||
|
||||
typedef struct tagGPGPU_WALKER {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
|
||||
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
|
||||
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
|
||||
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
|
||||
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
|
||||
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
|
||||
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
|
||||
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
|
||||
uint32_t CommandType : BITFIELD_RANGE(29, 31);
|
||||
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
|
||||
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
|
||||
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
|
||||
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
|
||||
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
|
||||
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
|
||||
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
|
||||
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
|
||||
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
|
||||
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
|
||||
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
|
||||
uint32_t ThreadGroupIdStartingX;
|
||||
uint32_t Reserved_192;
|
||||
uint32_t ThreadGroupIdXDimension;
|
||||
uint32_t ThreadGroupIdStartingY;
|
||||
uint32_t Reserved_288;
|
||||
uint32_t ThreadGroupIdYDimension;
|
||||
uint32_t ThreadGroupIdStartingResumeZ;
|
||||
uint32_t ThreadGroupIdZDimension;
|
||||
uint32_t RightExecutionMask;
|
||||
uint32_t BottomExecutionMask;
|
||||
} Common;
|
||||
uint32_t RawData[15];
|
||||
} TheStructure;
|
||||
typedef enum tagDWORD_LENGTH {
|
||||
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
|
||||
} DWORD_LENGTH;
|
||||
typedef enum tagSUBOPCODE {
|
||||
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
|
||||
} SUBOPCODE;
|
||||
typedef enum tagMEDIA_COMMAND_OPCODE {
|
||||
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
|
||||
} MEDIA_COMMAND_OPCODE;
|
||||
typedef enum tagPIPELINE {
|
||||
PIPELINE_MEDIA = 0x2,
|
||||
} PIPELINE;
|
||||
typedef enum tagCOMMAND_TYPE {
|
||||
COMMAND_TYPE_GFXPIPE = 0x3,
|
||||
} COMMAND_TYPE;
|
||||
typedef enum tagSIMD_SIZE {
|
||||
SIMD_SIZE_SIMD8 = 0x0,
|
||||
SIMD_SIZE_SIMD16 = 0x1,
|
||||
SIMD_SIZE_SIMD32 = 0x2,
|
||||
} SIMD_SIZE;
|
||||
typedef enum tagPATCH_CONSTANTS {
|
||||
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
|
||||
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
|
||||
} PATCH_CONSTANTS;
|
||||
inline void init() {
|
||||
memset(&TheStructure, 0, sizeof(TheStructure));
|
||||
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
|
||||
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
|
||||
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
|
||||
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
|
||||
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
|
||||
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
|
||||
}
|
||||
static tagGPGPU_WALKER sInit() {
|
||||
GPGPU_WALKER state;
|
||||
state.init();
|
||||
return state;
|
||||
}
|
||||
inline uint32_t &getRawData(const uint32_t index) {
|
||||
DEBUG_BREAK_IF(index >= 15);
|
||||
return TheStructure.RawData[index];
|
||||
}
|
||||
inline void setPredicateEnable(const bool value) {
|
||||
TheStructure.Common.PredicateEnable = value;
|
||||
}
|
||||
inline bool getPredicateEnable() const {
|
||||
return (TheStructure.Common.PredicateEnable);
|
||||
}
|
||||
inline void setIndirectParameterEnable(const bool value) {
|
||||
TheStructure.Common.IndirectParameterEnable = value;
|
||||
}
|
||||
inline bool getIndirectParameterEnable() const {
|
||||
return (TheStructure.Common.IndirectParameterEnable);
|
||||
}
|
||||
inline void setInterfaceDescriptorOffset(const uint32_t value) {
|
||||
TheStructure.Common.InterfaceDescriptorOffset = value;
|
||||
}
|
||||
inline uint32_t getInterfaceDescriptorOffset() const {
|
||||
return (TheStructure.Common.InterfaceDescriptorOffset);
|
||||
}
|
||||
inline void setIndirectDataLength(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataLength = value;
|
||||
}
|
||||
inline uint32_t getIndirectDataLength() const {
|
||||
return (TheStructure.Common.IndirectDataLength);
|
||||
}
|
||||
typedef enum tagINDIRECTDATASTARTADDRESS {
|
||||
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
|
||||
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
|
||||
} INDIRECTDATASTARTADDRESS;
|
||||
inline void setIndirectDataStartAddress(const uint32_t value) {
|
||||
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
|
||||
}
|
||||
inline uint32_t getIndirectDataStartAddress() const {
|
||||
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
|
||||
}
|
||||
inline void setThreadWidthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadWidthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadHeightCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
|
||||
}
|
||||
inline uint32_t getThreadHeightCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
|
||||
}
|
||||
inline void setThreadDepthCounterMaximum(const uint32_t value) {
|
||||
TheStructure.Common.ThreadDepthCounterMaximum = value;
|
||||
}
|
||||
inline uint32_t getThreadDepthCounterMaximum() const {
|
||||
return (TheStructure.Common.ThreadDepthCounterMaximum);
|
||||
}
|
||||
inline void setSimdSize(const SIMD_SIZE value) {
|
||||
TheStructure.Common.SimdSize = value;
|
||||
}
|
||||
inline SIMD_SIZE getSimdSize() const {
|
||||
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
|
||||
}
|
||||
inline void setThreadGroupIdStartingX(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingX = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingX() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingX);
|
||||
}
|
||||
inline void setThreadGroupIdXDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdXDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdXDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdXDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingY(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingY = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingY() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingY);
|
||||
}
|
||||
inline void setThreadGroupIdYDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdYDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdYDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdYDimension);
|
||||
}
|
||||
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdStartingResumeZ() const {
|
||||
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
|
||||
}
|
||||
inline void setThreadGroupIdZDimension(const uint32_t value) {
|
||||
TheStructure.Common.ThreadGroupIdZDimension = value;
|
||||
}
|
||||
inline uint32_t getThreadGroupIdZDimension() const {
|
||||
return (TheStructure.Common.ThreadGroupIdZDimension);
|
||||
}
|
||||
inline void setRightExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.RightExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getRightExecutionMask() const {
|
||||
return (TheStructure.Common.RightExecutionMask);
|
||||
}
|
||||
inline void setBottomExecutionMask(const uint32_t value) {
|
||||
TheStructure.Common.BottomExecutionMask = value;
|
||||
}
|
||||
inline uint32_t getBottomExecutionMask() const {
|
||||
return (TheStructure.Common.BottomExecutionMask);
|
||||
}
|
||||
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
|
||||
} GPGPU_WALKER;
|
||||
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
|
||||
|
||||
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
|
||||
union tagTheStructure {
|
||||
struct tagCommon {
|
||||
|
||||
@@ -5382,11 +5382,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
inline uint32_t getNumberOfThreadsInGpgpuThreadGroup() const {
|
||||
return TheStructure.Common.NumberOfThreadsInGpgpuThreadGroup;
|
||||
}
|
||||
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
|
||||
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
|
||||
TheStructure.Common.SharedLocalMemorySize = value;
|
||||
}
|
||||
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
|
||||
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
|
||||
inline uint32_t getSharedLocalMemorySize() const { // patched
|
||||
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
|
||||
}
|
||||
inline void setRoundingMode(const ROUNDING_MODE value) {
|
||||
TheStructure.Common.RoundingMode = value;
|
||||
@@ -5808,6 +5808,7 @@ typedef struct tagCOMPUTE_WALKER {
|
||||
inline uint32_t *getInlineDataPointer() {
|
||||
return reinterpret_cast<uint32_t *>(&TheStructure.Common.InlineData);
|
||||
}
|
||||
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
|
||||
} COMPUTE_WALKER;
|
||||
STATIC_ASSERT(156 == sizeof(COMPUTE_WALKER));
|
||||
|
||||
|
||||
@@ -5149,11 +5149,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
|
||||
inline uint32_t getNumberOfThreadsInGpgpuThreadGroup() const {
|
||||
return TheStructure.Common.NumberOfThreadsInGpgpuThreadGroup;
|
||||
}
|
||||
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
|
||||
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
|
||||
TheStructure.Common.SharedLocalMemorySize = value;
|
||||
}
|
||||
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
|
||||
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
|
||||
inline uint32_t getSharedLocalMemorySize() const { // patched
|
||||
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
|
||||
}
|
||||
inline void setRoundingMode(const ROUNDING_MODE value) {
|
||||
TheStructure.Common.RoundingMode = value;
|
||||
@@ -5579,6 +5579,7 @@ typedef struct tagCOMPUTE_WALKER {
|
||||
inline uint32_t *getInlineDataPointer() {
|
||||
return reinterpret_cast<uint32_t *>(&TheStructure.Common.InlineData);
|
||||
}
|
||||
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
|
||||
} COMPUTE_WALKER;
|
||||
STATIC_ASSERT(156 == sizeof(COMPUTE_WALKER));
|
||||
|
||||
|
||||
@@ -16,7 +16,6 @@ enum class MemoryCompressionState;
|
||||
class GmmHelper;
|
||||
class IndirectHeap;
|
||||
class LinearStream;
|
||||
|
||||
struct DispatchFlags;
|
||||
struct HardwareInfo;
|
||||
struct StateBaseAddressProperties;
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "shared/source/gmm_helper/cache_settings_helper.h"
|
||||
#include "shared/source/gmm_helper/gmm_helper.h"
|
||||
#include "shared/source/helpers/cache_policy.h"
|
||||
#include "shared/source/helpers/compiler_product_helper.h"
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/helpers/state_base_address.h"
|
||||
#include "shared/source/indirect_heap/indirect_heap.h"
|
||||
@@ -21,6 +22,7 @@ namespace NEO {
|
||||
template <typename GfxFamily>
|
||||
void StateBaseAddressHelper<GfxFamily>::programStateBaseAddressIntoCommandStream(StateBaseAddressHelperArgs<GfxFamily> &args, NEO::LinearStream &commandStream) {
|
||||
StateBaseAddressHelper<GfxFamily>::programStateBaseAddress(args);
|
||||
|
||||
auto cmdSpace = StateBaseAddressHelper<GfxFamily>::getSpaceForSbaCmd(commandStream);
|
||||
*cmdSpace = *args.stateBaseAddressCmd;
|
||||
|
||||
|
||||
@@ -86,7 +86,6 @@ class ProductHelper {
|
||||
virtual uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const = 0;
|
||||
virtual void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
|
||||
virtual void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
|
||||
virtual void updateIddCommand(void *const commandPtr, uint32_t numGrf, int32_t threadArbitrationPolicy) const = 0;
|
||||
virtual bool obtainBlitterPreference(const HardwareInfo &hwInfo) const = 0;
|
||||
virtual bool isBlitterFullySupported(const HardwareInfo &hwInfo) const = 0;
|
||||
virtual bool isPageTableManagerSupported(const HardwareInfo &hwInfo) const = 0;
|
||||
|
||||
@@ -203,9 +203,6 @@ void ProductHelperHw<gfxProduct>::setForceNonCoherent(void *const commandPtr, co
|
||||
template <PRODUCT_FAMILY gfxProduct>
|
||||
void ProductHelperHw<gfxProduct>::updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const {}
|
||||
|
||||
template <PRODUCT_FAMILY gfxProduct>
|
||||
void ProductHelperHw<gfxProduct>::updateIddCommand(void *const commandPtr, uint32_t numGrf, int32_t threadArbitrationPolicy) const {}
|
||||
|
||||
template <PRODUCT_FAMILY gfxProduct>
|
||||
bool ProductHelperHw<gfxProduct>::isPageTableManagerSupported(const HardwareInfo &hwInfo) const {
|
||||
return false;
|
||||
|
||||
@@ -38,7 +38,6 @@ class ProductHelperHw : public ProductHelper {
|
||||
uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const override;
|
||||
void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const override;
|
||||
void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const override;
|
||||
void updateIddCommand(void *const commandPtr, uint32_t numGrf, int32_t threadArbitrationPolicy) const override;
|
||||
bool obtainBlitterPreference(const HardwareInfo &hwInfo) const override;
|
||||
bool isBlitterFullySupported(const HardwareInfo &hwInfo) const override;
|
||||
bool isPageTableManagerSupported(const HardwareInfo &hwInfo) const override;
|
||||
|
||||
@@ -27,12 +27,14 @@ using Family = NEO::XeHpcCoreFamily;
|
||||
namespace NEO {
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo) {
|
||||
walkerCmd.getPostSync().setDataportSubsliceCacheFlush(true);
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd) {
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
|
||||
const auto &productHelper = device.getProductHelper();
|
||||
|
||||
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
@@ -255,6 +257,7 @@ inline void EncodeMiFlushDW<Family>::adjust(MI_FLUSH_DW *miFlushDwCmd, const Pro
|
||||
miFlushDwCmd->setFlushLlc(1);
|
||||
}
|
||||
|
||||
template <>
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor,
|
||||
uint32_t value,
|
||||
@@ -273,7 +276,8 @@ void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
|
||||
const auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
|
||||
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
||||
auto programGlobalFenceAsPostSyncOperationInComputeWalker = productHelper.isGlobalFenceInCommandStreamRequired(hwInfo) &&
|
||||
@@ -297,8 +301,9 @@ void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDevice
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
|
||||
using PREFERRED_SLM_ALLOCATION_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
|
||||
using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;
|
||||
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
||||
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
|
||||
const uint32_t workGroupCountPerDss = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));
|
||||
@@ -367,6 +372,13 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
|
||||
}
|
||||
|
||||
template struct EncodeDispatchKernel<Family>;
|
||||
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
|
||||
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
|
||||
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
|
||||
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
|
||||
template struct EncodeStates<Family>;
|
||||
template struct EncodeMath<Family>;
|
||||
template struct EncodeMathMMIO<Family>;
|
||||
|
||||
@@ -133,6 +133,26 @@ struct XeHpcCoreFamily : public XeHpcCore {
|
||||
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
|
||||
return cmdSetBaseFamily == IGFX_XE_HP_CORE;
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static WalkerType getInitGpuWalker() {
|
||||
return cmdInitGpgpuWalker;
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static constexpr size_t getInterfaceDescriptorSize() {
|
||||
return sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
}
|
||||
|
||||
template <typename InterfaceDescriptorType>
|
||||
static InterfaceDescriptorType getInitInterfaceDescriptor() {
|
||||
return cmdInitInterfaceDescriptorData;
|
||||
}
|
||||
|
||||
template <typename WalkerType>
|
||||
static constexpr bool isHeaplessMode() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -30,4 +30,7 @@ bool ImplicitScalingDispatch<Family>::platformSupportsImplicitScaling(const Root
|
||||
}
|
||||
|
||||
template struct ImplicitScalingDispatch<Family>;
|
||||
template void ImplicitScalingDispatch<Family>::dispatchCommands<Family::WALKER_TYPE>(LinearStream &commandStream, Family::WALKER_TYPE &walkerCmd, void **outWalkerPtr, const DeviceBitfield &devices, uint32_t &partitionCount, bool useSecondaryBatchBuffer, bool apiSelfCleanup, bool usesImages, bool dcFlush, bool forceExecutionOnSingleTile, uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo);
|
||||
template size_t ImplicitScalingDispatch<Family>::getSize<Family::WALKER_TYPE>(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount);
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -26,15 +26,17 @@ using Family = NEO::XeHpgCoreFamily;
|
||||
namespace NEO {
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo) {
|
||||
auto &postSyncData = walkerCmd.getPostSync();
|
||||
|
||||
postSyncData.setDataportSubsliceCacheFlush(true);
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
|
||||
using PREFERRED_SLM_ALLOCATION_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
|
||||
using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;
|
||||
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
||||
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
|
||||
const uint32_t workGroupCountPerDss = threadsPerDssCount / threadsPerThreadGroup;
|
||||
@@ -92,7 +94,8 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTO
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd) {
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
|
||||
const auto &productHelper = device.getProductHelper();
|
||||
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
if (interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup() == 1) {
|
||||
@@ -108,6 +111,7 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo) {
|
||||
using BARRIERS = INTERFACE_DESCRIPTOR_DATA::NUMBER_OF_BARRIERS;
|
||||
@@ -118,7 +122,8 @@ void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
|
||||
auto *releaseHelper = rootDeviceEnvironment.getReleaseHelper();
|
||||
bool l3PrefetchDisable = releaseHelper->isPrefetchDisablingRequired();
|
||||
int32_t overrideL3PrefetchDisable = DebugManager.flags.ForceL3PrefetchForComputeWalker.get();
|
||||
@@ -217,6 +222,13 @@ size_t EncodeMiFlushDW<Family>::getWaSize(const EncodeDummyBlitWaArgs &waArgs) {
|
||||
template void flushGpuCache<Family>(LinearStream *commandStream, const Range<L3Range> &ranges, uint64_t postSyncAddress, const HardwareInfo &hwInfo);
|
||||
|
||||
template struct EncodeDispatchKernel<Family>;
|
||||
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
|
||||
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
|
||||
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
|
||||
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
|
||||
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
|
||||
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
|
||||
|
||||
template struct EncodeStates<Family>;
|
||||
template struct EncodeMath<Family>;
|
||||
template struct EncodeMathMMIO<Family>;
|
||||
|
||||
@@ -149,6 +149,26 @@ struct XeHpgCoreFamily : public XeHpgCore {
|
||||
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
|
||||
return cmdSetBaseFamily == IGFX_XE_HP_CORE;
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static WalkerType getInitGpuWalker() {
|
||||
return cmdInitGpgpuWalker;
|
||||
}
|
||||
|
||||
template <typename WalkerType = WALKER_TYPE>
|
||||
static constexpr size_t getInterfaceDescriptorSize() {
|
||||
return sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
}
|
||||
|
||||
template <typename InterfaceDescriptorType>
|
||||
static InterfaceDescriptorType getInitInterfaceDescriptor() {
|
||||
return cmdInitInterfaceDescriptorData;
|
||||
}
|
||||
|
||||
template <typename WalkerType>
|
||||
static constexpr bool isHeaplessMode() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2021-2022 Intel Corporation
|
||||
* Copyright (C) 2021-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -17,4 +17,7 @@ template <>
|
||||
bool ImplicitScalingDispatch<Family>::pipeControlStallRequired = true;
|
||||
|
||||
template struct ImplicitScalingDispatch<Family>;
|
||||
template void ImplicitScalingDispatch<Family>::dispatchCommands<Family::WALKER_TYPE>(LinearStream &commandStream, Family::WALKER_TYPE &walkerCmd, void **outWalkerPtr, const DeviceBitfield &devices, uint32_t &partitionCount, bool useSecondaryBatchBuffer, bool apiSelfCleanup, bool usesImages, bool dcFlush, bool forceExecutionOnSingleTile, uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo);
|
||||
template size_t ImplicitScalingDispatch<Family>::getSize<Family::WALKER_TYPE>(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount);
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -319,10 +319,6 @@ template <>
|
||||
void ProductHelperHw<IGFX_UNKNOWN>::updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const {
|
||||
}
|
||||
|
||||
template <>
|
||||
void ProductHelperHw<IGFX_UNKNOWN>::updateIddCommand(void *const commandPtr, uint32_t numGrf, int32_t threadArbitrationPolicy) const {
|
||||
}
|
||||
|
||||
template <>
|
||||
void ProductHelperHw<IGFX_UNKNOWN>::enableCompression(HardwareInfo *hwInfo) const {
|
||||
}
|
||||
|
||||
@@ -33,9 +33,9 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenOverrideSlmTotalSizeDebugVari
|
||||
|
||||
bool requiresUncachedMocs = false;
|
||||
|
||||
int32_t maxValueToProgram = 0xC;
|
||||
uint32_t maxValueToProgram = 0xC;
|
||||
|
||||
for (int32_t valueToProgram = 0x0; valueToProgram < maxValueToProgram; valueToProgram++) {
|
||||
for (uint32_t valueToProgram = 0x0; valueToProgram < maxValueToProgram; valueToProgram++) {
|
||||
DebugManager.flags.OverrideSlmAllocationSize.set(valueToProgram);
|
||||
cmdContainer->reset();
|
||||
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
|
||||
|
||||
@@ -176,9 +176,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenOverrideSlmTotalSizeD
|
||||
dispatchInterface->getSlmTotalSizeResult = slmTotalSize;
|
||||
|
||||
bool requiresUncachedMocs = false;
|
||||
int32_t maxValueToProgram = 0x8;
|
||||
uint32_t maxValueToProgram = 0x8;
|
||||
|
||||
for (int32_t valueToProgram = 0x0; valueToProgram < maxValueToProgram; valueToProgram++) {
|
||||
for (uint32_t valueToProgram = 0x0; valueToProgram < maxValueToProgram; valueToProgram++) {
|
||||
DebugManager.flags.OverrideSlmAllocationSize.set(valueToProgram);
|
||||
cmdContainer->reset();
|
||||
|
||||
@@ -1164,7 +1164,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
|
||||
size_t total = cmdContainer->getCommandStream()->getUsed();
|
||||
size_t partitionedWalkerSize = total - containerUsedAfterBase;
|
||||
|
||||
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
|
||||
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
|
||||
EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
|
||||
|
||||
GenCmdList partitionedWalkerList;
|
||||
@@ -1215,7 +1215,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
|
||||
EXPECT_EQ(2u, dispatchArgs.partitionCount);
|
||||
size_t partitionedWalkerSize = cmdContainer->getCommandStream()->getUsed();
|
||||
|
||||
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
|
||||
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
|
||||
EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
|
||||
|
||||
GenCmdList partitionedWalkerList;
|
||||
@@ -1314,7 +1314,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling,
|
||||
EXPECT_EQ(2u, dispatchArgs.partitionCount);
|
||||
size_t partitionedWalkerSize = cmdContainer->getCommandStream()->getUsed();
|
||||
|
||||
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
|
||||
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
|
||||
EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
|
||||
|
||||
GenCmdList partitionedWalkerList;
|
||||
|
||||
@@ -33,7 +33,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenGetSizeWhenDispatchingCm
|
||||
size_t expectedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
|
||||
@@ -76,7 +76,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndNoPartiti
|
||||
size_t expectedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, false, false, false, dcFlushFlag,
|
||||
@@ -120,7 +120,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndPartition
|
||||
size_t expectedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
|
||||
@@ -167,7 +167,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenDi
|
||||
size_t expectedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
|
||||
@@ -219,7 +219,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenPa
|
||||
size_t expectedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
|
||||
@@ -273,7 +273,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
|
||||
size_t expectedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
|
||||
@@ -324,7 +324,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
|
||||
size_t expectedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
|
||||
@@ -361,7 +361,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
|
||||
size_t expectedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
|
||||
@@ -398,7 +398,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPrefe
|
||||
size_t expectedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
|
||||
@@ -445,7 +445,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
|
||||
size_t estimatedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
EXPECT_EQ(expectedSize, estimatedSize);
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
@@ -513,7 +513,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
|
||||
size_t estimatedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
EXPECT_EQ(expectedSize, estimatedSize);
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
@@ -573,7 +573,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
|
||||
size_t estimatedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
EXPECT_EQ(expectedSize, estimatedSize);
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
@@ -633,7 +633,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
|
||||
size_t estimatedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
EXPECT_EQ(expectedSize, estimatedSize);
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
@@ -700,7 +700,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
|
||||
size_t estimatedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
EXPECT_EQ(expectedSize, estimatedSize);
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
@@ -763,7 +763,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
|
||||
size_t estimatedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
EXPECT_EQ(expectedSize, estimatedSize);
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
@@ -828,7 +828,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
|
||||
size_t estimatedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
EXPECT_EQ(expectedSize, estimatedSize);
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
@@ -896,7 +896,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
|
||||
size_t estimatedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
EXPECT_EQ(expectedSize, estimatedSize);
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
@@ -963,7 +963,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
|
||||
size_t estimatedSize = 0;
|
||||
size_t totalBytesProgrammed = 0;
|
||||
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
|
||||
EXPECT_EQ(expectedSize, estimatedSize);
|
||||
|
||||
uint32_t partitionCount = 0;
|
||||
|
||||
@@ -53,7 +53,9 @@ class CommandEncodeStatesFixture : public DeviceFixture {
|
||||
false, // useGlobalAtomics
|
||||
false, // multiOsContextCapable
|
||||
false, // isRcs
|
||||
container->doubleSbaWaRef()}; // doubleSbaWa
|
||||
container->doubleSbaWaRef(), // doubleSbaWa
|
||||
false // heaplessModeEnabled
|
||||
};
|
||||
return args;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user