feature: Add heapless mode programming in ocl

Related-To: NEO-7621
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk
2023-11-23 13:58:58 +00:00
committed by Compute-Runtime-Automation
parent c35b13ccae
commit ce7298d512
82 changed files with 1927 additions and 1224 deletions

View File

@@ -17,6 +17,7 @@
#include "shared/source/debugger/debugger_l0.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/memory_manager/allocation_properties.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/os_interface/os_context.h"
@@ -98,6 +99,8 @@ ze_result_t CommandQueueImp::initialize(bool copyOnly, bool isInternal, bool imm
this->doubleSbaWa = productHelper.isAdditionalStateBaseAddressWARequired(hwInfo);
this->cmdListHeapAddressModel = L0GfxCoreHelper::getHeapAddressModel(rootDeviceEnvironment);
this->dispatchCmdListBatchBufferAsPrimary = L0GfxCoreHelper::dispatchCmdListBatchBufferAsPrimary(rootDeviceEnvironment, !immediateCmdListQueue);
auto &compilerProductHelper = rootDeviceEnvironment.getHelper<NEO::CompilerProductHelper>();
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
}
return returnValue;
}

View File

@@ -87,6 +87,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
bool doubleSbaWa = false;
bool dispatchCmdListBatchBufferAsPrimary = false;
bool internalQueueForImmediateCommandList = false;
bool heaplessModeEnabled = false;
};
using CommandQueueAllocatorFn = CommandQueue *(*)(Device *device, NEO::CommandStreamReceiver *csr,

View File

@@ -17,6 +17,7 @@
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/bit_helpers.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/get_info.h"
@@ -102,6 +103,7 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
auto &hwInfo = device->getHardwareInfo();
auto &gfxCoreHelper = device->getGfxCoreHelper();
auto &productHelper = device->getProductHelper();
auto &compilerProductHelper = device->getCompilerProductHelper();
auto &rootDeviceEnvironment = device->getRootDeviceEnvironment();
bcsAllowed = productHelper.isBlitterFullySupported(hwInfo) &&
@@ -127,6 +129,8 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
if (NEO::Debugger::isDebugEnabled(internalUsage) && device->getDevice().getL0Debugger()) {
device->getDevice().getL0Debugger()->notifyCommandQueueCreated(&device->getDevice());
}
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
}
}

View File

@@ -384,6 +384,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
void handlePostCompletionOperations(bool checkQueueCompletion);
bool getHeaplessModeEnabled() const { return this->heaplessModeEnabled; }
protected:
void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest);
@@ -477,6 +479,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool dcFlushRequiredOnStallingCommandsOnNextFlush = false;
bool splitBarrierRequired = false;
bool gpgpuCsrClientRegistered = false;
bool heaplessModeEnabled = false;
};
template <typename PtrType>

View File

@@ -552,11 +552,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
dispatchWalkerArgs.event = event;
dispatchWalkerArgs.relaxedOrderingEnabled = relaxedOrderingEnabled;
HardwareInterface<GfxFamily>::dispatchWalker(
*this,
multiDispatchInfo,
csrDeps,
dispatchWalkerArgs);
HardwareInterface<GfxFamily>::dispatchWalkerCommon(*this, multiDispatchInfo, csrDeps, dispatchWalkerArgs);
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
for (auto &dispatchInfo : multiDispatchInfo) {

View File

@@ -35,8 +35,9 @@ class GpgpuWalkerHelper {
static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
static size_t getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel);
template <typename WalkerType>
static size_t setGpgpuWalkerThreadData(
WALKER_TYPE *walkerCmd,
WalkerType *walkerCmd,
const KernelDescriptor &kernelDescriptor,
const size_t globalOffsets[3],
const size_t startWorkGroups[3],
@@ -67,9 +68,10 @@ class GpgpuWalkerHelper {
TagNodeBase &hwPerfCounter,
LinearStream *commandStream);
template <typename WalkerType>
static void setupTimestampPacket(
LinearStream *cmdStream,
WALKER_TYPE *walkerCmd,
WalkerType *walkerCmd,
TagNodeBase *timestampPacketNode,
const RootDeviceEnvironment &rootDeviceEnvironment);
@@ -94,6 +96,7 @@ struct EnqueueOperation {
static size_t getSizeForCacheFlushAfterWalkerCommands(const Kernel &kernel, const CommandQueue &commandQueue);
private:
template <typename WalkerType>
static size_t getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo);
static size_t getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue);
};

View File

@@ -247,7 +247,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCS(uint32_t cmdType, bool res
if (isCommandWithoutKernel(cmdType)) {
return EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
} else {
return EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel, dispatchInfo);
return EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel<typename GfxFamily::WALKER_TYPE>(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel, dispatchInfo);
}
}

View File

@@ -17,8 +17,9 @@
namespace NEO {
template <typename GfxFamily>
template <typename WalkerType>
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
WALKER_TYPE *walkerCmd,
WalkerType *walkerCmd,
const KernelDescriptor &kernelDescriptor,
const size_t globalOffsets[3],
const size_t startWorkGroups[3],
@@ -58,9 +59,10 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
}
template <typename GfxFamily>
template <typename WalkerType>
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
LinearStream *cmdStream,
WALKER_TYPE *walkerCmd,
WalkerType *walkerCmd,
TagNodeBase *timestampPacketNode,
const RootDeviceEnvironment &rootDeviceEnvironment) {
@@ -78,6 +80,7 @@ void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
}
template <typename GfxFamily>
template <typename WalkerType>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo) {
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS() +
sizeof(PIPE_CONTROL) * (MemorySynchronizationCommands<GfxFamily>::isBarrierWaRequired(commandQueue.getDevice().getRootDeviceEnvironment()) ? 2 : 1);

View File

@@ -21,8 +21,9 @@
namespace NEO {
template <typename GfxFamily>
template <typename WalkerType>
size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
WALKER_TYPE *walkerCmd,
WalkerType *walkerCmd,
const KernelDescriptor &kernelDescriptor,
const size_t globalOffsets[3],
const size_t startWorkGroups[3],
@@ -50,7 +51,7 @@ size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
}
walkerCmd->setExecutionMask(static_cast<uint32_t>(executionMask));
walkerCmd->setSimdSize(getSimdConfig<WALKER_TYPE>(simd));
walkerCmd->setSimdSize(getSimdConfig<WalkerType>(simd));
walkerCmd->setMessageSimd(walkerCmd->getSimdSize());
if (DebugManager.flags.ForceSimdMessageSizeInWalker.get() != -1) {
@@ -64,6 +65,7 @@ size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
// 1) cross-thread inline data will be put into R1, but if kernel uses local ids, then cross-thread should be put further back
// so whenever local ids are driver or hw generated, reserve space by setting right values for emitLocalIds
// 2) Auto-generation of local ids should be possible, when in fact local ids are used
if (!localIdsGenerationByRuntime && kernelUsesLocalIds) {
uint32_t emitLocalIdsForDim = 0;
if (kernelDescriptor.kernelAttributes.localId[0]) {
@@ -77,6 +79,7 @@ size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
}
walkerCmd->setEmitLocalId(emitLocalIdsForDim);
}
if (inlineDataProgrammingRequired == true) {
walkerCmd->setEmitInlineParameter(1);
}
@@ -94,20 +97,20 @@ size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
}
template <typename GfxFamily>
template <typename WalkerType>
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(LinearStream *cmdStream,
WALKER_TYPE *walkerCmd,
WalkerType *walkerCmd,
TagNodeBase *timestampPacketNode,
const RootDeviceEnvironment &rootDeviceEnvironment) {
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
const auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
auto &postSyncData = walkerCmd->getPostSync();
postSyncData.setDataportPipelineFlush(true);
EncodeDispatchKernel<GfxFamily>::setupPostSyncMocs(*walkerCmd, rootDeviceEnvironment,
MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, rootDeviceEnvironment));
EncodeDispatchKernel<GfxFamily>::template setupPostSyncMocs<WalkerType>(*walkerCmd, rootDeviceEnvironment,
MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, rootDeviceEnvironment));
EncodeDispatchKernel<GfxFamily>::adjustTimestampPacket(*walkerCmd, hwInfo);
EncodeDispatchKernel<GfxFamily>::template adjustTimestampPacket<WalkerType>(*walkerCmd, hwInfo);
if (DebugManager.flags.UseImmDataWriteModeOnPostSyncOperation.get()) {
postSyncData.setOperation(GfxFamily::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_IMMEDIATE_DATA);
@@ -119,8 +122,11 @@ void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(LinearStream *cmdStream,
auto contextStartAddress = TimestampPacketHelper::getContextStartGpuAddress(*timestampPacketNode);
postSyncData.setDestinationAddress(contextStartAddress);
}
if (DebugManager.flags.OverrideSystolicInComputeWalker.get() != -1) {
walkerCmd->setSystolicModeEnable((DebugManager.flags.OverrideSystolicInComputeWalker.get()));
if constexpr (std::is_same_v<WalkerType, typename GfxFamily::COMPUTE_WALKER>) {
if (DebugManager.flags.OverrideSystolicInComputeWalker.get() != -1) {
walkerCmd->setSystolicModeEnable((DebugManager.flags.OverrideSystolicInComputeWalker.get()));
}
}
}
@@ -130,10 +136,11 @@ void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxF
}
template <typename GfxFamily>
template <typename WalkerType>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo) {
size_t numBarriers = MemorySynchronizationCommands<GfxFamily>::isBarrierWaRequired(commandQueue.getDevice().getRootDeviceEnvironment()) ? 2 : 1;
size_t size = sizeof(typename GfxFamily::COMPUTE_WALKER) +
size_t size = sizeof(WalkerType) +
(MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false) * numBarriers) +
HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS() +
EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(pKernel->getKernelInfo().heapInfo.kernelHeapSize, commandQueue.getDevice().getRootDeviceEnvironment());
@@ -144,7 +151,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilin
Vec3<size_t> groupCount = dispatchInfo.getNumberOfWorkgroups();
UNRECOVERABLE_IF(groupCount.x == 0);
const bool staticPartitioning = commandQueue.getGpgpuCommandStreamReceiver().isStaticWorkPartitioningEnabled();
size += static_cast<size_t>(ImplicitScalingDispatch<GfxFamily>::getSize(false, staticPartitioning, devices, groupStart, groupCount));
size += static_cast<size_t>(ImplicitScalingDispatch<GfxFamily>::template getSize<WalkerType>(false, staticPartitioning, devices, groupStart, groupCount));
}
size += PerformanceCounters::getGpuCommandsSize(commandQueue.getPerfCounters(), commandQueue.getGpgpuEngine().osContext->getEngineType(), reservePerfCounters);

View File

@@ -55,12 +55,19 @@ class HardwareInterface {
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
template <typename WalkerType>
static void dispatchWalker(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
const CsrDependencies &csrDependencies,
HardwareInterfaceWalkerArgs &walkerArgs);
// Walker dispatch entry point that does not require the caller to name a
// walker type. It takes the same arguments as dispatchWalker; the inline
// definition forwards them to dispatchWalker<typename GfxFamily::WALKER_TYPE>.
static void dispatchWalkerCommon(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
const CsrDependencies &csrDependencies,
HardwareInterfaceWalkerArgs &walkerArgs);
static void getDefaultDshSpace(
const size_t &offsetInterfaceDescriptorTable,
CommandQueue &commandQueue,
@@ -94,6 +101,7 @@ class HardwareInterface {
DebugPauseState waitCondition,
const HardwareInfo &hwInfo);
template <typename WalkerType>
static void programWalker(
LinearStream &commandStream,
Kernel &kernel,
@@ -104,12 +112,14 @@ class HardwareInterface {
const DispatchInfo &dispatchInfo,
HardwareInterfaceWalkerArgs &walkerArgs);
static WALKER_TYPE *allocateWalkerSpace(LinearStream &commandStream,
const Kernel &kernel);
template <typename WalkerType>
static WalkerType *allocateWalkerSpace(LinearStream &commandStream,
const Kernel &kernel);
static void obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh);
template <typename WalkerType>
static void dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh,
HardwareInterfaceWalkerArgs &walkerArgs);

View File

@@ -23,11 +23,18 @@
namespace NEO {
template <typename GfxFamily>
inline typename GfxFamily::WALKER_TYPE *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream, const Kernel &kernel) {
auto walkerCmd = commandStream.getSpaceForCmd<WALKER_TYPE>();
template <typename WalkerType>
inline WalkerType *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream, const Kernel &kernel) {
auto walkerCmd = commandStream.getSpaceForCmd<WalkerType>();
return walkerCmd;
}
// Non-templated front end for walker dispatch: selects the family's default
// WALKER_TYPE and hands all arguments, unchanged, to dispatchWalker.
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchWalkerCommon(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs) {
    using DefaultWalkerType = typename GfxFamily::WALKER_TYPE;
    dispatchWalker<DefaultWalkerType>(commandQueue, multiDispatchInfo, csrDependencies, walkerArgs);
}
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
TagNodeBase *hwTimeStamps,
@@ -61,6 +68,7 @@ inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
}
template <typename GfxFamily>
template <typename WalkerType>
void HardwareInterface<GfxFamily>::dispatchWalker(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
@@ -111,7 +119,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
walkerArgs.interfaceDescriptorIndex = 0;
walkerArgs.offsetInterfaceDescriptorTable = dsh->getUsed();
size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
size_t totalInterfaceDescriptorTableSize = GfxFamily::template getInterfaceDescriptorSize<WalkerType>();
getDefaultDshSpace(walkerArgs.offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, dsh, commandStream);
@@ -143,7 +151,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment());
walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel);
dispatchKernelCommands(commandQueue, dispatchInfo, *commandStream, *dsh, *ioh, *ssh, walkerArgs);
dispatchKernelCommands<WalkerType>(commandQueue, dispatchInfo, *commandStream, *dsh, *ioh, *ssh, walkerArgs);
walkerArgs.currentDispatchIndex++;
dispatchInfo.dispatchEpilogueCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment());
@@ -164,6 +172,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
}
template <typename GfxFamily>
template <typename WalkerType>
void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh,
HardwareInterfaceWalkerArgs &walkerArgs) {
@@ -223,7 +232,7 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
dispatchWorkarounds(&commandStream, commandQueue, kernel, true);
programWalker(commandStream, kernel, commandQueue, dsh, ioh, ssh, dispatchInfo, walkerArgs);
programWalker<WalkerType>(commandStream, kernel, commandQueue, dsh, ioh, ssh, dispatchInfo, walkerArgs);
dispatchWorkarounds(&commandStream, commandQueue, kernel, false);
}

View File

@@ -47,6 +47,7 @@ inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
}
template <typename GfxFamily>
template <typename WalkerType>
inline void HardwareInterface<GfxFamily>::programWalker(
LinearStream &commandStream,
Kernel &kernel,
@@ -57,8 +58,8 @@ inline void HardwareInterface<GfxFamily>::programWalker(
const DispatchInfo &dispatchInfo,
HardwareInterfaceWalkerArgs &walkerArgs) {
auto walkerCmdBuf = allocateWalkerSpace(commandStream, kernel);
WALKER_TYPE walkerCmd = GfxFamily::cmdInitGpgpuWalker;
auto walkerCmdBuf = allocateWalkerSpace<WalkerType>(commandStream, kernel);
WalkerType walkerCmd = GfxFamily::cmdInitGpgpuWalker;
uint32_t dim = dispatchInfo.getDim();
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
auto &rootDeviceEnvironment = commandQueue.getDevice().getRootDeviceEnvironment();
@@ -82,7 +83,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
false, false, 0u);
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
HardwareCommandsHelper<GfxFamily>::template sendIndirectState<WalkerType, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -98,6 +99,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
&walkerCmd,
nullptr,
kernelUsesLocalIds,
0,
commandQueue.getDevice());
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), false, kernel.getKernelInfo().kernelDescriptor};

View File

@@ -8,7 +8,9 @@
#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/os_interface.h"
@@ -37,6 +39,7 @@ inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
}
template <typename GfxFamily>
template <typename WalkerType>
inline void HardwareInterface<GfxFamily>::programWalker(
LinearStream &commandStream,
Kernel &kernel,
@@ -47,9 +50,9 @@ inline void HardwareInterface<GfxFamily>::programWalker(
const DispatchInfo &dispatchInfo,
HardwareInterfaceWalkerArgs &walkerArgs) {
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
using InterfaceDescriptorType = typename WalkerType::InterfaceDescriptorType;
WalkerType walkerCmd = GfxFamily::template getInitGpuWalker<WalkerType>();
COMPUTE_WALKER walkerCmd = GfxFamily::cmdInitGpgpuWalker;
auto &kernelInfo = kernel.getKernelInfo();
uint32_t dim = dispatchInfo.getDim();
@@ -75,7 +78,6 @@ inline void HardwareInterface<GfxFamily>::programWalker(
simd);
bool inlineDataProgrammingRequired = EncodeDispatchKernel<GfxFamily>::inlineDataProgrammingRequired(kernel.getKernelInfo().kernelDescriptor);
auto idd = &walkerCmd.getInterfaceDescriptor();
auto &queueCsr = commandQueue.getGpgpuCommandStreamReceiver();
auto &rootDeviceEnvironment = commandQueue.getDevice().getRootDeviceEnvironment();
@@ -86,27 +88,40 @@ inline void HardwareInterface<GfxFamily>::programWalker(
}
if (timestampPacketNode) {
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacketNode, rootDeviceEnvironment);
GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacket<WalkerType>(&commandStream, &walkerCmd, timestampPacketNode, rootDeviceEnvironment);
}
auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {
EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.kernelHeapSize, 0, rootDeviceEnvironment);
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
if constexpr (heaplessModeEnabled == false) {
if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {
EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.kernelHeapSize, 0, rootDeviceEnvironment);
}
}
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);
GpgpuWalkerHelper<GfxFamily>::template setGpgpuWalkerThreadData<WalkerType>(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
auto interfaceDescriptor = &walkerCmd.getInterfaceDescriptor();
uint64_t scratchAddress = 0;
if constexpr (heaplessModeEnabled) {
auto scratchAllocation = queueCsr.getScratchAllocation();
if (scratchAllocation) {
scratchAddress = scratchAllocation->getGpuAddress();
}
}
HardwareCommandsHelper<GfxFamily>::template sendIndirectState<WalkerType, InterfaceDescriptorType>(
commandStream,
dsh,
ioh,
ssh,
kernel,
kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, false),
kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, heaplessModeEnabled),
simd,
walkerArgs.localWorkSizes,
threadGroupCount,
@@ -114,8 +129,9 @@ inline void HardwareInterface<GfxFamily>::programWalker(
walkerArgs.interfaceDescriptorIndex,
walkerArgs.preemptionMode,
&walkerCmd,
idd,
interfaceDescriptor,
localIdsGenerationByRuntime,
scratchAddress,
commandQueue.getDevice());
bool kernelSystemAllocation = false;
@@ -126,7 +142,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
}
bool requiredSystemFence = kernelSystemAllocation && walkerArgs.event != nullptr;
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), requiredSystemFence, kernelInfo.kernelDescriptor};
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, encodeWalkerArgs);
EncodeDispatchKernel<GfxFamily>::template encodeAdditionalWalkerFields<WalkerType>(rootDeviceEnvironment, walkerCmd, encodeWalkerArgs);
auto devices = queueCsr.getOsContext().getDeviceBitfield();
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
@@ -139,18 +155,18 @@ inline void HardwareInterface<GfxFamily>::programWalker(
if (partitionWalker) {
const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
uint32_t partitionCount = 0u;
ImplicitScalingDispatch<GfxFamily>::dispatchCommands(commandStream,
walkerCmd,
nullptr,
devices,
partitionCount,
false,
false,
kernel.usesImages(),
queueCsr.getDcFlushSupport(),
kernel.isSingleSubdevicePreferred(),
workPartitionAllocationGpuVa,
hwInfo);
ImplicitScalingDispatch<GfxFamily>::template dispatchCommands<WalkerType>(commandStream,
walkerCmd,
nullptr,
devices,
partitionCount,
false,
false,
kernel.usesImages(),
queueCsr.getDcFlushSupport(),
kernel.isSingleSubdevicePreferred(),
workPartitionAllocationGpuVa,
hwInfo);
if (queueCsr.isStaticWorkPartitioningEnabled()) {
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
}
@@ -159,7 +175,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
timestampPacketNode->setPacketsUsed(partitionCount);
}
} else {
auto computeWalkerOnStream = commandStream.getSpaceForCmd<typename GfxFamily::COMPUTE_WALKER>();
auto computeWalkerOnStream = commandStream.getSpaceForCmd<WalkerType>();
*computeWalkerOnStream = walkerCmd;
}
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2022 Intel Corporation
* Copyright (C) 2019-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -13,10 +13,20 @@
namespace NEO {
template class HardwareInterface<Gen11Family>;
using Family = Gen11Family;
template class GpgpuWalkerHelper<Gen11Family>;
template class HardwareInterface<Family>;
template struct EnqueueOperation<Gen11Family>;
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
template struct EnqueueOperation<Family>;
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2022 Intel Corporation
* Copyright (C) 2019-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -16,4 +16,53 @@ namespace NEO {
using FamilyType = Gen11Family;
// Explicit instantiation of the HardwareCommandsHelper class template for FamilyType.
template struct HardwareCommandsHelper<FamilyType>;
// The member function templates are additionally parameterised on the walker
// type (and, for some, the interface descriptor type), so each one is
// explicitly instantiated below for FamilyType::WALKER_TYPE and
// FamilyType::INTERFACE_DESCRIPTOR_DATA.

// sendIndirectState<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
LinearStream &commandStream,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
Kernel &kernel,
uint64_t kernelStartOffset,
uint32_t simd,
const size_t localWorkSize[3],
const uint32_t threadGroupCount,
const uint64_t offsetInterfaceDescriptorTable,
uint32_t &interfaceDescriptorIndex,
PreemptionMode preemptionMode,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime,
uint64_t scratchAddress,
const Device &device);
// sendCrossThreadData<WALKER_TYPE>
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
IndirectHeap &indirectHeap,
Kernel &kernel,
bool inlineDataProgrammingRequired,
FamilyType::WALKER_TYPE *walkerCmd,
uint32_t &sizeCrossThreadData,
uint64_t scratchAddress);
// sendInterfaceDescriptorData<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor,
uint64_t kernelStartOffset,
size_t sizeCrossThreadData,
size_t sizePerThreadData,
size_t bindingTablePointer,
[[maybe_unused]] size_t offsetSamplerState,
uint32_t numSamplers,
const uint32_t threadGroupCount,
uint32_t numThreadsPerThreadGroup,
const Kernel &kernel,
uint32_t bindingTablePrefetchSize,
PreemptionMode preemptionMode,
const Device &device,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
// programInlineData<WALKER_TYPE>
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
Kernel &kernel,
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2022 Intel Corporation
* Copyright (C) 2019-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -12,48 +12,58 @@
namespace NEO {
using Family = Gen12LpFamily;
template <>
void GpgpuWalkerHelper<Gen12LpFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<Gen12LpFamily> *storeCmd) {
void GpgpuWalkerHelper<Family>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<Family> *storeCmd) {
storeCmd->setMmioRemapEnable(true);
}
template <>
void HardwareInterface<Gen12LpFamily>::dispatchWorkarounds(
void HardwareInterface<Family>::dispatchWorkarounds(
LinearStream *commandStream,
CommandQueue &commandQueue,
Kernel &kernel,
const bool &enable) {
using MI_LOAD_REGISTER_IMM = typename Gen12LpFamily::MI_LOAD_REGISTER_IMM;
using PIPE_CONTROL = typename Gen12LpFamily::PIPE_CONTROL;
using MI_LOAD_REGISTER_IMM = typename Family::MI_LOAD_REGISTER_IMM;
using PIPE_CONTROL = typename Family::PIPE_CONTROL;
if (kernel.requiresWaDisableRccRhwoOptimization()) {
PIPE_CONTROL cmdPipeControl = Gen12LpFamily::cmdInitPipeControl;
PIPE_CONTROL cmdPipeControl = Family::cmdInitPipeControl;
cmdPipeControl.setCommandStreamerStallEnable(true);
auto pCmdPipeControl = commandStream->getSpaceForCmd<PIPE_CONTROL>();
*pCmdPipeControl = cmdPipeControl;
uint32_t value = enable ? 0x40004000 : 0x40000000;
NEO::LriHelper<Gen12LpFamily>::program(commandStream,
0x7010,
value,
false);
NEO::LriHelper<Family>::program(commandStream,
0x7010,
value,
false);
}
}
// Returns the command-stream space, in bytes, to reserve for the
// "disable RCC RHWO optimization" workaround for the given kernel.
//
// @param pKernel kernel being dispatched; must not be null (it is dereferenced).
// @return 0 when the kernel does not require the workaround; otherwise the size
//         of two PIPE_CONTROL + MI_LOAD_REGISTER_IMM pairs.
// NOTE(review): the factor of two presumably covers one pair emitted when the
// workaround is enabled and one when it is disabled — confirm against
// dispatchWorkarounds.
template <>
size_t GpgpuWalkerHelper<Family>::getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel) {
    if (pKernel->requiresWaDisableRccRhwoOptimization()) {
        // Use the Family alias for both command types so the expression stays
        // consistent with the rest of this translation unit.
        return (2 * (sizeof(Family::PIPE_CONTROL) + sizeof(Family::MI_LOAD_REGISTER_IMM)));
    }
    return 0u;
}
// Explicit instantiations for this hardware family: the HardwareInterface,
// GpgpuWalkerHelper and EnqueueOperation templates themselves, plus those of
// their member function templates that take the walker command type as a
// template argument, instantiated here with Family::WALKER_TYPE.
template class HardwareInterface<Gen12LpFamily>;
template class HardwareInterface<Family>;
template class GpgpuWalkerHelper<Gen12LpFamily>;
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
template struct EnqueueOperation<Gen12LpFamily>;
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
template struct EnqueueOperation<Family>;
} // namespace NEO

View File

@@ -23,4 +23,54 @@ size_t HardwareCommandsHelper<FamilyType>::getSizeRequiredCS() {
}
// Explicit instantiations of HardwareCommandsHelper for this family, followed
// by its member function templates instantiated with the family's walker
// command type (FamilyType::WALKER_TYPE) and, where the template takes one,
// its interface descriptor type (FamilyType::INTERFACE_DESCRIPTOR_DATA).
template struct HardwareCommandsHelper<FamilyType>;
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
LinearStream &commandStream,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
Kernel &kernel,
uint64_t kernelStartOffset,
uint32_t simd,
const size_t localWorkSize[3],
const uint32_t threadGroupCount,
const uint64_t offsetInterfaceDescriptorTable,
uint32_t &interfaceDescriptorIndex,
PreemptionMode preemptionMode,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime,
uint64_t scratchAddress,
const Device &device);
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
IndirectHeap &indirectHeap,
Kernel &kernel,
bool inlineDataProgrammingRequired,
FamilyType::WALKER_TYPE *walkerCmd,
uint32_t &sizeCrossThreadData,
uint64_t scratchAddress);
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor,
uint64_t kernelStartOffset,
size_t sizeCrossThreadData,
size_t sizePerThreadData,
size_t bindingTablePointer,
[[maybe_unused]] size_t offsetSamplerState,
uint32_t numSamplers,
const uint32_t threadGroupCount,
uint32_t numThreadsPerThreadGroup,
const Kernel &kernel,
uint32_t bindingTablePrefetchSize,
PreemptionMode preemptionMode,
const Device &device,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
Kernel &kernel,
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -13,34 +13,36 @@
namespace NEO {
using Family = Gen8Family;
// Programs the L3SQC_BIT_LQSC_RO_PERF_DIS workaround bit in L3SQC_REG4 for
// kernels whose descriptor has usesFencesForReadWriteImages set; other kernels
// get no commands.
//   disablePerfMode == true : OR the bit into the register.
//   disablePerfMode == false: emit a PIPE_CONTROL with command-streamer stall,
//                             then AND the register with the bit's complement.
// NOTE(review): in this listing each `Gen8Family` line is immediately followed
// by its `Family` counterpart; given `using Family = Gen8Family;` both spell the
// same type.
template <>
void GpgpuWalkerHelper<Gen8Family>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
void GpgpuWalkerHelper<Family>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
if (disablePerfMode) {
if (kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
// Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
GpgpuWalkerHelper<Gen8Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
GpgpuWalkerHelper<Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
}
} else {
if (kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
// Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
typedef typename Gen8Family::PIPE_CONTROL PIPE_CONTROL;
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
auto pipeControlSpace = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
auto pipeControl = Gen8Family::cmdInitPipeControl;
auto pipeControl = Family::cmdInitPipeControl;
pipeControl.setCommandStreamerStallEnable(true);
*pipeControlSpace = pipeControl;
// Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
GpgpuWalkerHelper<Gen8Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
GpgpuWalkerHelper<Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
}
}
}
template <>
size_t GpgpuWalkerHelper<Gen8Family>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
typedef typename Gen8Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
typedef typename Gen8Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
typedef typename Gen8Family::PIPE_CONTROL PIPE_CONTROL;
typedef typename Gen8Family::MI_MATH MI_MATH;
typedef typename Gen8Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
size_t GpgpuWalkerHelper<Family>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
typedef typename Family::MI_MATH MI_MATH;
typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
size_t n = 0;
if (pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
n += sizeof(PIPE_CONTROL) +
@@ -54,10 +56,18 @@ size_t GpgpuWalkerHelper<Gen8Family>::getSizeForWADisableLSQCROPERFforOCL(const
return n;
}
// Explicit instantiations for this hardware family: the HardwareInterface,
// GpgpuWalkerHelper and EnqueueOperation templates themselves, plus those of
// their member function templates that take the walker command type as a
// template argument, instantiated here with Family::WALKER_TYPE.
template class HardwareInterface<Gen8Family>;
template class HardwareInterface<Family>;
template class GpgpuWalkerHelper<Gen8Family>;
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
template struct EnqueueOperation<Gen8Family>;
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
template struct EnqueueOperation<Family>;
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2022 Intel Corporation
* Copyright (C) 2019-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -15,4 +15,53 @@ namespace NEO {
using FamilyType = Gen8Family;
// Explicit instantiations of HardwareCommandsHelper for this family, followed
// by its member function templates instantiated with the family's walker
// command type (FamilyType::WALKER_TYPE) and, where the template takes one,
// its interface descriptor type (FamilyType::INTERFACE_DESCRIPTOR_DATA).
template struct HardwareCommandsHelper<FamilyType>;
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
LinearStream &commandStream,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
Kernel &kernel,
uint64_t kernelStartOffset,
uint32_t simd,
const size_t localWorkSize[3],
const uint32_t threadGroupCount,
const uint64_t offsetInterfaceDescriptorTable,
uint32_t &interfaceDescriptorIndex,
PreemptionMode preemptionMode,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime,
uint64_t scratchAddress,
const Device &device);
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
IndirectHeap &indirectHeap,
Kernel &kernel,
bool inlineDataProgrammingRequired,
FamilyType::WALKER_TYPE *walkerCmd,
uint32_t &sizeCrossThreadData,
uint64_t scratchAddress);
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor,
uint64_t kernelStartOffset,
size_t sizeCrossThreadData,
size_t sizePerThreadData,
size_t bindingTablePointer,
[[maybe_unused]] size_t offsetSamplerState,
uint32_t numSamplers,
const uint32_t threadGroupCount,
uint32_t numThreadsPerThreadGroup,
const Kernel &kernel,
uint32_t bindingTablePrefetchSize,
PreemptionMode preemptionMode,
const Device &device,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
Kernel &kernel,
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2022 Intel Corporation
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -12,35 +12,37 @@
namespace NEO {
using Family = Gen9Family;
// Programs the L3SQC_BIT_LQSC_RO_PERF_DIS workaround bit in L3SQC_REG4 for
// kernels whose descriptor has usesFencesForReadWriteImages set; other kernels
// get no commands.
//   disablePerfMode == true : OR the bit into the register.
//   disablePerfMode == false: emit a PIPE_CONTROL with command-streamer stall,
//                             then AND the register with the bit's complement.
// NOTE(review): in this listing each `Gen9Family` line is immediately followed
// by its `Family` counterpart; given `using Family = Gen9Family;` both spell the
// same type.
template <>
void GpgpuWalkerHelper<Gen9Family>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
void GpgpuWalkerHelper<Family>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
if (disablePerfMode) {
if (kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
// Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
GpgpuWalkerHelper<Gen9Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
GpgpuWalkerHelper<Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
}
} else {
if (kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
// Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
typedef typename Gen9Family::PIPE_CONTROL PIPE_CONTROL;
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
auto pipeControlSpace = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
auto pipeControl = Gen9Family::cmdInitPipeControl;
auto pipeControl = Family::cmdInitPipeControl;
pipeControl.setCommandStreamerStallEnable(true);
*pipeControlSpace = pipeControl;
// Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
GpgpuWalkerHelper<Gen9Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
GpgpuWalkerHelper<Family>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, AluRegisters::OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
}
}
}
template <>
size_t GpgpuWalkerHelper<Gen9Family>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
typedef typename Gen9Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
typedef typename Gen9Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
typedef typename Gen9Family::PIPE_CONTROL PIPE_CONTROL;
typedef typename Gen9Family::MI_MATH MI_MATH;
typedef typename Gen9Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
size_t GpgpuWalkerHelper<Family>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
typedef typename Family::MI_MATH MI_MATH;
typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
size_t n = 0;
if (pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesFencesForReadWriteImages) {
n += sizeof(PIPE_CONTROL) +
@@ -54,10 +56,18 @@ size_t GpgpuWalkerHelper<Gen9Family>::getSizeForWADisableLSQCROPERFforOCL(const
return n;
}
// Explicit instantiations for this hardware family: the HardwareInterface,
// GpgpuWalkerHelper and EnqueueOperation templates themselves, plus those of
// their member function templates that take the walker command type as a
// template argument, instantiated here with Family::WALKER_TYPE.
template class HardwareInterface<Gen9Family>;
template class HardwareInterface<Family>;
template class GpgpuWalkerHelper<Gen9Family>;
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
template struct EnqueueOperation<Gen9Family>;
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
template struct EnqueueOperation<Family>;
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2022 Intel Corporation
* Copyright (C) 2019-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -17,4 +17,53 @@ namespace NEO {
using FamilyType = Gen9Family;
// Explicit instantiations of HardwareCommandsHelper for this family, followed
// by its member function templates instantiated with the family's walker
// command type (FamilyType::WALKER_TYPE) and, where the template takes one,
// its interface descriptor type (FamilyType::INTERFACE_DESCRIPTOR_DATA).
template struct HardwareCommandsHelper<FamilyType>;
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
LinearStream &commandStream,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
Kernel &kernel,
uint64_t kernelStartOffset,
uint32_t simd,
const size_t localWorkSize[3],
const uint32_t threadGroupCount,
const uint64_t offsetInterfaceDescriptorTable,
uint32_t &interfaceDescriptorIndex,
PreemptionMode preemptionMode,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime,
uint64_t scratchAddress,
const Device &device);
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
IndirectHeap &indirectHeap,
Kernel &kernel,
bool inlineDataProgrammingRequired,
FamilyType::WALKER_TYPE *walkerCmd,
uint32_t &sizeCrossThreadData,
uint64_t scratchAddress);
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor,
uint64_t kernelStartOffset,
size_t sizeCrossThreadData,
size_t sizePerThreadData,
size_t bindingTablePointer,
[[maybe_unused]] size_t offsetSamplerState,
uint32_t numSamplers,
const uint32_t threadGroupCount,
uint32_t numThreadsPerThreadGroup,
const Kernel &kernel,
uint32_t bindingTablePrefetchSize,
PreemptionMode preemptionMode,
const Device &device,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
Kernel &kernel,
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
} // namespace NEO

View File

@@ -37,6 +37,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
inline static uint32_t additionalSizeRequiredDsh();
template <typename WalkerType, typename InterfaceDescriptorType>
static size_t sendInterfaceDescriptorData(
const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor,
@@ -51,9 +52,9 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
const Kernel &kernel,
uint32_t bindingTablePrefetchSize,
PreemptionMode preemptionMode,
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
const Device &device,
WALKER_TYPE *walkerCmd);
WalkerType *walkerCmd,
InterfaceDescriptorType *inlineInterfaceDescriptor);
static void sendMediaStateFlush(
LinearStream &commandStream,
@@ -64,13 +65,16 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
size_t offsetInterfaceDescriptorData,
size_t sizeInterfaceDescriptorData);
// Declaration of the cross-thread-data programming helper, parameterized on
// the walker command type.
// NOTE(review): this listing shows the previous parameter tail
// (`WALKER_TYPE *walkerCmd, uint32_t &sizeCrossThreadData`) directly followed by
// the new one, which takes `WalkerType *` and adds `scratchAddress`.
// NOTE(review): the result is stored by sendIndirectState as
// `offsetCrossThreadData`, so it is presumably the offset of the data within
// `indirectHeap` - confirm against the definitions.
template <typename WalkerType>
static size_t sendCrossThreadData(
IndirectHeap &indirectHeap,
Kernel &kernel,
bool inlineDataProgrammingRequired,
WALKER_TYPE *walkerCmd,
uint32_t &sizeCrossThreadData);
WalkerType *walkerCmd,
uint32_t &sizeCrossThreadData,
uint64_t scratchAddress);
template <typename WalkerType, typename InterfaceDescriptorType>
static size_t sendIndirectState(
LinearStream &commandStream,
IndirectHeap &dsh,
@@ -84,11 +88,15 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
const uint64_t offsetInterfaceDescriptorTable,
uint32_t &interfaceDescriptorIndex,
PreemptionMode preemptionMode,
WALKER_TYPE *walkerCmd,
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
WalkerType *walkerCmd,
InterfaceDescriptorType *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime,
uint64_t scratchAddress,
const Device &device);
// Hook for programming the walker's inline data from the indirect-data and
// scratch addresses; sendCrossThreadData calls it only when heapless mode is
// enabled for WalkerType.
// NOTE(review): the generic definition visible in this change has an empty
// body, so any real programming presumably lives in family-specific code -
// confirm.
template <typename WalkerType>
static void programInlineData(Kernel &kernel, WalkerType *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
static void programPerThreadData(
bool localIdsGenerationByRuntime,
size_t &sizePerThreadData,

View File

@@ -19,12 +19,15 @@
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
#include "shared/source/kernel/implicit_args.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/context/context.h"
#include "opencl/source/helpers/dispatch_info.h"
#include "opencl/source/kernel/kernel.h"
#include "hardware_commands_helper.h"
namespace NEO {
template <typename GfxFamily>
@@ -117,66 +120,68 @@ size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(
}
template <typename GfxFamily>
template <typename WalkerType, typename InterfaceDescriptorType>
size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor,
uint64_t kernelStartOffset,
size_t sizeCrossThreadData,
size_t sizePerThreadData,
size_t bindingTablePointer,
[[maybe_unused]] size_t bindingTablePointer,
[[maybe_unused]] size_t offsetSamplerState,
uint32_t numSamplers,
[[maybe_unused]] uint32_t numSamplers,
const uint32_t threadGroupCount,
uint32_t threadsPerThreadGroup,
const Kernel &kernel,
uint32_t bindingTablePrefetchSize,
[[maybe_unused]] uint32_t bindingTablePrefetchSize,
PreemptionMode preemptionMode,
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
const Device &device,
WALKER_TYPE *walkerCmd) {
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
using SHARED_LOCAL_MEMORY_SIZE = typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
WalkerType *walkerCmd,
InterfaceDescriptorType *inlineInterfaceDescriptor) {
const auto &hardwareInfo = device.getHardwareInfo();
const auto &kernelDescriptor = kernel.getKernelInfo().kernelDescriptor;
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
// Allocate some memory for the interface descriptor
auto pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
auto interfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData;
InterfaceDescriptorType *pInterfaceDescriptor = nullptr;
auto interfaceDescriptor = GfxFamily::template getInitInterfaceDescriptor<InterfaceDescriptorType>();
// Program the kernel start pointer
interfaceDescriptor.setKernelStartPointer(static_cast<uint32_t>(kernelStartOffset & std::numeric_limits<uint32_t>::max()));
if constexpr (heaplessModeEnabled) {
pInterfaceDescriptor = inlineInterfaceDescriptor;
interfaceDescriptor.setKernelStartPointer(kernelStartOffset);
} else {
pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
interfaceDescriptor.setKernelStartPointer(static_cast<uint32_t>(kernelStartOffset));
}
// # of threads in thread group should be based on LWS.
// # of threads in thread group should be based on LWS.
interfaceDescriptor.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
auto slmTotalSize = kernel.getSlmTotalSize();
const auto &kernelDescriptor = kernel.getKernelInfo().kernelDescriptor;
EncodeDispatchKernel<GfxFamily>::setGrfInfo(&interfaceDescriptor, kernelDescriptor.kernelAttributes.numGrfRequired,
sizeCrossThreadData, sizePerThreadData, device.getRootDeviceEnvironment());
auto &productHelper = device.getProductHelper();
productHelper.updateIddCommand(&interfaceDescriptor, kernelDescriptor.kernelAttributes.numGrfRequired,
kernelDescriptor.kernelAttributes.threadArbitrationPolicy);
EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, device.getRootDeviceEnvironment(), threadsPerThreadGroup,
slmTotalSize, SlmPolicy::SlmPolicyNone);
EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, device.getRootDeviceEnvironment(),
threadsPerThreadGroup, slmTotalSize, SlmPolicy::SlmPolicyNone);
interfaceDescriptor.setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
if constexpr (heaplessModeEnabled == false) {
interfaceDescriptor.setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
if constexpr (GfxFamily::supportsSampler) {
if (device.getDeviceInfo().imageSupport) {
interfaceDescriptor.setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState));
if constexpr (GfxFamily::supportsSampler) {
if (device.getDeviceInfo().imageSupport) {
interfaceDescriptor.setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState));
}
}
EncodeDispatchKernel<GfxFamily>::adjustBindingTablePrefetch(interfaceDescriptor, numSamplers, bindingTablePrefetchSize);
}
EncodeDispatchKernel<GfxFamily>::adjustBindingTablePrefetch(interfaceDescriptor, numSamplers, bindingTablePrefetchSize);
const auto &hardwareInfo = device.getHardwareInfo();
auto &gfxCoreHelper = device.getGfxCoreHelper();
auto programmableIDSLMSize =
static_cast<SHARED_LOCAL_MEMORY_SIZE>(gfxCoreHelper.computeSlmValues(hardwareInfo, slmTotalSize));
auto programmableIDSLMSize = static_cast<uint32_t>(gfxCoreHelper.computeSlmValues(hardwareInfo, slmTotalSize));
if (DebugManager.flags.OverrideSlmAllocationSize.get() != -1) {
programmableIDSLMSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(DebugManager.flags.OverrideSlmAllocationSize.get());
programmableIDSLMSize = static_cast<uint32_t>(DebugManager.flags.OverrideSlmAllocationSize.get());
}
interfaceDescriptor.setSharedLocalMemorySize(programmableIDSLMSize);
@@ -212,6 +217,7 @@ void HardwareCommandsHelper<GfxFamily>::programPerThreadData(
}
template <typename GfxFamily>
template <typename WalkerType, typename InterfaceDescriptorType>
size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
LinearStream &commandStream,
IndirectHeap &dsh,
@@ -225,34 +231,42 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
const uint64_t offsetInterfaceDescriptorTable,
uint32_t &interfaceDescriptorIndex,
PreemptionMode preemptionMode,
WALKER_TYPE *walkerCmd,
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
WalkerType *walkerCmd,
InterfaceDescriptorType *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime,
uint64_t scratchAddress,
const Device &device) {
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
DEBUG_BREAK_IF(simd != 1 && simd != 8 && simd != 16 && simd != 32);
auto inlineDataProgrammingRequired = EncodeDispatchKernel<GfxFamily>::inlineDataProgrammingRequired(kernel.getKernelInfo().kernelDescriptor);
// Copy the kernel over to the ISH
const auto &kernelInfo = kernel.getKernelInfo();
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
size_t dstBindingTablePointer = HardwareCommandsHelper<GfxFamily>::checkForAdditionalBTAndSetBTPointer(ssh, kernel);
// Copy our sampler state if it exists
const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
size_t dstBindingTablePointer = 0;
uint32_t samplerCount = 0;
uint32_t samplerStateOffset = 0;
if (isValidOffset(samplerTable.tableOffset) && isValidOffset(samplerTable.borderColor)) {
samplerCount = samplerTable.numSamplers;
samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, samplerTable.tableOffset,
samplerCount, samplerTable.borderColor,
kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
device.getRootDeviceEnvironment());
uint32_t bindingTablePrefetchSize = 0;
if constexpr (heaplessModeEnabled == false) {
ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
dstBindingTablePointer = HardwareCommandsHelper<GfxFamily>::checkForAdditionalBTAndSetBTPointer(ssh, kernel);
const auto &kernelInfo = kernel.getKernelInfo();
// Copy our sampler state if it exists
const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
if (isValidOffset(samplerTable.tableOffset) && isValidOffset(samplerTable.borderColor)) {
samplerCount = samplerTable.numSamplers;
samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, samplerTable.tableOffset,
samplerCount, samplerTable.borderColor,
kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
device.getRootDeviceEnvironment());
}
if (EncodeSurfaceState<GfxFamily>::doBindingTablePrefetch()) {
bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
}
}
auto &gfxCoreHelper = device.getGfxCoreHelper();
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
@@ -260,9 +274,10 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
auto inlineDataProgrammingRequired = EncodeDispatchKernel<GfxFamily>::inlineDataProgrammingRequired(kernel.getKernelInfo().kernelDescriptor);
size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData<WalkerType>(
ioh, kernel, inlineDataProgrammingRequired,
walkerCmd, sizeCrossThreadData);
walkerCmd, sizeCrossThreadData, scratchAddress);
size_t sizePerThreadDataTotal = 0;
size_t sizePerThreadData = 0;
@@ -275,14 +290,9 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
kernel,
localWorkSize);
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * GfxFamily::template getInterfaceDescriptorSize<WalkerType>();
auto bindingTablePrefetchSize = 0;
if (EncodeSurfaceState<GfxFamily>::doBindingTablePrefetch()) {
bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
}
HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData<WalkerType, InterfaceDescriptorType>(
dsh,
offsetInterfaceDescriptor,
kernelStartOffset,
@@ -296,31 +306,36 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
kernel,
bindingTablePrefetchSize,
preemptionMode,
inlineInterfaceDescriptor,
device,
walkerCmd);
walkerCmd,
inlineInterfaceDescriptor);
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
PatchInfoData patchInfoData(kernelStartOffset, 0, PatchInfoAllocationType::InstructionHeap, dsh.getGraphicsAllocation()->getGpuAddress(), offsetInterfaceDescriptor, PatchInfoAllocationType::DynamicStateHeap);
kernel.getPatchInfoDataList().push_back(patchInfoData);
}
// Program media state flush to set interface descriptor offset
sendMediaStateFlush(
commandStream,
interfaceDescriptorIndex);
if constexpr (heaplessModeEnabled == false) {
// Program media state flush to set interface descriptor offset
sendMediaStateFlush(
commandStream,
interfaceDescriptorIndex);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex);
auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
walkerCmd->setIndirectDataLength(indirectDataLength);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex);
auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
walkerCmd->setIndirectDataLength(indirectDataLength);
}
return offsetCrossThreadData;
}
// Generic definition of the inline-data programming hook. It deliberately does
// nothing: none of the arguments are read and no command is written.
// NOTE(review): presumably families that support heapless mode supply their
// own definition - confirm.
template <typename GfxFamily>
template <typename WalkerType>
void HardwareCommandsHelper<GfxFamily>::programInlineData(
    Kernel &kernel,
    WalkerType *walkerCmd,
    uint64_t indirectDataAddress,
    uint64_t scratchAddress) {
    // Intentionally empty.
}
template <typename GfxFamily>
bool HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(const Kernel &kernel) {
return kernel.getKernelInfo().kernelDescriptor.kernelAttributes.numLocalIdChannels > 0;

View File

@@ -71,13 +71,15 @@ void HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
}
template <typename GfxFamily>
template <typename WalkerType>
size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
IndirectHeap &indirectHeap,
Kernel &kernel,
bool inlineDataProgrammingRequired,
WALKER_TYPE *walkerCmd,
uint32_t &sizeCrossThreadData) {
indirectHeap.align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
WalkerType *walkerCmd,
uint32_t &sizeCrossThreadData,
uint64_t scratchAddress) {
indirectHeap.align(WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
auto pImplicitArgs = kernel.getImplicitArgs();
if (pImplicitArgs) {

View File

@@ -52,14 +52,20 @@ void HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
}
template <typename GfxFamily>
template <typename WalkerType>
size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
IndirectHeap &indirectHeap,
Kernel &kernel,
bool inlineDataProgrammingRequired,
WALKER_TYPE *walkerCmd,
uint32_t &sizeCrossThreadData) {
WalkerType *walkerCmd,
uint32_t &sizeCrossThreadData,
[[maybe_unused]] uint64_t scratchAddress) {
indirectHeap.align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
if constexpr (heaplessModeEnabled == false) {
indirectHeap.align(WalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
}
auto offsetCrossThreadData = indirectHeap.getUsed();
char *dest = nullptr;
@@ -96,11 +102,16 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), gfxCoreHelper);
}
using InlineData = typename GfxFamily::INLINE_DATA;
using GRF = typename GfxFamily::GRF;
uint32_t inlineDataSize = sizeof(InlineData);
uint32_t sizeToCopy = sizeCrossThreadData;
if (inlineDataProgrammingRequired == true) {
using InlineData = typename GfxFamily::INLINE_DATA;
uint32_t inlineDataSize = sizeof(InlineData);
if constexpr (heaplessModeEnabled) {
inlineDataSize = 64;
}
sizeToCopy = std::min(inlineDataSize, sizeCrossThreadData);
dest = reinterpret_cast<char *>(walkerCmd->getInlineDataPointer());
memcpy_s(dest, sizeToCopy, kernel.getCrossThreadData(), sizeToCopy);
@@ -114,6 +125,14 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
memcpy_s(dest, sizeCrossThreadData, src, sizeCrossThreadData);
}
if constexpr (heaplessModeEnabled) {
auto device = kernel.getContext().getDevice(0);
uint64_t indirectDataAddress = device->getMemoryManager()->getInternalHeapBaseAddress(device->getRootDeviceIndex(), indirectHeap.getGraphicsAllocation()->isAllocatedInLocalMemoryPool());
indirectDataAddress += indirectHeap.getHeapGpuStartOffset();
HardwareCommandsHelper<GfxFamily>::programInlineData<WalkerType>(kernel, walkerCmd, indirectDataAddress, scratchAddress);
}
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
FlatBatchBufferHelper::fixCrossThreadDataInfo(kernel.getPatchInfoDataList(), offsetCrossThreadData, indirectHeap.getGraphicsAllocation()->getGpuAddress());
}

View File

@@ -109,6 +109,7 @@ std::string Program::getInternalOptions() const {
auto isDebuggerActive = pClDevice->getDevice().getDebugger() != nullptr;
CompilerOptions::concatenateAppend(internalOptions, compilerProductHelper.getCachingPolicyOptions(isDebuggerActive));
CompilerOptions::applyExtraInternalOptions(internalOptions, compilerProductHelper);
return internalOptions;
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
* Copyright (C) 2021-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -12,10 +12,20 @@
namespace NEO {
template class GpgpuWalkerHelper<XeHpcCoreFamily>;
using Family = XeHpcCoreFamily;
template class HardwareInterface<XeHpcCoreFamily>;
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
template struct EnqueueOperation<XeHpcCoreFamily>;
template class HardwareInterface<Family>;
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
template struct EnqueueOperation<Family>;
} // namespace NEO

View File

@@ -16,4 +16,53 @@ namespace NEO {
// Explicit instantiations of HardwareCommandsHelper for the XeHpc core family.
// The struct is instantiated as a whole, and each walker-templated member function is
// additionally instantiated for the family's default WALKER_TYPE (and, where the member
// takes one, its INTERFACE_DESCRIPTOR_DATA).
// NOTE(review): presumably the member-template definitions are only visible in this
// translation unit, which is why they are instantiated here - confirm.
using FamilyType = XeHpcCoreFamily;
template struct HardwareCommandsHelper<FamilyType>;
// sendIndirectState<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
LinearStream &commandStream,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
Kernel &kernel,
uint64_t kernelStartOffset,
uint32_t simd,
const size_t localWorkSize[3],
const uint32_t threadGroupCount,
const uint64_t offsetInterfaceDescriptorTable,
uint32_t &interfaceDescriptorIndex,
PreemptionMode preemptionMode,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime,
uint64_t scratchAddress,
const Device &device);
// sendCrossThreadData<WALKER_TYPE>
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
IndirectHeap &indirectHeap,
Kernel &kernel,
bool inlineDataProgrammingRequired,
FamilyType::WALKER_TYPE *walkerCmd,
uint32_t &sizeCrossThreadData,
uint64_t scratchAddress);
// sendInterfaceDescriptorData<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor,
uint64_t kernelStartOffset,
size_t sizeCrossThreadData,
size_t sizePerThreadData,
size_t bindingTablePointer,
[[maybe_unused]] size_t offsetSamplerState,
uint32_t numSamplers,
const uint32_t threadGroupCount,
uint32_t numThreadsPerThreadGroup,
const Kernel &kernel,
uint32_t bindingTablePrefetchSize,
PreemptionMode preemptionMode,
const Device &device,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
// programInlineData<WALKER_TYPE>
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
Kernel &kernel,
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
* Copyright (C) 2021-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -12,10 +12,20 @@
namespace NEO {
template class GpgpuWalkerHelper<XeHpgCoreFamily>;
using Family = XeHpgCoreFamily;
template class HardwareInterface<XeHpgCoreFamily>;
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::WALKER_TYPE>(LinearStream *cmdStream, Family::WALKER_TYPE *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::WALKER_TYPE>(Family::WALKER_TYPE *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t globalOffsets[3], const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);
template struct EnqueueOperation<XeHpgCoreFamily>;
template class HardwareInterface<Family>;
template void HardwareInterface<Family>::dispatchWalker<Family::WALKER_TYPE>(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::programWalker<Family::WALKER_TYPE>(LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, const DispatchInfo &dispatchInfo, HardwareInterfaceWalkerArgs &walkerArgs);
template void HardwareInterface<Family>::dispatchKernelCommands<Family::WALKER_TYPE>(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, HardwareInterfaceWalkerArgs &walkerArgs);
template Family::WALKER_TYPE *HardwareInterface<Family>::allocateWalkerSpace<Family::WALKER_TYPE>(LinearStream &commandStream, const Kernel &kernel);
template struct EnqueueOperation<Family>;
} // namespace NEO

View File

@@ -16,4 +16,52 @@ namespace NEO {
// Explicit instantiations of HardwareCommandsHelper for the XeHpg core family.
// The struct is instantiated as a whole, and each walker-templated member function is
// additionally instantiated for the family's default WALKER_TYPE (and, where the member
// takes one, its INTERFACE_DESCRIPTOR_DATA).
// NOTE(review): presumably the member-template definitions are only visible in this
// translation unit, which is why they are instantiated here - confirm.
using FamilyType = XeHpgCoreFamily;
template struct HardwareCommandsHelper<FamilyType>;
// sendIndirectState<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>
template size_t HardwareCommandsHelper<FamilyType>::sendIndirectState<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
LinearStream &commandStream,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
Kernel &kernel,
uint64_t kernelStartOffset,
uint32_t simd,
const size_t localWorkSize[3],
const uint32_t threadGroupCount,
const uint64_t offsetInterfaceDescriptorTable,
uint32_t &interfaceDescriptorIndex,
PreemptionMode preemptionMode,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime,
uint64_t scratchAddress,
const Device &device);
// sendCrossThreadData<WALKER_TYPE>
template size_t HardwareCommandsHelper<FamilyType>::sendCrossThreadData<FamilyType::WALKER_TYPE>(
IndirectHeap &indirectHeap,
Kernel &kernel,
bool inlineDataProgrammingRequired,
FamilyType::WALKER_TYPE *walkerCmd,
uint32_t &sizeCrossThreadData,
uint64_t scratchAddress);
// sendInterfaceDescriptorData<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>
template size_t HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData<FamilyType::WALKER_TYPE, FamilyType::INTERFACE_DESCRIPTOR_DATA>(
const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor,
uint64_t kernelStartOffset,
size_t sizeCrossThreadData,
size_t sizePerThreadData,
size_t bindingTablePointer,
[[maybe_unused]] size_t offsetSamplerState,
uint32_t numSamplers,
const uint32_t threadGroupCount,
uint32_t numThreadsPerThreadGroup,
const Kernel &kernel,
uint32_t bindingTablePrefetchSize,
PreemptionMode preemptionMode,
const Device &device,
FamilyType::WALKER_TYPE *walkerCmd,
FamilyType::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
// programInlineData<WALKER_TYPE>
template void HardwareCommandsHelper<FamilyType>::programInlineData<FamilyType::WALKER_TYPE>(
Kernel &kernel,
FamilyType::WALKER_TYPE *walkerCmd, uint64_t indirectDataAddress, uint64_t scratchAddress);
} // namespace NEO

View File

@@ -169,7 +169,7 @@ HWTEST_F(DispatchWalkerTest, WhenDispatchingWalkerThenCommandStreamMemoryIsntCha
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -212,7 +212,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalIdsWhenDispatchingWalkerThenWalkerIsDis
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -239,7 +239,7 @@ HWTEST_F(DispatchWalkerTest, GivenDefaultLwsAlgorithmWhenDispatchingWalkerThenDi
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -267,7 +267,7 @@ HWTEST_F(DispatchWalkerTest, GivenSquaredLwsAlgorithmWhenDispatchingWalkerThenDi
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -293,7 +293,7 @@ HWTEST_F(DispatchWalkerTest, GivenNdLwsAlgorithmWhenDispatchingWalkerThenDimensi
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -320,7 +320,7 @@ HWTEST_F(DispatchWalkerTest, GivenOldLwsAlgorithmWhenDispatchingWalkerThenDimens
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -347,7 +347,7 @@ HWTEST_F(DispatchWalkerTest, GivenNumWorkGroupsWhenDispatchingWalkerThenNumWorkG
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -377,7 +377,7 @@ HWTEST_F(DispatchWalkerTest, GivenGlobalWorkOffsetWhenDispatchingWalkerThenGloba
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -407,7 +407,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatch
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -437,7 +437,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThe
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -468,7 +468,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatch
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -499,7 +499,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffW
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -528,7 +528,7 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsC
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -560,7 +560,7 @@ HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLw
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -598,7 +598,7 @@ HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorre
MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -650,7 +650,7 @@ HWTEST_F(DispatchWalkerTest, GivenSplitWalkerWhenDispatchingWalkerThenLwsIsCorre
multiDispatchInfo.push(di2);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -703,7 +703,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenCommandSt
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.blockedCommandsData = blockedCommandsData.get();
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -735,7 +735,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH
multiDispatchInfo.push(dispatchInfo);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.blockedCommandsData = blockedCommandsData.get();
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -782,7 +782,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.blockedCommandsData = blockedCommandsData.get();
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -805,7 +805,7 @@ HWTEST_F(DispatchWalkerTest, givenBlockedQueueWhenDispatchWalkerIsCalledThenComm
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.blockedCommandsData = blockedCommandsData.get();
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -831,7 +831,7 @@ HWTEST_F(DispatchWalkerTest, givenThereAreAllocationsForReuseWhenDispatchWalkerI
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.blockedCommandsData = blockedCommandsData.get();
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -851,7 +851,7 @@ HWTEST_F(DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenWorkDi
MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector<Kernel *>({&kernel1, &kernel2}));
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -888,7 +888,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleKernelsWhenDispatch
auto dshBeforeMultiDisptach = indirectHeap.getUsed();
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -971,7 +971,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleKernelsWhenDispatch
// create commandStream
auto &cmdStream = pCmdQ->getCS(0);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -1011,7 +1011,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleKernelsWhenDispatch
// create commandStream
auto &cmdStream = pCmdQ->getCS(0);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -1056,7 +1056,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleDispatchInfoAndSame
// create commandStream
auto &cmdStream = pCmdQ->getCS(0);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -1129,7 +1129,7 @@ HWTEST_P(DispatchWalkerTestForAuxTranslation, givenKernelWhenAuxToNonAuxWhenTran
builder.buildDispatchInfosForAuxTranslation<FamilyType>(multiDispatchInfo, builtinOpsParams);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -1178,7 +1178,7 @@ HWTEST_P(DispatchWalkerTestForAuxTranslation, givenKernelWhenNonAuxToAuxWhenTran
builder.buildDispatchInfosForAuxTranslation<FamilyType>(multiDispatchInfo, builtinOpsParams);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -1349,7 +1349,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
multiDispatchInfoWithoutImplicitArgs.push(dispatchInfoWithoutImplicitArgs);
HardwareInterfaceWalkerArgs walkerArgsWithoutImplicitArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgsWithoutImplicitArgs.blockedCommandsData = blockedCommandsData.get();
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfoWithoutImplicitArgs,
CsrDependencies(),
@@ -1364,7 +1364,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
multiDispatchInfoWithImplicitArgs.push(dispatchInfoWithImplicitArgs);
HardwareInterfaceWalkerArgs walkerArgsWithImplicitArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgsWithImplicitArgs.blockedCommandsData = blockedCommandsData.get();
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*pCmdQ,
multiDispatchInfoWithImplicitArgs,
CsrDependencies(),

View File

@@ -111,7 +111,7 @@ HWTEST2_F(Dg2AndLaterDispatchWalkerBasicTest, givenTimestampPacketWhenDispatchin
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.currentTimestampPacketNodes = &timestampPacketContainer;
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
cmdQ,
multiDispatchInfo,
CsrDependencies(),

View File

@@ -464,7 +464,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenTimestamp
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.currentTimestampPacketNodes = &timestampPacketContainer;
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<COMPUTE_WALKER>(
cmdQ,
multiDispatchInfo,
CsrDependencies(),

View File

@@ -60,8 +60,8 @@ void HardwareCommandsTest::addSpaceForSingleKernelArg() {
}
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenProgramInterfaceDescriptorDataIsCreatedThenOnlyRequiredSpaceOnIndirectHeapIsAllocated) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
WALKER_TYPE walkerCmd{};
using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
GPGPU_WALKER walkerCmd{};
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
@@ -87,15 +87,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenProgramInterfaceDescriptor
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
typedef typename FamilyType::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA;
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
auto usedIndirectHeapBefore = indirectHeap.getUsed();
indirectHeap.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
const uint32_t threadGroupCount = 1u;
size_t crossThreadDataSize = kernel->getCrossThreadDataSize();
HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, threadGroupCount, 1, *kernel, 0, pDevice->getPreemptionMode(), nullptr, *pDevice, &walkerCmd);
HardwareCommandsHelper<FamilyType>::template sendInterfaceDescriptorData<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, threadGroupCount, 1, *kernel, 0, pDevice->getPreemptionMode(), *pDevice, &walkerCmd, nullptr);
auto usedIndirectHeapAfter = indirectHeap.getUsed();
EXPECT_EQ(sizeof(INTERFACE_DESCRIPTOR_DATA), usedIndirectHeapAfter - usedIndirectHeapBefore);
@@ -137,7 +137,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenMediaStateFlushIsCreatedTh
HWTEST_F(HardwareCommandsTest, WhenCrossThreadDataIsCreatedThenOnlyRequiredSpaceOnIndirectHeapIsAllocated) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
@@ -166,12 +166,12 @@ HWTEST_F(HardwareCommandsTest, WhenCrossThreadDataIsCreatedThenOnlyRequiredSpace
auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
auto usedBefore = indirectHeap.getUsed();
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
indirectHeap,
*kernel,
false,
nullptr,
sizeCrossThreadData);
sizeCrossThreadData, 0);
auto usedAfter = indirectHeap.getUsed();
EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore);
@@ -179,6 +179,7 @@ HWTEST_F(HardwareCommandsTest, WhenCrossThreadDataIsCreatedThenOnlyRequiredSpace
HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsNotSetThenAddPatchInfoDataOffsetsAreNotMoved) {
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
MockContext context;
@@ -192,12 +193,13 @@ HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComme
PatchInfoData patchInfoData = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
kernel->getPatchInfoDataList().push_back(patchInfoData);
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
indirectHeap,
*kernel,
false,
nullptr,
sizeCrossThreadData);
sizeCrossThreadData,
0);
ASSERT_EQ(1u, kernel->getPatchInfoDataList().size());
EXPECT_EQ(0xaaaaaaaa, kernel->getPatchInfoDataList()[0].sourceAllocation);
@@ -209,32 +211,36 @@ HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComme
}
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
auto nonInternalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
IndirectHeap indirectHeap(nonInternalAllocation, false);
auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
auto offset = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
indirectHeap,
*mockKernelWithInternal->mockKernel,
false,
nullptr,
sizeCrossThreadData);
sizeCrossThreadData,
0);
EXPECT_EQ(0u, offset);
pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
}
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenHeapBaseOffsetIsReturned) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
auto internalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties(pDevice->getRootDeviceIndex(), true, MemoryConstants::pageSize, AllocationType::INTERNAL_HEAP, pDevice->getDeviceBitfield()));
IndirectHeap indirectHeap(internalAllocation, true);
auto expectedOffset = internalAllocation->getGpuAddressToPatch();
auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
auto offset = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
indirectHeap,
*mockKernelWithInternal->mockKernel,
false,
nullptr,
sizeCrossThreadData);
sizeCrossThreadData,
0);
EXPECT_EQ(expectedOffset, offset);
pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
@@ -243,6 +249,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenIndirectHeapAllocatedFrom
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsSetThenAddPatchInfoDataOffsetsAreMoved) {
DebugManagerStateRestore dbgRestore;
DebugManager.flags.AddPatchInfoCommentsForAUBDump.set(true);
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
@@ -262,12 +269,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenSendCrossThreadDataWhenWh
kernel->getPatchInfoDataList().push_back(patchInfoData1);
kernel->getPatchInfoDataList().push_back(patchInfoData2);
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
indirectHeap,
*kernel,
false,
nullptr,
sizeCrossThreadData);
sizeCrossThreadData,
0);
ASSERT_NE(0u, offsetCrossThreadData);
EXPECT_EQ(128u, offsetCrossThreadData);
@@ -340,7 +348,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernel);
HardwareCommandsHelper<FamilyType>::sendIndirectState(
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -356,6 +364,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
pWalkerCmd,
nullptr,
true,
0,
*pDevice);
// It's okay these are EXPECT_GE as they're only going to be used for
@@ -397,7 +406,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
HardwareCommandsHelper<FamilyType>::sendIndirectState(
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -413,6 +422,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
pWalkerCmd,
nullptr,
true,
0,
*pDevice);
auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
@@ -445,7 +455,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
HardwareCommandsHelper<FamilyType>::sendIndirectState(
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -461,6 +471,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
pWalkerCmd,
nullptr,
true,
0,
*pDevice);
auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
@@ -526,7 +537,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(mockKernel);
HardwareCommandsHelper<FamilyType>::sendIndirectState(
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -542,6 +553,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
pWalkerCmd,
nullptr,
true,
0,
*pDevice);
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
@@ -567,6 +579,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
typedef typename FamilyType::BINDING_TABLE_STATE BINDING_TABLE_STATE;
typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
@@ -619,7 +632,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernel);
HardwareCommandsHelper<FamilyType>::sendIndirectState(
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -635,6 +648,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
pWalkerCmd,
nullptr,
true,
0,
*pDevice);
EXPECT_EQ(sshUsed + 0x00000000u, *(&bindingTableStatesPointers[0]));
@@ -730,7 +744,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*pKernel);
HardwareCommandsHelper<FamilyType>::sendIndirectState(
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -746,6 +760,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
pWalkerCmd,
nullptr,
true,
0,
*pDevice);
bti = reinterpret_cast<typename FamilyType::BINDING_TABLE_STATE *>(reinterpret_cast<unsigned char *>(ssh.getCpuBase()) + localSshOffset + btiOffset);
@@ -901,7 +916,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
// Undefined Offset, Defined BorderColorOffset
mockKernelWithInternal->kernelInfo.setSamplerTable(0, 2, undefined<uint16_t>);
HardwareCommandsHelper<FamilyType>::sendIndirectState(
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -917,6 +932,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
pWalkerCmd,
nullptr,
true,
0,
*pDevice);
auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
@@ -926,7 +942,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
// Defined Offset, Undefined BorderColorOffset
mockKernelWithInternal->kernelInfo.setSamplerTable(undefined<uint16_t>, 2, 0);
HardwareCommandsHelper<FamilyType>::sendIndirectState(
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -942,6 +958,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
pWalkerCmd,
nullptr,
true,
0,
*pDevice);
interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
@@ -998,7 +1015,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
HardwareCommandsHelper<FamilyType>::sendIndirectState(
HardwareCommandsHelper<FamilyType>::template sendIndirectState<GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
@@ -1014,6 +1031,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
pWalkerCmd,
nullptr,
true,
0,
*pDevice);
bool isMemorySame = memcmp(borderColorPointer, mockDsh, samplerTableOffset) == 0;
@@ -1136,12 +1154,14 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor(), false, gfxCoreHelper);
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
indirectHeap,
kernel,
false,
nullptr,
sizeCrossThreadData);
sizeCrossThreadData,
0);
EXPECT_LE(implicitArgsProgrammingSize, indirectHeap.getUsed());
@@ -1272,40 +1292,46 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
using HardwareCommandsTestXeHpAndLater = HardwareCommandsTest;
// Test: calls sendCrossThreadData on an IndirectHeap wrapping an allocation created from
// default MockAllocationProperties (heap constructed with `false` as its second argument)
// and checks the returned offset against expectedOffset.
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
auto nonInternalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
IndirectHeap indirectHeap(nonInternalAllocation, false);
// Expected value is 0 when is64bit is true, otherwise the heap's GPU base.
auto expectedOffset = is64bit ? 0u : indirectHeap.getHeapGpuBase();
auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
// NOTE(review): two variants of the call head and of its last argument line appear back to
// back (plain call vs. call templated on WALKER_TYPE with a trailing 0) - this looks like
// both sides of a change shown together; confirm which variant is meant to remain.
auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
auto offset = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
indirectHeap,
*mockKernelWithInternal->mockKernel,
false,
nullptr,
sizeCrossThreadData);
sizeCrossThreadData,
0);
EXPECT_EQ(expectedOffset, offset);
// Free the allocation obtained at the top of the test.
pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
}
// Test: calls sendCrossThreadData on an IndirectHeap wrapping an allocation requested with
// AllocationType::INTERNAL_HEAP (heap constructed with `true` as its second argument)
// and checks the returned offset against expectedOffset.
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeapAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenHeapBaseOffsetIsReturned) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
auto internalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties(pDevice->getRootDeviceIndex(), true, MemoryConstants::pageSize, AllocationType::INTERNAL_HEAP, pDevice->getDeviceBitfield()));
IndirectHeap indirectHeap(internalAllocation, true);
// Expected value is the allocation's GPU address to patch when is64bit is true, otherwise 0.
auto expectedOffset = is64bit ? internalAllocation->getGpuAddressToPatch() : 0u;
auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
// NOTE(review): two variants of the call head and of its last argument line appear back to
// back (plain call vs. call templated on WALKER_TYPE with a trailing 0) - this looks like
// both sides of a change shown together; confirm which variant is meant to remain.
auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
auto offset = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
indirectHeap,
*mockKernelWithInternal->mockKernel,
false,
nullptr,
sizeCrossThreadData);
sizeCrossThreadData,
0);
EXPECT_EQ(expectedOffset, offset);
// Free the allocation obtained at the top of the test.
pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsSetThenAddPatchInfoDataOffsetsAreMoved) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
DebugManagerStateRestore dbgRestore;
DebugManager.flags.AddPatchInfoCommentsForAUBDump.set(true);
@@ -1327,12 +1353,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenSendCrossThr
kernel->getPatchInfoDataList().push_back(patchInfoData1);
kernel->getPatchInfoDataList().push_back(patchInfoData2);
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::template sendCrossThreadData<WALKER_TYPE>(
indirectHeap,
*kernel,
false,
nullptr,
sizeCrossThreadData);
sizeCrossThreadData,
0);
auto expectedOffsetRelativeToIohBase = 128u;
auto iohBaseAddress = is64bit ? 0u : indirectHeap.getHeapGpuBase();

View File

@@ -267,7 +267,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWhenDispat
device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.currentTimestampPacketNodes = &timestampPacket;
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<GPGPU_WALKER>(
*mockCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -306,7 +306,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketDisabledWh
device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.currentTimestampPacketNodes = &timestampPacket;
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<typename FamilyType::WALKER_TYPE>(
*mockCmdQ,
multiDispatchInfo,
CsrDependencies(),
@@ -1401,7 +1401,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenDispatchingTh
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.currentTimestampPacketNodes = &timestamp7;
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<WALKER>(
*mockCmdQ,
multiDispatchInfo,
csrDeps,
@@ -1475,7 +1475,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledOnDifferentCSRsFr
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.currentTimestampPacketNodes = &timestamp7;
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<WALKER>(
*mockCmdQ,
multiDispatchInfo,
csrDeps,
@@ -1534,7 +1534,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndDependenciesRe
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<WALKER>(
*mockCmdQ,
multiDispatchInfo,
csrDeps,

View File

@@ -58,8 +58,8 @@ static uint32_t slmSizeInKb[] = {1, 4, 8, 16, 32, 64};
HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgrammingSlmThenProgrammingIsCorrect) {
ASSERT_NE(nullptr, pClDevice);
CommandQueueHw<FamilyType> cmdQ(nullptr, pClDevice, 0, false);
typedef typename FamilyType::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA;
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
WALKER_TYPE walkerCmd{};
// define kernel info
kernelInfo.kernelDescriptor.kernelAttributes.barrierCount = 1;
@@ -74,7 +74,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgr
const uint32_t threadGroupCount = 1u;
uint64_t interfaceDescriptorOffset = indirectHeap.getUsed();
size_t offsetInterfaceDescriptorData = HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
size_t offsetInterfaceDescriptorData = HardwareCommandsHelper<FamilyType>::template sendInterfaceDescriptorData<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>(
indirectHeap,
interfaceDescriptorOffset,
0,
@@ -88,9 +88,9 @@ HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgr
kernel,
4u,
pDevice->getPreemptionMode(),
nullptr,
*pDevice,
&walkerCmd);
&walkerCmd,
nullptr);
// add the heap base + offset
uint32_t *pIdData = (uint32_t *)indirectHeap.getCpuBase() + offsetInterfaceDescriptorData;
@@ -149,6 +149,7 @@ INSTANTIATE_TEST_CASE_P(
HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverrideSlmAllocationSizeIsSetThenSlmSizeIsOverwritten) {
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
WALKER_TYPE walkerCmd{};
uint32_t expectedSlmSize = 5;
DebugManagerStateRestore dbgRestore;
@@ -166,7 +167,7 @@ HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverride
uint64_t interfaceDescriptorOffset = indirectHeap.getUsed();
INTERFACE_DESCRIPTOR_DATA interfaceDescriptorData;
HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
HardwareCommandsHelper<FamilyType>::template sendInterfaceDescriptorData<WALKER_TYPE, INTERFACE_DESCRIPTOR_DATA>(
indirectHeap,
interfaceDescriptorOffset,
0,
@@ -180,9 +181,9 @@ HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverride
kernel,
4u,
pDevice->getPreemptionMode(),
&interfaceDescriptorData,
*pDevice,
&walkerCmd);
&walkerCmd,
&interfaceDescriptorData);
auto pInterfaceDescriptor = HardwareCommandsHelper<FamilyType>::getInterfaceDescriptor(indirectHeap, interfaceDescriptorOffset, &interfaceDescriptorData);

View File

@@ -36,6 +36,7 @@ class MockCommandQueue : public CommandQueue {
using CommandQueue::device;
using CommandQueue::gpgpuEngine;
using CommandQueue::h2dEngines;
using CommandQueue::heaplessModeEnabled;
using CommandQueue::isCopyOnly;
using CommandQueue::isTextureCacheFlushNeeded;
using CommandQueue::migrateMultiGraphicsAllocationsIfRequired;

View File

@@ -195,7 +195,7 @@ XE_HPC_CORETEST_F(SystemMemoryFenceViaComputeWalkerTest, givenSystemMemoryFenceG
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
walkerArgs.currentTimestampPacketNodes = &timestampPacket;
HardwareInterface<FamilyType>::dispatchWalker(
HardwareInterface<FamilyType>::template dispatchWalker<COMPUTE_WALKER>(
commandQueue,
multiDispatchInfo,
CsrDependencies(),

View File

@@ -70,8 +70,8 @@ XE_HPC_CORETEST_F(MemoryPrefetchTestsXeHpcCore, givenKernelWhenWalkerIsProgramme
HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(workSize, wgInfo, PreemptionMode::Disabled);
mockKernel->kernelInfo.heapInfo.kernelHeapSize = 1;
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareParse hwParse;
hwParse.parseCommands<FamilyType>(commandStream, 0);
@@ -121,8 +121,8 @@ XE_HPC_CORETEST_F(ProgramWalkerTestsXeHpcCore, givenProperThreadGroupSizesWhenWa
hwInfo->platform.usRevId = productHelper.getHwRevIdFromStepping(REVISION_A0, *hwInfo);
{
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareParse hwParse;
hwParse.parseCommands<FamilyType>(commandStream, 0);
auto itorWalker = find<COMPUTE_WALKER *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
@@ -152,8 +152,8 @@ XE_HPC_CORETEST_F(ProgramWalkerTestsXeHpcCore, givenDebugVariableSetWhenProgramm
{
// default
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareParse hwParse;
hwParse.parseCommands<FamilyType>(commandStream, 0);
@@ -171,8 +171,8 @@ XE_HPC_CORETEST_F(ProgramWalkerTestsXeHpcCore, givenDebugVariableSetWhenProgramm
commandsOffset = commandStream.getUsed();
DebugManager.flags.ForceL3PrefetchForComputeWalker.set(1);
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareParse hwParse;
hwParse.parseCommands<FamilyType>(commandStream, commandsOffset);
@@ -190,8 +190,8 @@ XE_HPC_CORETEST_F(ProgramWalkerTestsXeHpcCore, givenDebugVariableSetWhenProgramm
commandsOffset = commandStream.getUsed();
DebugManager.flags.ForceL3PrefetchForComputeWalker.set(0);
HardwareInterface<FamilyType>::programWalker(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareInterface<FamilyType>::template programWalker<COMPUTE_WALKER>(commandStream, *mockKernel->mockKernel, *commandQueue,
heap, heap, heap, dispatchInfo, walkerArgs);
HardwareParse hwParse;
hwParse.parseCommands<FamilyType>(commandStream, commandsOffset);

View File

@@ -96,12 +96,15 @@ struct EncodeDispatchKernel {
static void encode(CommandContainer &container, EncodeDispatchKernelArgs &args);
static void encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template <typename WalkerType>
static void encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
static void appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment,
template <typename InterfaceDescriptorType>
static void appendAdditionalIDDFields(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment,
const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
static void setGrfInfo(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData,
template <typename InterfaceDescriptorType>
static void setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData,
const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
static void *getInterfaceDescriptor(CommandContainer &container, IndirectHeap *childDsh, uint32_t &iddOffset);
@@ -129,15 +132,19 @@ struct EncodeDispatchKernel {
uint32_t requiredWorkGroupOrder,
const RootDeviceEnvironment &rootDeviceEnvironment);
static void programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
template <typename InterfaceDescriptorType>
static void programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd);
template <typename WalkerType, typename InterfaceDescriptorType>
static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
static void adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
template <typename WalkerType>
static void adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo);
static void setupPostSyncMocs(WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
template <typename WalkerType>
static void setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
static void adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
@@ -326,6 +333,7 @@ struct EncodeStateBaseAddressArgs {
bool multiOsContextCapable = false;
bool isRcs = false;
bool doubleSbaWa = false;
bool heaplessModeEnabled = false;
};
template <typename GfxFamily>

View File

@@ -574,7 +574,8 @@ bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDes
}
// adjustTimestampPacket: empty-bodied definition - neither walkerCmd nor hwInfo is read or
// modified here.
// NOTE(review): two signatures are listed one after the other (WALKER_TYPE parameter vs. a
// member template on WalkerType); this looks like both sides of a change - confirm which
// one is meant to remain.
template <typename Family>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
template <typename WalkerType>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo) {}
template <typename Family>
void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr) {
@@ -716,7 +717,8 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
}
// adjustInterfaceDescriptorData: empty-bodied definition - the interface descriptor, device,
// hardware info, thread group count, GRF count and walker command are accepted but unused.
// NOTE(review): two signatures are listed one after the other (fixed INTERFACE_DESCRIPTOR_DATA /
// WALKER_TYPE parameters vs. a member template on WalkerType and InterfaceDescriptorType);
// this looks like both sides of a change - confirm which one is meant to remain.
template <typename Family>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd) {}
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {}
template <typename Family>
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {

View File

@@ -29,7 +29,8 @@
namespace NEO {
template <typename Family>
void EncodeDispatchKernel<Family>::setGrfInfo(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf,
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
const RootDeviceEnvironment &rootDeviceEnvironment) {
auto grfSize = sizeof(typename Family::GRF);
@@ -92,7 +93,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
EncodeDispatchKernel<Family>::programBarrierEnable(idd,
kernelDescriptor.kernelAttributes.barrierCount,
hwInfo);
auto slmSize = static_cast<typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE>(
auto slmSize = static_cast<uint32_t>(
gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()));
idd.setSharedLocalMemorySize(slmSize);
@@ -239,7 +240,9 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
false, // useGlobalAtomics
false, // multiOsContextCapable
args.isRcs, // isRcs
container.doubleSbaWaRef()}; // doubleSbaWa
container.doubleSbaWaRef(), // doubleSbaWa
false, // heaplessModeEnabled
};
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
container.setDirtyStateForAllHeaps(false);
args.requiresUncachedMocs = false;
@@ -394,17 +397,20 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
}
// programBarrierEnable: forwards `value` to interfaceDescriptor.setBarrierEnable().
// hwInfo is part of the signature but is not used in this body.
// NOTE(review): two signature heads are listed one after the other (INTERFACE_DESCRIPTOR_DATA
// parameter vs. a member template on InterfaceDescriptorType); this looks like both sides of
// a change - confirm which one is meant to remain.
template <typename Family>
void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor,
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor,
uint32_t value,
const HardwareInfo &hwInfo) {
interfaceDescriptor.setBarrierEnable(value);
}
// encodeAdditionalWalkerFields: empty-bodied inline definition - rootDeviceEnvironment,
// walkerCmd and walkerArgs are accepted but unused here.
// NOTE(review): two signatures are listed one after the other (WALKER_TYPE parameter vs. a
// member template on WalkerType); this looks like both sides of a change - confirm which
// one is meant to remain.
template <typename Family>
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
template <typename WalkerType>
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
// appendAdditionalIDDFields: empty-bodied definition - the descriptor pointer, root device
// environment, threads-per-thread-group, SLM total size and SLM policy are accepted but unused.
// NOTE(review): two signatures are listed one after the other (INTERFACE_DESCRIPTOR_DATA pointer
// vs. a member template on InterfaceDescriptorType); this looks like both sides of a change -
// confirm which one is meant to remain.
template <typename Family>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}
template <typename Family>
inline bool EncodeDispatchKernel<Family>::isDshNeeded(const DeviceInfo &deviceInfo) {
@@ -592,7 +598,8 @@ inline void EncodeMiArbCheck<Family>::adjust(MI_ARB_CHECK &miArbCheck, std::opti
}
// setupPostSyncMocs: empty-bodied definition - walkerCmd, rootDeviceEnvironment and dcFlush
// are accepted but unused here.
// NOTE(review): two signatures are listed one after the other (WALKER_TYPE parameter vs. a
// member template on WalkerType); this looks like both sides of a change - confirm which
// one is meant to remain.
template <typename Family>
void EncodeDispatchKernel<Family>::setupPostSyncMocs(WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {}
template <typename WalkerType>
void EncodeDispatchKernel<Family>::setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {}
// adjustWalkOrder: definition with an empty body. The walker command, the requested
// work-group order and the root device environment are accepted but not used here.
template <typename Family>
void EncodeDispatchKernel<Family>::adjustWalkOrder(
    WALKER_TYPE &walkerCmd,
    uint32_t requiredWorkGroupOrder,
    const RootDeviceEnvironment &rootDeviceEnvironment) {
    // No work is done in this definition.
}

View File

@@ -37,7 +37,8 @@ constexpr size_t TimestampDestinationAddressAlignment = 16;
constexpr size_t ImmWriteDestinationAddressAlignment = 8;
// setGrfInfo: definition with an empty body - the descriptor pointer, GRF count, cross-thread
// and per-thread data sizes and the root device environment are accepted but unused here.
// NOTE(review): two signature heads are listed one after the other (INTERFACE_DESCRIPTOR_DATA
// pointer vs. a member template on InterfaceDescriptorType); this looks like both sides of a
// change - confirm which one is meant to remain.
template <typename Family>
void EncodeDispatchKernel<Family>::setGrfInfo(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf,
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t numGrf,
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
const RootDeviceEnvironment &rootDeviceEnvironment) {
}
@@ -77,9 +78,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData,
sizePerThreadData, rootDeviceEnvironment);
auto &productHelper = args.device->getProductHelper();
productHelper.updateIddCommand(&idd, kernelDescriptor.kernelAttributes.numGrfRequired,
kernelDescriptor.kernelAttributes.threadArbitrationPolicy);
bool localIdsGenerationByRuntime = args.dispatchInterface->requiresGenerationOfLocalIdsByRuntime();
auto requiredWorkgroupOrder = args.dispatchInterface->getRequiredWorkgroupOrder();
@@ -105,17 +103,18 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
hwInfo);
auto &gfxCoreHelper = args.device->getGfxCoreHelper();
auto slmSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(
auto slmSize = static_cast<uint32_t>(
gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()));
if (DebugManager.flags.OverrideSlmAllocationSize.get() != -1) {
slmSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(DebugManager.flags.OverrideSlmAllocationSize.get());
slmSize = static_cast<uint32_t>(DebugManager.flags.OverrideSlmAllocationSize.get());
}
idd.setSharedLocalMemorySize(slmSize);
auto bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
bool skipSshProgramming = false;
auto &productHelper = args.device->getProductHelper();
if (productHelper.isSkippingStatefulInformationRequired(kernelDescriptor)) {
bindingTableStateCount = 0u;
skipSshProgramming = true;
@@ -272,7 +271,9 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
args.useGlobalAtomics, // useGlobalAtomics
args.partitionCount > 1, // multiOsContextCapable
args.isRcs, // isRcs
container.doubleSbaWaRef()}; // doubleSbaWa
container.doubleSbaWaRef(), // doubleSbaWa
false, // heaplessModeEnabled
};
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
container.setDirtyStateForAllHeaps(false);
}
@@ -392,7 +393,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
}
template <typename Family>
inline void EncodeDispatchKernel<Family>::setupPostSyncMocs(WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {
template <typename WalkerType>
inline void EncodeDispatchKernel<Family>::setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {
auto &postSyncData = walkerCmd.getPostSync();
auto gmmHelper = rootDeviceEnvironment.getGmmHelper();

View File

@@ -40,14 +40,16 @@ template <typename GfxFamily>
struct ImplicitScalingDispatch {
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
template <typename WalkerType>
static size_t getSize(bool apiSelfCleanup,
bool preferStaticPartitioning,
const DeviceBitfield &devices,
const Vec3<size_t> &groupStart,
const Vec3<size_t> &groupCount);
template <typename WalkerType>
static void dispatchCommands(LinearStream &commandStream,
WALKER_TYPE &walkerCmd,
WalkerType &walkerCmd,
void **outWalkerPtr,
const DeviceBitfield &devices,
uint32_t &partitionCount,

View File

@@ -56,22 +56,23 @@ WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPar
}
template <typename GfxFamily>
template <typename WalkerType>
size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool apiSelfCleanup,
bool preferStaticPartitioning,
const DeviceBitfield &devices,
const Vec3<size_t> &groupStart,
const Vec3<size_t> &groupCount) {
typename GfxFamily::COMPUTE_WALKER::PARTITION_TYPE partitionType{};
typename WalkerType::PARTITION_TYPE partitionType{};
bool staticPartitioning = false;
const uint32_t tileCount = static_cast<uint32_t>(devices.count());
const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily>(tileCount,
preferStaticPartitioning,
groupStart,
groupCount,
{},
&partitionType,
&staticPartitioning);
const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily, WalkerType>(tileCount,
preferStaticPartitioning,
groupStart,
groupCount,
{},
&partitionType,
&staticPartitioning);
UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));
WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(0u,
tileCount,
@@ -87,8 +88,9 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool apiSelfCleanup,
}
template <typename GfxFamily>
template <typename WalkerType>
void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandStream,
WALKER_TYPE &walkerCmd,
WalkerType &walkerCmd,
void **outWalkerPtr,
const DeviceBitfield &devices,
uint32_t &partitionCount,
@@ -104,7 +106,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
const bool preferStaticPartitioning = workPartitionAllocationGpuVa != 0u;
bool staticPartitioning = false;
partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily, WalkerType>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(workPartitionAllocationGpuVa,
tileCount,
@@ -116,35 +118,35 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
dcFlush,
forceExecutionOnSingleTile);
auto dispatchCommandsSize = getSize(apiSelfCleanup, preferStaticPartitioning, devices, {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()}, {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
auto dispatchCommandsSize = getSize<WalkerType>(apiSelfCleanup, preferStaticPartitioning, devices, {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()}, {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
void *commandBuffer = commandStream.getSpace(dispatchCommandsSize);
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - dispatchCommandsSize;
if (staticPartitioning) {
UNRECOVERABLE_IF(tileCount != partitionCount);
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
outWalkerPtr,
cmdBufferGpuAddress,
&walkerCmd,
totalProgrammedSize,
args,
hwInfo);
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily, WalkerType>(commandBuffer,
outWalkerPtr,
cmdBufferGpuAddress,
&walkerCmd,
totalProgrammedSize,
args,
hwInfo);
} else {
if (DebugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
partitionCount = DebugManager.flags.ExperimentalSetWalkerPartitionCount.get();
if (partitionCount == 1u) {
walkerCmd.setPartitionType(GfxFamily::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
walkerCmd.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
}
args.partitionCount = partitionCount;
}
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
outWalkerPtr,
cmdBufferGpuAddress,
&walkerCmd,
totalProgrammedSize,
args,
hwInfo);
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily, WalkerType>(commandBuffer,
outWalkerPtr,
cmdBufferGpuAddress,
&walkerCmd,
totalProgrammedSize,
args,
hwInfo);
}
UNRECOVERABLE_IF(totalProgrammedSize != dispatchCommandsSize);
}

View File

@@ -71,17 +71,19 @@ inline void *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed, siz
return commandToReturn;
}
template <typename GfxFamily>
template <typename GfxFamily, typename WalkerType>
uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
bool preferStaticPartitioning,
const Vec3<size_t> &groupStart,
const Vec3<size_t> &groupCount,
std::optional<typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE> requestedPartitionType,
typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE *outSelectedPartitionType,
std::optional<typename WalkerType::PARTITION_TYPE> requestedPartitionType,
typename WalkerType::PARTITION_TYPE *outSelectedPartitionType,
bool *outSelectStaticPartitioning) {
using PARTITION_TYPE = typename WalkerType::PARTITION_TYPE;
// For non uniform starting point, there is no support for partition in Hardware. Disable partitioning and select dynamic algorithm
if (groupStart.x || groupStart.y || groupStart.z) {
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_DISABLED;
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
*outSelectStaticPartitioning = false;
return 1u;
}
@@ -90,18 +92,18 @@ uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitio
bool disablePartitionForPartitionCountOne{};
if (NEO::DebugManager.flags.ExperimentalSetWalkerPartitionType.get() != -1) {
requestedPartitionType = static_cast<typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE>(NEO::DebugManager.flags.ExperimentalSetWalkerPartitionType.get());
requestedPartitionType = static_cast<PARTITION_TYPE>(NEO::DebugManager.flags.ExperimentalSetWalkerPartitionType.get());
}
if (requestedPartitionType.has_value()) {
switch (requestedPartitionType.value()) {
case COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X:
case PARTITION_TYPE::PARTITION_TYPE_X:
workgroupCount = groupCount.x;
break;
case COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y:
case PARTITION_TYPE::PARTITION_TYPE_Y:
workgroupCount = groupCount.y;
break;
case COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Z:
case PARTITION_TYPE::PARTITION_TYPE_Z:
workgroupCount = groupCount.z;
break;
default:
@@ -124,11 +126,11 @@ uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitio
// we first try with deepest dimension to see if we can partition there
if (groupCount.z > 1 && (zImbalance <= minimalThreshold)) {
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Z;
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
} else if (groupCount.y > 1 && (yImbalance < minimalThreshold)) {
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y;
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
} else if (groupCount.x % preferredMinimalPartitionCount == 0) {
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X;
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
}
// if we are here then there is no dimension that results in even distribution, choose max dimension to minimize impact
else {
@@ -138,11 +140,11 @@ uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitio
if (goWithMaxAlgorithm) {
// default mode, select greatest dimension
if (maxDimension == groupCount.x) {
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X;
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
} else if (maxDimension == groupCount.y) {
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y;
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
} else {
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Z;
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
}
}
@@ -175,32 +177,35 @@ uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitio
}
if (partitionCount == 1u && disablePartitionForPartitionCountOne) {
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_DISABLED;
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
}
return static_cast<uint32_t>(partitionCount);
}
// Chooses a partition type for the given walker command and returns the resulting partition count.
// Reads the thread-group starting offsets and dimensions from `walker`, delegates the decision to
// computePartitionCountAndPartitionType, and writes the selected partition type back into `walker`.
// preferredMinimalPartitionCount, preferStaticPartitioning and outSelectStaticPartitioning are
// forwarded unchanged to computePartitionCountAndPartitionType.
// usesImages: when true, PARTITION_TYPE_X is passed as the requested partition type; otherwise
// the request is left empty.
template <typename GfxFamily>
uint32_t computePartitionCountAndSetPartitionType(COMPUTE_WALKER<GfxFamily> *walker,
template <typename GfxFamily, typename WalkerType>
uint32_t computePartitionCountAndSetPartitionType(WalkerType *walker,
uint32_t preferredMinimalPartitionCount,
bool preferStaticPartitioning,
bool usesImages,
bool *outSelectStaticPartitioning) {
using PARTITION_TYPE = typename WalkerType::PARTITION_TYPE;
const Vec3<size_t> groupStart = {walker->getThreadGroupIdStartingX(), walker->getThreadGroupIdStartingY(), walker->getThreadGroupIdStartingZ()};
const Vec3<size_t> groupCount = {walker->getThreadGroupIdXDimension(), walker->getThreadGroupIdYDimension(), walker->getThreadGroupIdZDimension()};
std::optional<typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE> requestedPartitionType{};
std::optional<PARTITION_TYPE> requestedPartitionType{};
// Kernels that use images get an explicit request for partitioning along X.
if (usesImages) {
requestedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X;
requestedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
}
typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE partitionType{};
const auto partitionCount = computePartitionCountAndPartitionType<GfxFamily>(preferredMinimalPartitionCount,
preferStaticPartitioning,
groupStart,
groupCount,
requestedPartitionType,
&partitionType,
outSelectStaticPartitioning);
PARTITION_TYPE partitionType{};
const auto partitionCount = computePartitionCountAndPartitionType<GfxFamily, WalkerType>(preferredMinimalPartitionCount,
preferStaticPartitioning,
groupStart,
groupCount,
requestedPartitionType,
&partitionType,
outSelectStaticPartitioning);
// Store the partition type selected by the helper in the walker command itself.
walker->setPartitionType(partitionType);
return partitionCount;
}
@@ -426,10 +431,10 @@ void programSelfCleanupEndSection(void *&inputAddress,
programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * args.tileCount);
}
template <typename GfxFamily>
template <typename GfxFamily, typename WalkerType>
void programTilesSynchronizationWithPostSyncs(void *&currentBatchBufferPointer,
uint32_t &totalBytesProgrammed,
COMPUTE_WALKER<GfxFamily> *inputWalker,
WalkerType *inputWalker,
uint32_t partitionCount) {
const auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
@@ -472,13 +477,13 @@ uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
computeWalkerSectionSize<GfxFamily>();
}
template <typename GfxFamily>
template <typename GfxFamily, typename WalkerType>
void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed,
COMPUTE_WALKER<GfxFamily> *inputWalker,
WalkerType *inputWalker,
uint32_t partitionCount,
bool forceExecutionOnSingleTile) {
auto computeWalker = putCommand<COMPUTE_WALKER<GfxFamily>>(inputAddress, totalBytesProgrammed);
COMPUTE_WALKER<GfxFamily> cmd = *inputWalker;
auto computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);
WalkerType cmd = *inputWalker;
if (partitionCount > 1) {
auto partitionType = inputWalker->getPartitionType();
@@ -486,14 +491,14 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
assert(inputWalker->getThreadGroupIdStartingX() == 0u);
assert(inputWalker->getThreadGroupIdStartingY() == 0u);
assert(inputWalker->getThreadGroupIdStartingZ() == 0u);
assert(partitionType != COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
assert(partitionType != WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
cmd.setWorkloadPartitionEnable(true);
auto workgroupCount = 0u;
if (partitionType == COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X) {
if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_X) {
workgroupCount = inputWalker->getThreadGroupIdXDimension();
} else if (partitionType == COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y) {
} else if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y) {
workgroupCount = inputWalker->getThreadGroupIdYDimension();
} else {
workgroupCount = inputWalker->getThreadGroupIdZDimension();
@@ -540,11 +545,11 @@ void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgramm
32. BATCH_BUFFER_END ( optional )
*/
template <typename GfxFamily>
template <typename GfxFamily, typename WalkerType>
void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
void **outWalkerPtr,
uint64_t gpuAddressOfAllocation,
COMPUTE_WALKER<GfxFamily> *inputWalker,
WalkerType *inputWalker,
uint32_t &totalBytesProgrammed,
WalkerPartitionArgs &args,
const NEO::HardwareInfo &hwInfo) {
@@ -617,7 +622,7 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
args.secondaryBatchBuffer);
// Walker section
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.forceExecutionOnSingleTile);
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.forceExecutionOnSingleTile);
if (outWalkerPtr) {
*outWalkerPtr = walkerPtr;
}
@@ -686,11 +691,11 @@ uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args
bbStartSize;
}
template <typename GfxFamily>
template <typename GfxFamily, typename WalkerType>
void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
void **outWalkerPtr,
uint64_t gpuAddressOfAllocation,
COMPUTE_WALKER<GfxFamily> *inputWalker,
WalkerType *inputWalker,
uint32_t &totalBytesProgrammed,
WalkerPartitionArgs &args,
const NEO::HardwareInfo &hwInfo) {
@@ -730,7 +735,7 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
// Synchronize tiles after walker
if (args.semaphoreProgrammingRequired) {
programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
programTilesSynchronizationWithPostSyncs<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
}
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {

View File

@@ -25,6 +25,7 @@
#include "shared/source/gmm_helper/page_table_mngr.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/helpers/flat_batch_buffer_helper.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/gfx_core_helper.h"
@@ -96,6 +97,9 @@ CommandStreamReceiver::CommandStreamReceiver(ExecutionEnvironment &executionEnvi
this->l1CachePolicyData.init(productHelper);
registeredClients.reserve(16);
auto &compilerProductHelper = rootDeviceEnvironment.getHelper<CompilerProductHelper>();
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
}
CommandStreamReceiver::~CommandStreamReceiver() {

View File

@@ -573,6 +573,7 @@ class CommandStreamReceiver {
volatile bool resourcesInitialized = false;
bool doubleSbaWa = false;
bool dshSupported = false;
bool heaplessModeEnabled = false;
};
typedef CommandStreamReceiver *(*CommandStreamReceiverCreateFunc)(bool withAubDump,

View File

@@ -20,8 +20,9 @@ struct PipeControlArgs;
template <typename GfxFamily>
class CommandStreamReceiverHw : public CommandStreamReceiver {
typedef typename GfxFamily::MI_BATCH_BUFFER_START MI_BATCH_BUFFER_START;
typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using STATE_BASE_ADDRESS = typename GfxFamily::STATE_BASE_ADDRESS;
struct ImmediateFlushData {
PipelineSelectArgs pipelineSelectArgs{};
@@ -176,6 +177,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
}
void dispatchRayTracingStateCommand(LinearStream &cmdStream, Device &device);
uint64_t getScratchPatchAddress();
protected:
void programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags);
@@ -202,7 +204,6 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
void addPipeControlBefore3dState(LinearStream &commandStream, DispatchFlags &dispatchFlags);
bool are4GbHeapsAvailable() const;
uint64_t getScratchPatchAddress();
void createScratchSpaceController();
bool detectInitProgrammingFlagsRequired(const DispatchFlags &dispatchFlags) const;

View File

@@ -27,6 +27,7 @@
#include "shared/source/gmm_helper/page_table_mngr.h"
#include "shared/source/helpers/blit_commands_helper.h"
#include "shared/source/helpers/blit_properties.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/helpers/definitions/command_encoder_args.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/flat_batch_buffer_helper_hw.h"
@@ -505,7 +506,9 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
dispatchRayTracingStateCommand(commandStreamCSR, device);
}
programVFEState(commandStreamCSR, dispatchFlags, device.getDeviceInfo().maxFrontEndThreads);
if (this->heaplessModeEnabled == false) {
programVFEState(commandStreamCSR, dispatchFlags, device.getDeviceInfo().maxFrontEndThreads);
}
programPreemption(commandStreamCSR, dispatchFlags);

View File

@@ -78,8 +78,8 @@ class PreemptionHelper {
static PreemptionMode getDefaultPreemptionMode(const HardwareInfo &hwInfo);
template <typename GfxFamily>
static void programInterfaceDescriptorDataPreemption(INTERFACE_DESCRIPTOR_DATA<GfxFamily> *idd, PreemptionMode preemptionMode);
template <typename GfxFamily, typename InterfaceDescriptorType>
static void programInterfaceDescriptorDataPreemption(InterfaceDescriptorType *idd, PreemptionMode preemptionMode);
protected:
template <typename GfxFamily>

View File

@@ -118,8 +118,8 @@ template <typename GfxFamily>
void PreemptionHelper::applyPreemptionWaCmdsEnd(LinearStream *pCommandStream, const Device &device) {
}
template <typename GfxFamily>
void PreemptionHelper::programInterfaceDescriptorDataPreemption(INTERFACE_DESCRIPTOR_DATA<GfxFamily> *idd, PreemptionMode preemptionMode) {
template <typename GfxFamily, typename InterfaceDescriptorType>
void PreemptionHelper::programInterfaceDescriptorDataPreemption(InterfaceDescriptorType *idd, PreemptionMode preemptionMode) {
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
if (preemptionMode == PreemptionMode::MidThread) {
idd->setThreadPreemptionDisable(INTERFACE_DESCRIPTOR_DATA::THREAD_PREEMPTION_DISABLE_DISABLE);

View File

@@ -72,6 +72,14 @@ void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, Sta
}
template struct EncodeDispatchKernel<Family>;
// Explicit instantiations of the EncodeDispatchKernel<Family> member templates for this family's
// WALKER_TYPE and INTERFACE_DESCRIPTOR_DATA.
// NOTE(review): presumably needed because the member-template definitions are not visible to
// other translation units that call them — confirm against the build layout.
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
template void EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
template struct EncodeStates<Family>;
template struct EncodeMath<Family>;
template struct EncodeMathMMIO<Family>;

View File

@@ -136,5 +136,25 @@ struct Gen11Family : public Gen11 {
// Returns true only when cmdSetBaseFamily equals IGFX_GEN8_CORE.
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
return cmdSetBaseFamily == IGFX_GEN8_CORE;
}
// Returns sizeof(INTERFACE_DESCRIPTOR_DATA). The WalkerType template parameter (defaulting to
// WALKER_TYPE) is not used by the body, so the result is the same for every WalkerType.
template <typename WalkerType = WALKER_TYPE>
static constexpr size_t getInterfaceDescriptorSize() {
return sizeof(INTERFACE_DESCRIPTOR_DATA);
}
// Returns cmdInitGpgpuWalker as a WalkerType value (WalkerType defaults to WALKER_TYPE).
template <typename WalkerType = WALKER_TYPE>
static WalkerType getInitGpuWalker() {
return cmdInitGpgpuWalker;
}
// Returns cmdInitInterfaceDescriptorData as an InterfaceDescriptorType value.
// InterfaceDescriptorType has no default and must be supplied explicitly by the caller.
template <typename InterfaceDescriptorType>
static InterfaceDescriptorType getInitInterfaceDescriptor() {
return cmdInitInterfaceDescriptorData;
}
// Always returns false for this family; the WalkerType template parameter is not consulted.
template <typename WalkerType>
static constexpr bool isHeaplessMode() {
return false;
}
};
} // namespace NEO

View File

@@ -112,6 +112,14 @@ void EncodeComputeMode<Family>::adjustPipelineSelect(CommandContainer &container
}
template struct EncodeDispatchKernel<Family>;
// Explicit instantiations of the EncodeDispatchKernel<Family> member templates for this family's
// WALKER_TYPE and INTERFACE_DESCRIPTOR_DATA.
// NOTE(review): presumably needed because the member-template definitions are not visible to
// other translation units that call them — confirm against the build layout.
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
template void EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
template struct EncodeStates<Family>;
template struct EncodeMath<Family>;
template struct EncodeMathMMIO<Family>;

View File

@@ -137,6 +137,21 @@ struct Gen12LpFamily : public Gen12Lp {
// Returns true only when cmdSetBaseFamily equals IGFX_GEN8_CORE.
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
return cmdSetBaseFamily == IGFX_GEN8_CORE;
}
// Returns sizeof(INTERFACE_DESCRIPTOR_DATA). The WalkerType template parameter is not used by
// the body; it has no default here, so callers must name it explicitly.
template <typename WalkerType>
static constexpr size_t getInterfaceDescriptorSize() {
return sizeof(INTERFACE_DESCRIPTOR_DATA);
}
// Returns cmdInitInterfaceDescriptorData as an InterfaceDescriptorType value.
// InterfaceDescriptorType has no default and must be supplied explicitly by the caller.
template <typename InterfaceDescriptorType>
static InterfaceDescriptorType getInitInterfaceDescriptor() {
return cmdInitInterfaceDescriptorData;
}
// Always returns false for this family; the WalkerType template parameter is not consulted.
template <typename WalkerType>
static constexpr bool isHeaplessMode() {
return false;
}
};
} // namespace NEO

View File

@@ -52,6 +52,14 @@ void EncodeStateBaseAddress<Family>::setSbaAddressesForDebugger(NEO::Debugger::S
}
template struct EncodeDispatchKernel<Family>;
// Explicit instantiations of the EncodeDispatchKernel<Family> member templates for this family's
// WALKER_TYPE and INTERFACE_DESCRIPTOR_DATA.
// NOTE(review): presumably needed because the member-template definitions are not visible to
// other translation units that call them — confirm against the build layout.
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
template void EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
template struct EncodeStates<Family>;
template struct EncodeMath<Family>;
template struct EncodeMathMMIO<Family>;

View File

@@ -136,6 +136,26 @@ struct Gen8Family : public Gen8 {
// Returns true only when cmdSetBaseFamily equals IGFX_GEN8_CORE.
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
return cmdSetBaseFamily == IGFX_GEN8_CORE;
}
// Returns sizeof(INTERFACE_DESCRIPTOR_DATA). The WalkerType template parameter (defaulting to
// WALKER_TYPE) is not used by the body, so the result is the same for every WalkerType.
template <typename WalkerType = WALKER_TYPE>
static constexpr size_t getInterfaceDescriptorSize() {
return sizeof(INTERFACE_DESCRIPTOR_DATA);
}
// Returns cmdInitGpgpuWalker as a WalkerType value (WalkerType defaults to WALKER_TYPE).
template <typename WalkerType = WALKER_TYPE>
static WalkerType getInitGpuWalker() {
return cmdInitGpgpuWalker;
}
// Returns cmdInitInterfaceDescriptorData as an InterfaceDescriptorType value.
// InterfaceDescriptorType has no default and must be supplied explicitly by the caller.
template <typename InterfaceDescriptorType>
static InterfaceDescriptorType getInitInterfaceDescriptor() {
return cmdInitInterfaceDescriptorData;
}
// Always returns false for this family; the WalkerType template parameter is not consulted.
template <typename WalkerType>
static constexpr bool isHeaplessMode() {
return false;
}
};
} // namespace NEO

View File

@@ -57,6 +57,14 @@ void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, Sta
}
template struct EncodeDispatchKernel<Family>;
// Explicit instantiations of the EncodeDispatchKernel<Family> member templates for this family's
// WALKER_TYPE and INTERFACE_DESCRIPTOR_DATA.
// NOTE(review): presumably needed because the member-template definitions are not visible to
// other translation units that call them — confirm against the build layout.
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
template void EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
template struct EncodeStates<Family>;
template struct EncodeMath<Family>;
template struct EncodeMathMMIO<Family>;

View File

@@ -136,6 +136,26 @@ struct Gen9Family : public Gen9 {
// Returns true only when cmdSetBaseFamily equals IGFX_GEN8_CORE.
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
return cmdSetBaseFamily == IGFX_GEN8_CORE;
}
// Returns sizeof(INTERFACE_DESCRIPTOR_DATA). The WalkerType template parameter (defaulting to
// WALKER_TYPE) is not used by the body, so the result is the same for every WalkerType.
template <typename WalkerType = WALKER_TYPE>
static constexpr size_t getInterfaceDescriptorSize() {
return sizeof(INTERFACE_DESCRIPTOR_DATA);
}
// Returns cmdInitGpgpuWalker as a WalkerType value (WalkerType defaults to WALKER_TYPE).
template <typename WalkerType = WALKER_TYPE>
static WalkerType getInitGpuWalker() {
return cmdInitGpgpuWalker;
}
// Returns cmdInitInterfaceDescriptorData as an InterfaceDescriptorType value.
// InterfaceDescriptorType has no default and must be supplied explicitly by the caller.
template <typename InterfaceDescriptorType>
static InterfaceDescriptorType getInitInterfaceDescriptor() {
return cmdInitInterfaceDescriptorData;
}
// Always returns false for this family; the WalkerType template parameter is not consulted.
template <typename WalkerType>
static constexpr bool isHeaplessMode() {
return false;
}
};
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2022 Intel Corporation
* Copyright (C) 2019-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -47,194 +47,6 @@ typedef struct tagBINDING_TABLE_STATE {
}
} BINDING_TABLE_STATE;
STATIC_ASSERT(4 == sizeof(BINDING_TABLE_STATE));
typedef struct tagGPGPU_WALKER {
union tagTheStructure {
struct tagCommon {
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
uint32_t CommandType : BITFIELD_RANGE(29, 31);
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
uint32_t ThreadGroupIdStartingX;
uint32_t Reserved_192;
uint32_t ThreadGroupIdXDimension;
uint32_t ThreadGroupIdStartingY;
uint32_t Reserved_288;
uint32_t ThreadGroupIdYDimension;
uint32_t ThreadGroupIdStartingResumeZ;
uint32_t ThreadGroupIdZDimension;
uint32_t RightExecutionMask;
uint32_t BottomExecutionMask;
} Common;
uint32_t RawData[15];
} TheStructure;
typedef enum tagDWORD_LENGTH {
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
} DWORD_LENGTH;
typedef enum tagSUBOPCODE {
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
} SUBOPCODE;
typedef enum tagMEDIA_COMMAND_OPCODE {
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
} MEDIA_COMMAND_OPCODE;
typedef enum tagPIPELINE {
PIPELINE_MEDIA = 0x2,
} PIPELINE;
typedef enum tagCOMMAND_TYPE {
COMMAND_TYPE_GFXPIPE = 0x3,
} COMMAND_TYPE;
typedef enum tagSIMD_SIZE {
SIMD_SIZE_SIMD8 = 0x0,
SIMD_SIZE_SIMD16 = 0x1,
SIMD_SIZE_SIMD32 = 0x2,
} SIMD_SIZE;
typedef enum tagPATCH_CONSTANTS {
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
} PATCH_CONSTANTS;
inline void init() {
memset(&TheStructure, 0, sizeof(TheStructure));
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
}
static tagGPGPU_WALKER sInit() {
GPGPU_WALKER state;
state.init();
return state;
}
inline uint32_t &getRawData(const uint32_t index) {
DEBUG_BREAK_IF(index >= 15);
return TheStructure.RawData[index];
}
inline void setPredicateEnable(const bool value) {
TheStructure.Common.PredicateEnable = value;
}
inline bool getPredicateEnable() const {
return (TheStructure.Common.PredicateEnable);
}
inline void setIndirectParameterEnable(const bool value) {
TheStructure.Common.IndirectParameterEnable = value;
}
inline bool getIndirectParameterEnable() const {
return (TheStructure.Common.IndirectParameterEnable);
}
inline void setInterfaceDescriptorOffset(const uint32_t value) {
TheStructure.Common.InterfaceDescriptorOffset = value;
}
inline uint32_t getInterfaceDescriptorOffset() const {
return (TheStructure.Common.InterfaceDescriptorOffset);
}
inline void setIndirectDataLength(const uint32_t value) {
TheStructure.Common.IndirectDataLength = value;
}
inline uint32_t getIndirectDataLength() const {
return (TheStructure.Common.IndirectDataLength);
}
typedef enum tagINDIRECTDATASTARTADDRESS {
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
} INDIRECTDATASTARTADDRESS;
inline void setIndirectDataStartAddress(const uint32_t value) {
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
}
inline uint32_t getIndirectDataStartAddress() const {
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
}
inline void setThreadWidthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
}
inline uint32_t getThreadWidthCounterMaximum() const {
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
}
inline void setThreadHeightCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
}
inline uint32_t getThreadHeightCounterMaximum() const {
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
}
inline void setThreadDepthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadDepthCounterMaximum = value;
}
inline uint32_t getThreadDepthCounterMaximum() const {
return (TheStructure.Common.ThreadDepthCounterMaximum);
}
inline void setSimdSize(const SIMD_SIZE value) {
TheStructure.Common.SimdSize = value;
}
inline SIMD_SIZE getSimdSize() const {
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
}
inline void setThreadGroupIdStartingX(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingX = value;
}
inline uint32_t getThreadGroupIdStartingX() const {
return (TheStructure.Common.ThreadGroupIdStartingX);
}
inline void setThreadGroupIdXDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdXDimension = value;
}
inline uint32_t getThreadGroupIdXDimension() const {
return (TheStructure.Common.ThreadGroupIdXDimension);
}
inline void setThreadGroupIdStartingY(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingY = value;
}
inline uint32_t getThreadGroupIdStartingY() const {
return (TheStructure.Common.ThreadGroupIdStartingY);
}
inline void setThreadGroupIdYDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdYDimension = value;
}
inline uint32_t getThreadGroupIdYDimension() const {
return (TheStructure.Common.ThreadGroupIdYDimension);
}
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
}
inline uint32_t getThreadGroupIdStartingResumeZ() const {
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
}
inline void setThreadGroupIdZDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdZDimension = value;
}
inline uint32_t getThreadGroupIdZDimension() const {
return (TheStructure.Common.ThreadGroupIdZDimension);
}
inline void setRightExecutionMask(const uint32_t value) {
TheStructure.Common.RightExecutionMask = value;
}
inline uint32_t getRightExecutionMask() const {
return (TheStructure.Common.RightExecutionMask);
}
inline void setBottomExecutionMask(const uint32_t value) {
TheStructure.Common.BottomExecutionMask = value;
}
inline uint32_t getBottomExecutionMask() const {
return (TheStructure.Common.BottomExecutionMask);
}
} GPGPU_WALKER;
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
union tagTheStructure {
@@ -465,11 +277,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
inline uint32_t getNumberOfThreadsInGpgpuThreadGroup() const {
return (TheStructure.Common.NumberOfThreadsInGpgpuThreadGroup);
}
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
TheStructure.Common.SharedLocalMemorySize = value;
}
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
inline uint32_t getSharedLocalMemorySize() const { // patched
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
}
inline void setBarrierEnable(const uint32_t value) {
TheStructure.Common.BarrierEnable = (value > 0u) ? 1u : 0u;
@@ -492,6 +304,197 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
} INTERFACE_DESCRIPTOR_DATA;
STATIC_ASSERT(32 == sizeof(INTERFACE_DESCRIPTOR_DATA));
// GPGPU_WALKER command packet: 15 DWORDs (60 bytes, enforced by the assert
// below) that can be addressed either through the named fields of
// TheStructure.Common or as raw DWORDs through TheStructure.RawData.
// The order of the fields in tagCommon defines the packet layout and must not
// be changed.
typedef struct tagGPGPU_WALKER {
    union tagTheStructure {
        struct tagCommon {
            // DWORD 0
            uint32_t DwordLength : BITFIELD_RANGE(0, 7);
            uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
            uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
            uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
            uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
            uint32_t Subopcode : BITFIELD_RANGE(16, 23);
            uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
            uint32_t Pipeline : BITFIELD_RANGE(27, 28);
            uint32_t CommandType : BITFIELD_RANGE(29, 31);
            // DWORD 1
            uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
            uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
            // DWORD 2
            uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
            uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
            // DWORD 3
            uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
            uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
            // DWORD 4
            uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
            uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
            uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
            uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
            uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
            uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
            uint32_t SimdSize : BITFIELD_RANGE(30, 31);
            // DWORD 5
            uint32_t ThreadGroupIdStartingX;
            // DWORD 6
            uint32_t Reserved_192;
            // DWORD 7
            uint32_t ThreadGroupIdXDimension;
            // DWORD 8
            uint32_t ThreadGroupIdStartingY;
            // DWORD 9
            uint32_t Reserved_288;
            // DWORD 10
            uint32_t ThreadGroupIdYDimension;
            // DWORD 11
            uint32_t ThreadGroupIdStartingResumeZ;
            // DWORD 12
            uint32_t ThreadGroupIdZDimension;
            // DWORD 13
            uint32_t RightExecutionMask;
            // DWORD 14
            uint32_t BottomExecutionMask;
        } Common;
        uint32_t RawData[15];
    } TheStructure;

    // Fixed values written into the header fields by init().
    typedef enum tagDWORD_LENGTH { DWORD_LENGTH_DWORD_COUNT_N = 0xd } DWORD_LENGTH;
    typedef enum tagSUBOPCODE { SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5 } SUBOPCODE;
    typedef enum tagMEDIA_COMMAND_OPCODE { MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1 } MEDIA_COMMAND_OPCODE;
    typedef enum tagPIPELINE { PIPELINE_MEDIA = 0x2 } PIPELINE;
    typedef enum tagCOMMAND_TYPE { COMMAND_TYPE_GFXPIPE = 0x3 } COMMAND_TYPE;

    // Encodings accepted by setSimdSize() / returned by getSimdSize().
    typedef enum tagSIMD_SIZE {
        SIMD_SIZE_SIMD8 = 0x0,
        SIMD_SIZE_SIMD16 = 0x1,
        SIMD_SIZE_SIMD32 = 0x2
    } SIMD_SIZE;

    // Location of IndirectDataStartAddress inside the packet: byte offset 0xc,
    // i.e. RawData index 3.
    typedef enum tagPATCH_CONSTANTS {
        INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
        INDIRECTDATASTARTADDRESS_INDEX = 0x3
    } PATCH_CONSTANTS;

    // Shift applied by the IndirectDataStartAddress accessors and the
    // matching alignment (0x40 == 1 << 0x6).
    typedef enum tagINDIRECTDATASTARTADDRESS {
        INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
        INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40
    } INDIRECTDATASTARTADDRESS;

    // Clears all 15 DWORDs, then fills in the fixed header fields and selects
    // SIMD8 as the initial SIMD size.
    void init() {
        memset(&TheStructure, 0, sizeof(TheStructure));
        auto &header = TheStructure.Common;
        header.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
        header.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
        header.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
        header.Pipeline = PIPELINE_MEDIA;
        header.CommandType = COMMAND_TYPE_GFXPIPE;
        header.SimdSize = SIMD_SIZE_SIMD8;
    }

    // Returns, by value, a command on which init() has been run.
    static tagGPGPU_WALKER sInit() {
        tagGPGPU_WALKER cmd;
        cmd.init();
        return cmd;
    }

    // Mutable access to one raw DWORD; the index is only checked through
    // DEBUG_BREAK_IF.
    uint32_t &getRawData(const uint32_t index) {
        DEBUG_BREAK_IF(index >= 15);
        return TheStructure.RawData[index];
    }

    // --- DWORD 0 flags ---
    void setPredicateEnable(const bool value) { TheStructure.Common.PredicateEnable = value; }
    bool getPredicateEnable() const { return TheStructure.Common.PredicateEnable; }

    void setIndirectParameterEnable(const bool value) { TheStructure.Common.IndirectParameterEnable = value; }
    bool getIndirectParameterEnable() const { return TheStructure.Common.IndirectParameterEnable; }

    // --- DWORD 1 / DWORD 2 ---
    void setInterfaceDescriptorOffset(const uint32_t value) { TheStructure.Common.InterfaceDescriptorOffset = value; }
    uint32_t getInterfaceDescriptorOffset() const { return TheStructure.Common.InterfaceDescriptorOffset; }

    void setIndirectDataLength(const uint32_t value) { TheStructure.Common.IndirectDataLength = value; }
    uint32_t getIndirectDataLength() const { return TheStructure.Common.IndirectDataLength; }

    // --- DWORD 3 ---
    // The address is stored shifted right by INDIRECTDATASTARTADDRESS_BIT_SHIFT,
    // so the low six bits of `value` are discarded; the getter shifts it back.
    void setIndirectDataStartAddress(const uint32_t value) {
        TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
    }
    uint32_t getIndirectDataStartAddress() const {
        return TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT;
    }

    // --- DWORD 4 ---
    // Width and height are stored as (value - 1) and read back as (field + 1);
    // depth is stored and returned unmodified.
    void setThreadWidthCounterMaximum(const uint32_t value) { TheStructure.Common.ThreadWidthCounterMaximum = value - 1; }
    uint32_t getThreadWidthCounterMaximum() const { return TheStructure.Common.ThreadWidthCounterMaximum + 1; }

    void setThreadHeightCounterMaximum(const uint32_t value) { TheStructure.Common.ThreadHeightCounterMaximum = value - 1; }
    uint32_t getThreadHeightCounterMaximum() const { return TheStructure.Common.ThreadHeightCounterMaximum + 1; }

    void setThreadDepthCounterMaximum(const uint32_t value) { TheStructure.Common.ThreadDepthCounterMaximum = value; }
    uint32_t getThreadDepthCounterMaximum() const { return TheStructure.Common.ThreadDepthCounterMaximum; }

    void setSimdSize(const SIMD_SIZE value) { TheStructure.Common.SimdSize = value; }
    SIMD_SIZE getSimdSize() const { return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize); }

    // --- DWORD 5..12: thread group ids and dimensions (full-DWORD fields) ---
    void setThreadGroupIdStartingX(const uint32_t value) { TheStructure.Common.ThreadGroupIdStartingX = value; }
    uint32_t getThreadGroupIdStartingX() const { return TheStructure.Common.ThreadGroupIdStartingX; }

    void setThreadGroupIdXDimension(const uint32_t value) { TheStructure.Common.ThreadGroupIdXDimension = value; }
    uint32_t getThreadGroupIdXDimension() const { return TheStructure.Common.ThreadGroupIdXDimension; }

    void setThreadGroupIdStartingY(const uint32_t value) { TheStructure.Common.ThreadGroupIdStartingY = value; }
    uint32_t getThreadGroupIdStartingY() const { return TheStructure.Common.ThreadGroupIdStartingY; }

    void setThreadGroupIdYDimension(const uint32_t value) { TheStructure.Common.ThreadGroupIdYDimension = value; }
    uint32_t getThreadGroupIdYDimension() const { return TheStructure.Common.ThreadGroupIdYDimension; }

    void setThreadGroupIdStartingResumeZ(const uint32_t value) { TheStructure.Common.ThreadGroupIdStartingResumeZ = value; }
    uint32_t getThreadGroupIdStartingResumeZ() const { return TheStructure.Common.ThreadGroupIdStartingResumeZ; }

    void setThreadGroupIdZDimension(const uint32_t value) { TheStructure.Common.ThreadGroupIdZDimension = value; }
    uint32_t getThreadGroupIdZDimension() const { return TheStructure.Common.ThreadGroupIdZDimension; }

    // --- DWORD 13 / DWORD 14: execution masks ---
    void setRightExecutionMask(const uint32_t value) { TheStructure.Common.RightExecutionMask = value; }
    uint32_t getRightExecutionMask() const { return TheStructure.Common.RightExecutionMask; }

    void setBottomExecutionMask(const uint32_t value) { TheStructure.Common.BottomExecutionMask = value; }
    uint32_t getBottomExecutionMask() const { return TheStructure.Common.BottomExecutionMask; }

    // Interface descriptor type associated with this walker command.
    using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
} GPGPU_WALKER;
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
union tagTheStructure {
struct tagCommon {

View File

@@ -48,205 +48,6 @@ typedef struct tagBINDING_TABLE_STATE {
} BINDING_TABLE_STATE;
STATIC_ASSERT(4 == sizeof(BINDING_TABLE_STATE));
// GPGPU_WALKER command packet: 15 DWORDs (60 bytes, enforced by the assert
// below) exposed both as named fields (TheStructure.Common) and as raw DWORDs
// (TheStructure.RawData). The field order in tagCommon defines the packet
// layout and must not be changed.
typedef struct tagGPGPU_WALKER {
    union tagTheStructure {
        struct tagCommon {
            // DWORD 0
            uint32_t DwordLength : BITFIELD_RANGE(0, 7);
            uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
            uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
            uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
            uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
            uint32_t Subopcode : BITFIELD_RANGE(16, 23);
            uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
            uint32_t Pipeline : BITFIELD_RANGE(27, 28);
            uint32_t CommandType : BITFIELD_RANGE(29, 31);
            // DWORD 1
            uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
            uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
            // DWORD 2
            uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
            uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
            // DWORD 3
            uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
            uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
            // DWORD 4
            uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
            uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
            uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
            uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
            uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
            uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
            uint32_t SimdSize : BITFIELD_RANGE(30, 31);
            // DWORD 5
            uint32_t ThreadGroupIdStartingX;
            // DWORD 6
            uint32_t Reserved_192;
            // DWORD 7
            uint32_t ThreadGroupIdXDimension;
            // DWORD 8
            uint32_t ThreadGroupIdStartingY;
            // DWORD 9
            uint32_t Reserved_288;
            // DWORD 10
            uint32_t ThreadGroupIdYDimension;
            // DWORD 11
            uint32_t ThreadGroupIdStartingResumeZ;
            // DWORD 12
            uint32_t ThreadGroupIdZDimension;
            // DWORD 13
            uint32_t RightExecutionMask;
            // DWORD 14
            uint32_t BottomExecutionMask;
        } Common;
        uint32_t RawData[15];
    } TheStructure;

    // Fixed values written into the header fields by init().
    typedef enum tagDWORD_LENGTH { DWORD_LENGTH_DWORD_COUNT_N = 0xd } DWORD_LENGTH;
    typedef enum tagSUBOPCODE { SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5 } SUBOPCODE;
    typedef enum tagMEDIA_COMMAND_OPCODE { MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1 } MEDIA_COMMAND_OPCODE;
    typedef enum tagPIPELINE { PIPELINE_MEDIA = 0x2 } PIPELINE;
    typedef enum tagCOMMAND_TYPE { COMMAND_TYPE_GFXPIPE = 0x3 } COMMAND_TYPE;

    // Encodings accepted by setSimdSize() / returned by getSimdSize().
    typedef enum tagSIMD_SIZE {
        SIMD_SIZE_SIMD8 = 0x0,
        SIMD_SIZE_SIMD16 = 0x1,
        SIMD_SIZE_SIMD32 = 0x2
    } SIMD_SIZE;

    // Shift applied by the IndirectDataStartAddress accessors and the
    // matching alignment (0x40 == 1 << 0x6).
    typedef enum tagINDIRECTDATASTARTADDRESS {
        INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
        INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40
    } INDIRECTDATASTARTADDRESS;

    // Clears all 15 DWORDs, then fills in the fixed header fields and selects
    // SIMD8 as the initial SIMD size.
    void init() {
        memset(&TheStructure, 0, sizeof(TheStructure));
        auto &header = TheStructure.Common;
        header.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
        header.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
        header.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
        header.Pipeline = PIPELINE_MEDIA;
        header.CommandType = COMMAND_TYPE_GFXPIPE;
        header.SimdSize = SIMD_SIZE_SIMD8;
    }

    // Returns, by value, a command on which init() has been run.
    static tagGPGPU_WALKER sInit() {
        tagGPGPU_WALKER cmd;
        cmd.init();
        return cmd;
    }

    // Mutable access to one raw DWORD; `index` is not range-checked here.
    uint32_t &getRawData(const uint32_t index) { return TheStructure.RawData[index]; }

    // --- DWORD 0 flags ---
    void setPredicateEnable(const bool value) { TheStructure.Common.PredicateEnable = value; }
    bool getPredicateEnable() const { return (TheStructure.Common.PredicateEnable); }

    void setIndirectParameterEnable(const bool value) { TheStructure.Common.IndirectParameterEnable = value; }
    bool getIndirectParameterEnable() const { return (TheStructure.Common.IndirectParameterEnable); }

    // --- DWORD 1 / DWORD 2 ---
    void setInterfaceDescriptorOffset(const uint32_t value) { TheStructure.Common.InterfaceDescriptorOffset = value; }
    uint32_t getInterfaceDescriptorOffset() const { return (TheStructure.Common.InterfaceDescriptorOffset); }

    void setIndirectDataLength(const uint32_t value) { TheStructure.Common.IndirectDataLength = value; }
    uint32_t getIndirectDataLength() const { return (TheStructure.Common.IndirectDataLength); }

    // --- DWORD 3 ---
    // The address is stored shifted right by INDIRECTDATASTARTADDRESS_BIT_SHIFT,
    // so the low six bits of `value` are discarded; the getter shifts it back.
    void setIndirectDataStartAddress(const uint32_t value) {
        TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
    }
    uint32_t getIndirectDataStartAddress() const {
        return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
    }

    // --- DWORD 4 ---
    // Width and height are stored as (value - 1) and read back as (field + 1);
    // depth is stored and returned unmodified.
    void setThreadWidthCounterMaximum(const uint32_t value) { TheStructure.Common.ThreadWidthCounterMaximum = value - 1; }
    uint32_t getThreadWidthCounterMaximum() const { return (TheStructure.Common.ThreadWidthCounterMaximum + 1); }

    void setThreadHeightCounterMaximum(const uint32_t value) { TheStructure.Common.ThreadHeightCounterMaximum = value - 1; }
    uint32_t getThreadHeightCounterMaximum() const { return (TheStructure.Common.ThreadHeightCounterMaximum + 1); }

    void setThreadDepthCounterMaximum(const uint32_t value) { TheStructure.Common.ThreadDepthCounterMaximum = value; }
    uint32_t getThreadDepthCounterMaximum() const { return (TheStructure.Common.ThreadDepthCounterMaximum); }

    void setSimdSize(const SIMD_SIZE value) { TheStructure.Common.SimdSize = value; }
    SIMD_SIZE getSimdSize() const { return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize); }

    // --- DWORD 5..12: thread group ids and dimensions (full-DWORD fields) ---
    void setThreadGroupIdStartingX(const uint32_t value) { TheStructure.Common.ThreadGroupIdStartingX = value; }
    uint32_t getThreadGroupIdStartingX() const { return (TheStructure.Common.ThreadGroupIdStartingX); }

    void setThreadGroupIdXDimension(const uint32_t value) { TheStructure.Common.ThreadGroupIdXDimension = value; }
    uint32_t getThreadGroupIdXDimension() const { return (TheStructure.Common.ThreadGroupIdXDimension); }

    void setThreadGroupIdStartingY(const uint32_t value) { TheStructure.Common.ThreadGroupIdStartingY = value; }
    uint32_t getThreadGroupIdStartingY() const { return (TheStructure.Common.ThreadGroupIdStartingY); }

    void setThreadGroupIdYDimension(const uint32_t value) { TheStructure.Common.ThreadGroupIdYDimension = value; }
    uint32_t getThreadGroupIdYDimension() const { return (TheStructure.Common.ThreadGroupIdYDimension); }

    void setThreadGroupIdStartingResumeZ(const uint32_t value) { TheStructure.Common.ThreadGroupIdStartingResumeZ = value; }
    uint32_t getThreadGroupIdStartingResumeZ() const { return (TheStructure.Common.ThreadGroupIdStartingResumeZ); }

    void setThreadGroupIdZDimension(const uint32_t value) { TheStructure.Common.ThreadGroupIdZDimension = value; }
    uint32_t getThreadGroupIdZDimension() const { return (TheStructure.Common.ThreadGroupIdZDimension); }

    // --- DWORD 13 / DWORD 14: execution masks ---
    void setRightExecutionMask(const uint32_t value) { TheStructure.Common.RightExecutionMask = value; }
    uint32_t getRightExecutionMask() const { return (TheStructure.Common.RightExecutionMask); }

    void setBottomExecutionMask(const uint32_t value) { TheStructure.Common.BottomExecutionMask = value; }
    uint32_t getBottomExecutionMask() const { return (TheStructure.Common.BottomExecutionMask); }
} GPGPU_WALKER;
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
union tagTheStructure {
struct tagCommon {
@@ -486,11 +287,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
inline OVER_DISPATCH_CONTROL getOverDispatchControl() const {
return static_cast<OVER_DISPATCH_CONTROL>(TheStructure.Common.OverDispatchControl);
}
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
TheStructure.Common.SharedLocalMemorySize = value;
}
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
inline uint32_t getSharedLocalMemorySize() const { // patched
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
}
inline void setBarrierEnable(const uint32_t value) {
TheStructure.Common.BarrierEnable = (value > 0u) ? 1u : 0u;
@@ -513,6 +314,208 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
} INTERFACE_DESCRIPTOR_DATA;
STATIC_ASSERT(32 == sizeof(INTERFACE_DESCRIPTOR_DATA));
// GPGPU_WALKER command packet: 15 DWORDs (60 bytes, enforced by the assert
// below) exposed both as named fields (TheStructure.Common) and as raw DWORDs
// (TheStructure.RawData). The field order in tagCommon defines the packet
// layout and must not be changed.
typedef struct tagGPGPU_WALKER {
    union tagTheStructure {
        struct tagCommon {
            // DWORD 0
            uint32_t DwordLength : BITFIELD_RANGE(0, 7);
            uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
            uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
            uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
            uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
            uint32_t Subopcode : BITFIELD_RANGE(16, 23);
            uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
            uint32_t Pipeline : BITFIELD_RANGE(27, 28);
            uint32_t CommandType : BITFIELD_RANGE(29, 31);
            // DWORD 1
            uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
            uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
            // DWORD 2
            uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
            uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
            // DWORD 3
            uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
            uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
            // DWORD 4
            uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
            uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
            uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
            uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
            uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
            uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
            uint32_t SimdSize : BITFIELD_RANGE(30, 31);
            // DWORD 5
            uint32_t ThreadGroupIdStartingX;
            // DWORD 6
            uint32_t Reserved_192;
            // DWORD 7
            uint32_t ThreadGroupIdXDimension;
            // DWORD 8
            uint32_t ThreadGroupIdStartingY;
            // DWORD 9
            uint32_t Reserved_288;
            // DWORD 10
            uint32_t ThreadGroupIdYDimension;
            // DWORD 11
            uint32_t ThreadGroupIdStartingResumeZ;
            // DWORD 12
            uint32_t ThreadGroupIdZDimension;
            // DWORD 13
            uint32_t RightExecutionMask;
            // DWORD 14
            uint32_t BottomExecutionMask;
        } Common;
        uint32_t RawData[15];
    } TheStructure;

    // Fixed values written into the header fields by init().
    typedef enum tagDWORD_LENGTH { DWORD_LENGTH_DWORD_COUNT_N = 0xd } DWORD_LENGTH;
    typedef enum tagSUBOPCODE { SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5 } SUBOPCODE;
    typedef enum tagMEDIA_COMMAND_OPCODE { MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1 } MEDIA_COMMAND_OPCODE;
    typedef enum tagPIPELINE { PIPELINE_MEDIA = 0x2 } PIPELINE;
    typedef enum tagCOMMAND_TYPE { COMMAND_TYPE_GFXPIPE = 0x3 } COMMAND_TYPE;

    // Encodings accepted by setSimdSize() / returned by getSimdSize().
    typedef enum tagSIMD_SIZE {
        SIMD_SIZE_SIMD8 = 0x0,
        SIMD_SIZE_SIMD16 = 0x1,
        SIMD_SIZE_SIMD32 = 0x2
    } SIMD_SIZE;

    // Shift applied by the IndirectDataStartAddress accessors and the
    // matching alignment (0x40 == 1 << 0x6).
    typedef enum tagINDIRECTDATASTARTADDRESS {
        INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
        INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40
    } INDIRECTDATASTARTADDRESS;

    // Clears all 15 DWORDs, then fills in the fixed header fields and selects
    // SIMD8 as the initial SIMD size.
    void init() {
        memset(&TheStructure, 0, sizeof(TheStructure));
        auto &header = TheStructure.Common;
        header.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
        header.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
        header.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
        header.Pipeline = PIPELINE_MEDIA;
        header.CommandType = COMMAND_TYPE_GFXPIPE;
        header.SimdSize = SIMD_SIZE_SIMD8;
    }

    // Returns, by value, a command on which init() has been run.
    static tagGPGPU_WALKER sInit() {
        tagGPGPU_WALKER cmd;
        cmd.init();
        return cmd;
    }

    // Mutable access to one raw DWORD; `index` is not range-checked here.
    uint32_t &getRawData(const uint32_t index) { return TheStructure.RawData[index]; }

    // --- DWORD 0 flags ---
    void setPredicateEnable(const bool value) { TheStructure.Common.PredicateEnable = value; }
    bool getPredicateEnable() const { return (TheStructure.Common.PredicateEnable); }

    void setIndirectParameterEnable(const bool value) { TheStructure.Common.IndirectParameterEnable = value; }
    bool getIndirectParameterEnable() const { return (TheStructure.Common.IndirectParameterEnable); }

    // --- DWORD 1 / DWORD 2 ---
    void setInterfaceDescriptorOffset(const uint32_t value) { TheStructure.Common.InterfaceDescriptorOffset = value; }
    uint32_t getInterfaceDescriptorOffset() const { return (TheStructure.Common.InterfaceDescriptorOffset); }

    void setIndirectDataLength(const uint32_t value) { TheStructure.Common.IndirectDataLength = value; }
    uint32_t getIndirectDataLength() const { return (TheStructure.Common.IndirectDataLength); }

    // --- DWORD 3 ---
    // The address is stored shifted right by INDIRECTDATASTARTADDRESS_BIT_SHIFT,
    // so the low six bits of `value` are discarded; the getter shifts it back.
    void setIndirectDataStartAddress(const uint32_t value) {
        TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
    }
    uint32_t getIndirectDataStartAddress() const {
        return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
    }

    // --- DWORD 4 ---
    // Width and height are stored as (value - 1) and read back as (field + 1);
    // depth is stored and returned unmodified.
    void setThreadWidthCounterMaximum(const uint32_t value) { TheStructure.Common.ThreadWidthCounterMaximum = value - 1; }
    uint32_t getThreadWidthCounterMaximum() const { return (TheStructure.Common.ThreadWidthCounterMaximum + 1); }

    void setThreadHeightCounterMaximum(const uint32_t value) { TheStructure.Common.ThreadHeightCounterMaximum = value - 1; }
    uint32_t getThreadHeightCounterMaximum() const { return (TheStructure.Common.ThreadHeightCounterMaximum + 1); }

    void setThreadDepthCounterMaximum(const uint32_t value) { TheStructure.Common.ThreadDepthCounterMaximum = value; }
    uint32_t getThreadDepthCounterMaximum() const { return (TheStructure.Common.ThreadDepthCounterMaximum); }

    void setSimdSize(const SIMD_SIZE value) { TheStructure.Common.SimdSize = value; }
    SIMD_SIZE getSimdSize() const { return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize); }

    // --- DWORD 5..12: thread group ids and dimensions (full-DWORD fields) ---
    void setThreadGroupIdStartingX(const uint32_t value) { TheStructure.Common.ThreadGroupIdStartingX = value; }
    uint32_t getThreadGroupIdStartingX() const { return (TheStructure.Common.ThreadGroupIdStartingX); }

    void setThreadGroupIdXDimension(const uint32_t value) { TheStructure.Common.ThreadGroupIdXDimension = value; }
    uint32_t getThreadGroupIdXDimension() const { return (TheStructure.Common.ThreadGroupIdXDimension); }

    void setThreadGroupIdStartingY(const uint32_t value) { TheStructure.Common.ThreadGroupIdStartingY = value; }
    uint32_t getThreadGroupIdStartingY() const { return (TheStructure.Common.ThreadGroupIdStartingY); }

    void setThreadGroupIdYDimension(const uint32_t value) { TheStructure.Common.ThreadGroupIdYDimension = value; }
    uint32_t getThreadGroupIdYDimension() const { return (TheStructure.Common.ThreadGroupIdYDimension); }

    void setThreadGroupIdStartingResumeZ(const uint32_t value) { TheStructure.Common.ThreadGroupIdStartingResumeZ = value; }
    uint32_t getThreadGroupIdStartingResumeZ() const { return (TheStructure.Common.ThreadGroupIdStartingResumeZ); }

    void setThreadGroupIdZDimension(const uint32_t value) { TheStructure.Common.ThreadGroupIdZDimension = value; }
    uint32_t getThreadGroupIdZDimension() const { return (TheStructure.Common.ThreadGroupIdZDimension); }

    // --- DWORD 13 / DWORD 14: execution masks ---
    void setRightExecutionMask(const uint32_t value) { TheStructure.Common.RightExecutionMask = value; }
    uint32_t getRightExecutionMask() const { return (TheStructure.Common.RightExecutionMask); }

    void setBottomExecutionMask(const uint32_t value) { TheStructure.Common.BottomExecutionMask = value; }
    uint32_t getBottomExecutionMask() const { return (TheStructure.Common.BottomExecutionMask); }

    // Interface descriptor type associated with this walker command.
    using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
} GPGPU_WALKER;
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
union tagTheStructure {
struct tagCommon {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2022 Intel Corporation
* Copyright (C) 2019-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -48,195 +48,6 @@ typedef struct tagBINDING_TABLE_STATE {
} BINDING_TABLE_STATE;
STATIC_ASSERT(4 == sizeof(BINDING_TABLE_STATE));
typedef struct tagGPGPU_WALKER {
union tagTheStructure {
struct tagCommon {
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
uint32_t CommandType : BITFIELD_RANGE(29, 31);
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
uint32_t ThreadGroupIdStartingX;
uint32_t Reserved_192;
uint32_t ThreadGroupIdXDimension;
uint32_t ThreadGroupIdStartingY;
uint32_t Reserved_288;
uint32_t ThreadGroupIdYDimension;
uint32_t ThreadGroupIdStartingResumeZ;
uint32_t ThreadGroupIdZDimension;
uint32_t RightExecutionMask;
uint32_t BottomExecutionMask;
} Common;
uint32_t RawData[15];
} TheStructure;
typedef enum tagDWORD_LENGTH {
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
} DWORD_LENGTH;
typedef enum tagSUBOPCODE {
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
} SUBOPCODE;
typedef enum tagMEDIA_COMMAND_OPCODE {
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
} MEDIA_COMMAND_OPCODE;
typedef enum tagPIPELINE {
PIPELINE_MEDIA = 0x2,
} PIPELINE;
typedef enum tagCOMMAND_TYPE {
COMMAND_TYPE_GFXPIPE = 0x3,
} COMMAND_TYPE;
typedef enum tagSIMD_SIZE {
SIMD_SIZE_SIMD8 = 0x0,
SIMD_SIZE_SIMD16 = 0x1,
SIMD_SIZE_SIMD32 = 0x2,
} SIMD_SIZE;
typedef enum tagPATCH_CONSTANTS {
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
} PATCH_CONSTANTS;
inline void init() {
memset(&TheStructure, 0, sizeof(TheStructure));
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
}
static tagGPGPU_WALKER sInit() {
GPGPU_WALKER state;
state.init();
return state;
}
inline uint32_t &getRawData(const uint32_t index) {
DEBUG_BREAK_IF(index >= 15);
return TheStructure.RawData[index];
}
inline void setPredicateEnable(const bool value) {
TheStructure.Common.PredicateEnable = value;
}
inline bool getPredicateEnable() const {
return (TheStructure.Common.PredicateEnable);
}
inline void setIndirectParameterEnable(const bool value) {
TheStructure.Common.IndirectParameterEnable = value;
}
inline bool getIndirectParameterEnable() const {
return (TheStructure.Common.IndirectParameterEnable);
}
inline void setInterfaceDescriptorOffset(const uint32_t value) {
TheStructure.Common.InterfaceDescriptorOffset = value;
}
inline uint32_t getInterfaceDescriptorOffset() const {
return (TheStructure.Common.InterfaceDescriptorOffset);
}
inline void setIndirectDataLength(const uint32_t value) {
TheStructure.Common.IndirectDataLength = value;
}
inline uint32_t getIndirectDataLength() const {
return (TheStructure.Common.IndirectDataLength);
}
typedef enum tagINDIRECTDATASTARTADDRESS {
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
} INDIRECTDATASTARTADDRESS;
inline void setIndirectDataStartAddress(const uint32_t value) {
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
}
inline uint32_t getIndirectDataStartAddress() const {
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
}
inline void setThreadWidthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
}
inline uint32_t getThreadWidthCounterMaximum() const {
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
}
inline void setThreadHeightCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
}
inline uint32_t getThreadHeightCounterMaximum() const {
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
}
inline void setThreadDepthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadDepthCounterMaximum = value;
}
inline uint32_t getThreadDepthCounterMaximum() const {
return (TheStructure.Common.ThreadDepthCounterMaximum);
}
inline void setSimdSize(const SIMD_SIZE value) {
TheStructure.Common.SimdSize = value;
}
inline SIMD_SIZE getSimdSize() const {
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
}
inline void setThreadGroupIdStartingX(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingX = value;
}
inline uint32_t getThreadGroupIdStartingX() const {
return (TheStructure.Common.ThreadGroupIdStartingX);
}
inline void setThreadGroupIdXDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdXDimension = value;
}
inline uint32_t getThreadGroupIdXDimension() const {
return (TheStructure.Common.ThreadGroupIdXDimension);
}
inline void setThreadGroupIdStartingY(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingY = value;
}
inline uint32_t getThreadGroupIdStartingY() const {
return (TheStructure.Common.ThreadGroupIdStartingY);
}
inline void setThreadGroupIdYDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdYDimension = value;
}
inline uint32_t getThreadGroupIdYDimension() const {
return (TheStructure.Common.ThreadGroupIdYDimension);
}
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
}
inline uint32_t getThreadGroupIdStartingResumeZ() const {
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
}
inline void setThreadGroupIdZDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdZDimension = value;
}
inline uint32_t getThreadGroupIdZDimension() const {
return (TheStructure.Common.ThreadGroupIdZDimension);
}
inline void setRightExecutionMask(const uint32_t value) {
TheStructure.Common.RightExecutionMask = value;
}
inline uint32_t getRightExecutionMask() const {
return (TheStructure.Common.RightExecutionMask);
}
inline void setBottomExecutionMask(const uint32_t value) {
TheStructure.Common.BottomExecutionMask = value;
}
inline uint32_t getBottomExecutionMask() const {
return (TheStructure.Common.BottomExecutionMask);
}
} GPGPU_WALKER;
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
union tagTheStructure {
struct tagCommon {
@@ -452,11 +263,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
inline uint32_t getNumberOfThreadsInGpgpuThreadGroup() const {
return (TheStructure.Common.NumberOfThreadsInGpgpuThreadGroup);
}
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
TheStructure.Common.SharedLocalMemorySize = value;
}
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
inline uint32_t getSharedLocalMemorySize() const { // patched
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
}
inline void setBarrierEnable(const uint32_t value) {
TheStructure.Common.BarrierEnable = (value > 0u) ? 1u : 0u;
@@ -479,6 +290,196 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
} INTERFACE_DESCRIPTOR_DATA;
STATIC_ASSERT(32 == sizeof(INTERFACE_DESCRIPTOR_DATA));
// GPGPU_WALKER command: a union of named bitfields (Common) and 15 raw dwords
// (RawData), plus typed accessors for every programmable field.
// Dword annotations below assume BITFIELD_RANGE(a, b) yields the width of bits a..b
// (macro is defined elsewhere — TODO confirm); they agree with the Reserved_<bit>
// names and with INDIRECTDATASTARTADDRESS_INDEX below.
typedef struct tagGPGPU_WALKER {
union tagTheStructure {
struct tagCommon {
// DW0: command header
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
uint32_t CommandType : BITFIELD_RANGE(29, 31);
// DW1
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
// DW2
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
// DW3: address is kept without its low 6 bits (see setIndirectDataStartAddress)
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
// DW4: thread counters and SIMD size
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
// DW5..DW14: full-dword fields
uint32_t ThreadGroupIdStartingX;
uint32_t Reserved_192;
uint32_t ThreadGroupIdXDimension;
uint32_t ThreadGroupIdStartingY;
uint32_t Reserved_288;
uint32_t ThreadGroupIdYDimension;
uint32_t ThreadGroupIdStartingResumeZ;
uint32_t ThreadGroupIdZDimension;
uint32_t RightExecutionMask;
uint32_t BottomExecutionMask;
} Common;
uint32_t RawData[15];
} TheStructure;
// Constant header values written by init().
typedef enum tagDWORD_LENGTH {
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
} DWORD_LENGTH;
typedef enum tagSUBOPCODE {
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
} SUBOPCODE;
typedef enum tagMEDIA_COMMAND_OPCODE {
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
} MEDIA_COMMAND_OPCODE;
typedef enum tagPIPELINE {
PIPELINE_MEDIA = 0x2,
} PIPELINE;
typedef enum tagCOMMAND_TYPE {
COMMAND_TYPE_GFXPIPE = 0x3,
} COMMAND_TYPE;
// Encodings accepted by setSimdSize()/returned by getSimdSize().
typedef enum tagSIMD_SIZE {
SIMD_SIZE_SIMD8 = 0x0,
SIMD_SIZE_SIMD16 = 0x1,
SIMD_SIZE_SIMD32 = 0x2,
} SIMD_SIZE;
// Location of the IndirectDataStartAddress dword inside the command
// (byte offset 0xc == dword index 3).
typedef enum tagPATCH_CONSTANTS {
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
} PATCH_CONSTANTS;
// Zeroes the whole command, then fills in the fixed header fields and
// defaults SimdSize to SIMD8.
inline void init() {
memset(&TheStructure, 0, sizeof(TheStructure));
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
}
// Returns a command object by value after running init() on it.
static tagGPGPU_WALKER sInit() {
GPGPU_WALKER state;
state.init();
return state;
}
// Returns a mutable reference to raw dword `index`. The only range check is
// DEBUG_BREAK_IF (defined elsewhere — presumably debug-only, TODO confirm),
// so callers must keep index below 15.
inline uint32_t &getRawData(const uint32_t index) {
DEBUG_BREAK_IF(index >= 15);
return TheStructure.RawData[index];
}
inline void setPredicateEnable(const bool value) {
TheStructure.Common.PredicateEnable = value;
}
inline bool getPredicateEnable() const {
return (TheStructure.Common.PredicateEnable);
}
inline void setIndirectParameterEnable(const bool value) {
TheStructure.Common.IndirectParameterEnable = value;
}
inline bool getIndirectParameterEnable() const {
return (TheStructure.Common.IndirectParameterEnable);
}
inline void setInterfaceDescriptorOffset(const uint32_t value) {
TheStructure.Common.InterfaceDescriptorOffset = value;
}
inline uint32_t getInterfaceDescriptorOffset() const {
return (TheStructure.Common.InterfaceDescriptorOffset);
}
inline void setIndirectDataLength(const uint32_t value) {
TheStructure.Common.IndirectDataLength = value;
}
inline uint32_t getIndirectDataLength() const {
return (TheStructure.Common.IndirectDataLength);
}
// Shift/alignment used when packing IndirectDataStartAddress (0x40 == 1 << 0x6).
typedef enum tagINDIRECTDATASTARTADDRESS {
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
} INDIRECTDATASTARTADDRESS;
// Stores value >> 6; the low 6 bits of `value` are dropped without any check.
inline void setIndirectDataStartAddress(const uint32_t value) {
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
}
// Returns the stored field shifted back left by 6.
inline uint32_t getIndirectDataStartAddress() const {
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
}
// Width and height counters are stored as (value - 1) and read back as (field + 1).
// NOTE(review): value == 0 wraps in the unsigned subtraction before the
// bitfield truncation — callers presumably always pass >= 1; confirm.
inline void setThreadWidthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
}
inline uint32_t getThreadWidthCounterMaximum() const {
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
}
inline void setThreadHeightCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
}
inline uint32_t getThreadHeightCounterMaximum() const {
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
}
// Unlike width/height, the depth counter is stored and returned unmodified.
inline void setThreadDepthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadDepthCounterMaximum = value;
}
inline uint32_t getThreadDepthCounterMaximum() const {
return (TheStructure.Common.ThreadDepthCounterMaximum);
}
inline void setSimdSize(const SIMD_SIZE value) {
TheStructure.Common.SimdSize = value;
}
inline SIMD_SIZE getSimdSize() const {
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
}
// The remaining accessors read/write full 32-bit dwords with no transformation.
inline void setThreadGroupIdStartingX(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingX = value;
}
inline uint32_t getThreadGroupIdStartingX() const {
return (TheStructure.Common.ThreadGroupIdStartingX);
}
inline void setThreadGroupIdXDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdXDimension = value;
}
inline uint32_t getThreadGroupIdXDimension() const {
return (TheStructure.Common.ThreadGroupIdXDimension);
}
inline void setThreadGroupIdStartingY(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingY = value;
}
inline uint32_t getThreadGroupIdStartingY() const {
return (TheStructure.Common.ThreadGroupIdStartingY);
}
inline void setThreadGroupIdYDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdYDimension = value;
}
inline uint32_t getThreadGroupIdYDimension() const {
return (TheStructure.Common.ThreadGroupIdYDimension);
}
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
}
inline uint32_t getThreadGroupIdStartingResumeZ() const {
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
}
inline void setThreadGroupIdZDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdZDimension = value;
}
inline uint32_t getThreadGroupIdZDimension() const {
return (TheStructure.Common.ThreadGroupIdZDimension);
}
inline void setRightExecutionMask(const uint32_t value) {
TheStructure.Common.RightExecutionMask = value;
}
inline uint32_t getRightExecutionMask() const {
return (TheStructure.Common.RightExecutionMask);
}
inline void setBottomExecutionMask(const uint32_t value) {
TheStructure.Common.BottomExecutionMask = value;
}
inline uint32_t getBottomExecutionMask() const {
return (TheStructure.Common.BottomExecutionMask);
}
// Member alias naming the interface descriptor type associated with this walker;
// it refers to INTERFACE_DESCRIPTOR_DATA, so that type must be declared earlier.
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
} GPGPU_WALKER;
// 15 dwords * 4 bytes: guards against accidental layout changes.
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
union tagTheStructure {
struct tagCommon {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2022 Intel Corporation
* Copyright (C) 2019-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -48,195 +48,6 @@ typedef struct tagBINDING_TABLE_STATE {
} BINDING_TABLE_STATE;
STATIC_ASSERT(4 == sizeof(BINDING_TABLE_STATE));
// GPGPU_WALKER command: a union of named bitfields (Common) and 15 raw dwords
// (RawData), plus typed accessors for every programmable field.
// Dword annotations below assume BITFIELD_RANGE(a, b) yields the width of bits a..b
// (macro is defined elsewhere — TODO confirm); they agree with the Reserved_<bit>
// names and with INDIRECTDATASTARTADDRESS_INDEX below.
typedef struct tagGPGPU_WALKER {
union tagTheStructure {
struct tagCommon {
// DW0: command header
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
uint32_t CommandType : BITFIELD_RANGE(29, 31);
// DW1
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
// DW2
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
// DW3: address is kept without its low 6 bits (see setIndirectDataStartAddress)
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
// DW4: thread counters and SIMD size
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
// DW5..DW14: full-dword fields
uint32_t ThreadGroupIdStartingX;
uint32_t Reserved_192;
uint32_t ThreadGroupIdXDimension;
uint32_t ThreadGroupIdStartingY;
uint32_t Reserved_288;
uint32_t ThreadGroupIdYDimension;
uint32_t ThreadGroupIdStartingResumeZ;
uint32_t ThreadGroupIdZDimension;
uint32_t RightExecutionMask;
uint32_t BottomExecutionMask;
} Common;
uint32_t RawData[15];
} TheStructure;
// Constant header values written by init().
typedef enum tagDWORD_LENGTH {
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
} DWORD_LENGTH;
typedef enum tagSUBOPCODE {
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
} SUBOPCODE;
typedef enum tagMEDIA_COMMAND_OPCODE {
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
} MEDIA_COMMAND_OPCODE;
typedef enum tagPIPELINE {
PIPELINE_MEDIA = 0x2,
} PIPELINE;
typedef enum tagCOMMAND_TYPE {
COMMAND_TYPE_GFXPIPE = 0x3,
} COMMAND_TYPE;
// Encodings accepted by setSimdSize()/returned by getSimdSize().
typedef enum tagSIMD_SIZE {
SIMD_SIZE_SIMD8 = 0x0,
SIMD_SIZE_SIMD16 = 0x1,
SIMD_SIZE_SIMD32 = 0x2,
} SIMD_SIZE;
// Location of the IndirectDataStartAddress dword inside the command
// (byte offset 0xc == dword index 3).
typedef enum tagPATCH_CONSTANTS {
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
} PATCH_CONSTANTS;
// Zeroes the whole command, then fills in the fixed header fields and
// defaults SimdSize to SIMD8.
inline void init() {
memset(&TheStructure, 0, sizeof(TheStructure));
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
}
// Returns a command object by value after running init() on it.
static tagGPGPU_WALKER sInit() {
GPGPU_WALKER state;
state.init();
return state;
}
// Returns a mutable reference to raw dword `index`. The only range check is
// DEBUG_BREAK_IF (defined elsewhere — presumably debug-only, TODO confirm),
// so callers must keep index below 15.
inline uint32_t &getRawData(const uint32_t index) {
DEBUG_BREAK_IF(index >= 15);
return TheStructure.RawData[index];
}
inline void setPredicateEnable(const bool value) {
TheStructure.Common.PredicateEnable = value;
}
inline bool getPredicateEnable() const {
return (TheStructure.Common.PredicateEnable);
}
inline void setIndirectParameterEnable(const bool value) {
TheStructure.Common.IndirectParameterEnable = value;
}
inline bool getIndirectParameterEnable() const {
return (TheStructure.Common.IndirectParameterEnable);
}
inline void setInterfaceDescriptorOffset(const uint32_t value) {
TheStructure.Common.InterfaceDescriptorOffset = value;
}
inline uint32_t getInterfaceDescriptorOffset() const {
return (TheStructure.Common.InterfaceDescriptorOffset);
}
inline void setIndirectDataLength(const uint32_t value) {
TheStructure.Common.IndirectDataLength = value;
}
inline uint32_t getIndirectDataLength() const {
return (TheStructure.Common.IndirectDataLength);
}
// Shift/alignment used when packing IndirectDataStartAddress (0x40 == 1 << 0x6).
typedef enum tagINDIRECTDATASTARTADDRESS {
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
} INDIRECTDATASTARTADDRESS;
// Stores value >> 6; the low 6 bits of `value` are dropped without any check.
inline void setIndirectDataStartAddress(const uint32_t value) {
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
}
// Returns the stored field shifted back left by 6.
inline uint32_t getIndirectDataStartAddress() const {
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
}
// Width and height counters are stored as (value - 1) and read back as (field + 1).
// NOTE(review): value == 0 wraps in the unsigned subtraction before the
// bitfield truncation — callers presumably always pass >= 1; confirm.
inline void setThreadWidthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
}
inline uint32_t getThreadWidthCounterMaximum() const {
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
}
inline void setThreadHeightCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
}
inline uint32_t getThreadHeightCounterMaximum() const {
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
}
// Unlike width/height, the depth counter is stored and returned unmodified.
inline void setThreadDepthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadDepthCounterMaximum = value;
}
inline uint32_t getThreadDepthCounterMaximum() const {
return (TheStructure.Common.ThreadDepthCounterMaximum);
}
inline void setSimdSize(const SIMD_SIZE value) {
TheStructure.Common.SimdSize = value;
}
inline SIMD_SIZE getSimdSize() const {
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
}
// The remaining accessors read/write full 32-bit dwords with no transformation.
inline void setThreadGroupIdStartingX(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingX = value;
}
inline uint32_t getThreadGroupIdStartingX() const {
return (TheStructure.Common.ThreadGroupIdStartingX);
}
inline void setThreadGroupIdXDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdXDimension = value;
}
inline uint32_t getThreadGroupIdXDimension() const {
return (TheStructure.Common.ThreadGroupIdXDimension);
}
inline void setThreadGroupIdStartingY(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingY = value;
}
inline uint32_t getThreadGroupIdStartingY() const {
return (TheStructure.Common.ThreadGroupIdStartingY);
}
inline void setThreadGroupIdYDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdYDimension = value;
}
inline uint32_t getThreadGroupIdYDimension() const {
return (TheStructure.Common.ThreadGroupIdYDimension);
}
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
}
inline uint32_t getThreadGroupIdStartingResumeZ() const {
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
}
inline void setThreadGroupIdZDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdZDimension = value;
}
inline uint32_t getThreadGroupIdZDimension() const {
return (TheStructure.Common.ThreadGroupIdZDimension);
}
inline void setRightExecutionMask(const uint32_t value) {
TheStructure.Common.RightExecutionMask = value;
}
inline uint32_t getRightExecutionMask() const {
return (TheStructure.Common.RightExecutionMask);
}
inline void setBottomExecutionMask(const uint32_t value) {
TheStructure.Common.BottomExecutionMask = value;
}
inline uint32_t getBottomExecutionMask() const {
return (TheStructure.Common.BottomExecutionMask);
}
} GPGPU_WALKER;
// 15 dwords * 4 bytes: guards against accidental layout changes.
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
union tagTheStructure {
struct tagCommon {
@@ -461,11 +272,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
inline bool getGlobalBarrierEnable() const {
return (TheStructure.Common.GlobalBarrierEnable);
}
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
TheStructure.Common.SharedLocalMemorySize = value;
}
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
inline uint32_t getSharedLocalMemorySize() const { // patched
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
}
inline void setBarrierEnable(const uint32_t value) {
TheStructure.Common.BarrierEnable = (value > 0u) ? 1u : 0u;
@@ -488,6 +299,196 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
} INTERFACE_DESCRIPTOR_DATA;
STATIC_ASSERT(32 == sizeof(INTERFACE_DESCRIPTOR_DATA));
// GPGPU_WALKER command: a union of named bitfields (Common) and 15 raw dwords
// (RawData), plus typed accessors for every programmable field.
// Dword annotations below assume BITFIELD_RANGE(a, b) yields the width of bits a..b
// (macro is defined elsewhere — TODO confirm); they agree with the Reserved_<bit>
// names and with INDIRECTDATASTARTADDRESS_INDEX below.
typedef struct tagGPGPU_WALKER {
union tagTheStructure {
struct tagCommon {
// DW0: command header
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
uint32_t PredicateEnable : BITFIELD_RANGE(8, 8);
uint32_t Reserved_9 : BITFIELD_RANGE(9, 9);
uint32_t IndirectParameterEnable : BITFIELD_RANGE(10, 10);
uint32_t Reserved_11 : BITFIELD_RANGE(11, 15);
uint32_t Subopcode : BITFIELD_RANGE(16, 23);
uint32_t MediaCommandOpcode : BITFIELD_RANGE(24, 26);
uint32_t Pipeline : BITFIELD_RANGE(27, 28);
uint32_t CommandType : BITFIELD_RANGE(29, 31);
// DW1
uint32_t InterfaceDescriptorOffset : BITFIELD_RANGE(0, 5);
uint32_t Reserved_38 : BITFIELD_RANGE(6, 31);
// DW2
uint32_t IndirectDataLength : BITFIELD_RANGE(0, 16);
uint32_t Reserved_81 : BITFIELD_RANGE(17, 31);
// DW3: address is kept without its low 6 bits (see setIndirectDataStartAddress)
uint32_t Reserved_96 : BITFIELD_RANGE(0, 5);
uint32_t IndirectDataStartAddress : BITFIELD_RANGE(6, 31);
// DW4: thread counters and SIMD size
uint32_t ThreadWidthCounterMaximum : BITFIELD_RANGE(0, 5);
uint32_t Reserved_134 : BITFIELD_RANGE(6, 7);
uint32_t ThreadHeightCounterMaximum : BITFIELD_RANGE(8, 13);
uint32_t Reserved_142 : BITFIELD_RANGE(14, 15);
uint32_t ThreadDepthCounterMaximum : BITFIELD_RANGE(16, 21);
uint32_t Reserved_150 : BITFIELD_RANGE(22, 29);
uint32_t SimdSize : BITFIELD_RANGE(30, 31);
// DW5..DW14: full-dword fields
uint32_t ThreadGroupIdStartingX;
uint32_t Reserved_192;
uint32_t ThreadGroupIdXDimension;
uint32_t ThreadGroupIdStartingY;
uint32_t Reserved_288;
uint32_t ThreadGroupIdYDimension;
uint32_t ThreadGroupIdStartingResumeZ;
uint32_t ThreadGroupIdZDimension;
uint32_t RightExecutionMask;
uint32_t BottomExecutionMask;
} Common;
uint32_t RawData[15];
} TheStructure;
// Constant header values written by init().
typedef enum tagDWORD_LENGTH {
DWORD_LENGTH_DWORD_COUNT_N = 0xd,
} DWORD_LENGTH;
typedef enum tagSUBOPCODE {
SUBOPCODE_GPGPU_WALKER_SUBOP = 0x5,
} SUBOPCODE;
typedef enum tagMEDIA_COMMAND_OPCODE {
MEDIA_COMMAND_OPCODE_GPGPU_WALKER = 0x1,
} MEDIA_COMMAND_OPCODE;
typedef enum tagPIPELINE {
PIPELINE_MEDIA = 0x2,
} PIPELINE;
typedef enum tagCOMMAND_TYPE {
COMMAND_TYPE_GFXPIPE = 0x3,
} COMMAND_TYPE;
// Encodings accepted by setSimdSize()/returned by getSimdSize().
typedef enum tagSIMD_SIZE {
SIMD_SIZE_SIMD8 = 0x0,
SIMD_SIZE_SIMD16 = 0x1,
SIMD_SIZE_SIMD32 = 0x2,
} SIMD_SIZE;
// Location of the IndirectDataStartAddress dword inside the command
// (byte offset 0xc == dword index 3).
typedef enum tagPATCH_CONSTANTS {
INDIRECTDATASTARTADDRESS_BYTEOFFSET = 0xc,
INDIRECTDATASTARTADDRESS_INDEX = 0x3,
} PATCH_CONSTANTS;
// Zeroes the whole command, then fills in the fixed header fields and
// defaults SimdSize to SIMD8.
inline void init() {
memset(&TheStructure, 0, sizeof(TheStructure));
TheStructure.Common.DwordLength = DWORD_LENGTH_DWORD_COUNT_N;
TheStructure.Common.Subopcode = SUBOPCODE_GPGPU_WALKER_SUBOP;
TheStructure.Common.MediaCommandOpcode = MEDIA_COMMAND_OPCODE_GPGPU_WALKER;
TheStructure.Common.Pipeline = PIPELINE_MEDIA;
TheStructure.Common.CommandType = COMMAND_TYPE_GFXPIPE;
TheStructure.Common.SimdSize = SIMD_SIZE_SIMD8;
}
// Returns a command object by value after running init() on it.
static tagGPGPU_WALKER sInit() {
GPGPU_WALKER state;
state.init();
return state;
}
// Returns a mutable reference to raw dword `index`. The only range check is
// DEBUG_BREAK_IF (defined elsewhere — presumably debug-only, TODO confirm),
// so callers must keep index below 15.
inline uint32_t &getRawData(const uint32_t index) {
DEBUG_BREAK_IF(index >= 15);
return TheStructure.RawData[index];
}
inline void setPredicateEnable(const bool value) {
TheStructure.Common.PredicateEnable = value;
}
inline bool getPredicateEnable() const {
return (TheStructure.Common.PredicateEnable);
}
inline void setIndirectParameterEnable(const bool value) {
TheStructure.Common.IndirectParameterEnable = value;
}
inline bool getIndirectParameterEnable() const {
return (TheStructure.Common.IndirectParameterEnable);
}
inline void setInterfaceDescriptorOffset(const uint32_t value) {
TheStructure.Common.InterfaceDescriptorOffset = value;
}
inline uint32_t getInterfaceDescriptorOffset() const {
return (TheStructure.Common.InterfaceDescriptorOffset);
}
inline void setIndirectDataLength(const uint32_t value) {
TheStructure.Common.IndirectDataLength = value;
}
inline uint32_t getIndirectDataLength() const {
return (TheStructure.Common.IndirectDataLength);
}
// Shift/alignment used when packing IndirectDataStartAddress (0x40 == 1 << 0x6).
typedef enum tagINDIRECTDATASTARTADDRESS {
INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
} INDIRECTDATASTARTADDRESS;
// Stores value >> 6; the low 6 bits of `value` are dropped without any check.
inline void setIndirectDataStartAddress(const uint32_t value) {
TheStructure.Common.IndirectDataStartAddress = value >> INDIRECTDATASTARTADDRESS_BIT_SHIFT;
}
// Returns the stored field shifted back left by 6.
inline uint32_t getIndirectDataStartAddress() const {
return (TheStructure.Common.IndirectDataStartAddress << INDIRECTDATASTARTADDRESS_BIT_SHIFT);
}
// Width and height counters are stored as (value - 1) and read back as (field + 1).
// NOTE(review): value == 0 wraps in the unsigned subtraction before the
// bitfield truncation — callers presumably always pass >= 1; confirm.
inline void setThreadWidthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadWidthCounterMaximum = value - 1;
}
inline uint32_t getThreadWidthCounterMaximum() const {
return (TheStructure.Common.ThreadWidthCounterMaximum + 1);
}
inline void setThreadHeightCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadHeightCounterMaximum = value - 1;
}
inline uint32_t getThreadHeightCounterMaximum() const {
return (TheStructure.Common.ThreadHeightCounterMaximum + 1);
}
// Unlike width/height, the depth counter is stored and returned unmodified.
inline void setThreadDepthCounterMaximum(const uint32_t value) {
TheStructure.Common.ThreadDepthCounterMaximum = value;
}
inline uint32_t getThreadDepthCounterMaximum() const {
return (TheStructure.Common.ThreadDepthCounterMaximum);
}
inline void setSimdSize(const SIMD_SIZE value) {
TheStructure.Common.SimdSize = value;
}
inline SIMD_SIZE getSimdSize() const {
return static_cast<SIMD_SIZE>(TheStructure.Common.SimdSize);
}
// The remaining accessors read/write full 32-bit dwords with no transformation.
inline void setThreadGroupIdStartingX(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingX = value;
}
inline uint32_t getThreadGroupIdStartingX() const {
return (TheStructure.Common.ThreadGroupIdStartingX);
}
inline void setThreadGroupIdXDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdXDimension = value;
}
inline uint32_t getThreadGroupIdXDimension() const {
return (TheStructure.Common.ThreadGroupIdXDimension);
}
inline void setThreadGroupIdStartingY(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingY = value;
}
inline uint32_t getThreadGroupIdStartingY() const {
return (TheStructure.Common.ThreadGroupIdStartingY);
}
inline void setThreadGroupIdYDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdYDimension = value;
}
inline uint32_t getThreadGroupIdYDimension() const {
return (TheStructure.Common.ThreadGroupIdYDimension);
}
inline void setThreadGroupIdStartingResumeZ(const uint32_t value) {
TheStructure.Common.ThreadGroupIdStartingResumeZ = value;
}
inline uint32_t getThreadGroupIdStartingResumeZ() const {
return (TheStructure.Common.ThreadGroupIdStartingResumeZ);
}
inline void setThreadGroupIdZDimension(const uint32_t value) {
TheStructure.Common.ThreadGroupIdZDimension = value;
}
inline uint32_t getThreadGroupIdZDimension() const {
return (TheStructure.Common.ThreadGroupIdZDimension);
}
inline void setRightExecutionMask(const uint32_t value) {
TheStructure.Common.RightExecutionMask = value;
}
inline uint32_t getRightExecutionMask() const {
return (TheStructure.Common.RightExecutionMask);
}
inline void setBottomExecutionMask(const uint32_t value) {
TheStructure.Common.BottomExecutionMask = value;
}
inline uint32_t getBottomExecutionMask() const {
return (TheStructure.Common.BottomExecutionMask);
}
// Member alias naming the interface descriptor type associated with this walker;
// it refers to INTERFACE_DESCRIPTOR_DATA, so that type must be declared earlier.
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
} GPGPU_WALKER;
// 15 dwords * 4 bytes: guards against accidental layout changes.
STATIC_ASSERT(60 == sizeof(GPGPU_WALKER));
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
union tagTheStructure {
struct tagCommon {

View File

@@ -5382,11 +5382,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
inline uint32_t getNumberOfThreadsInGpgpuThreadGroup() const {
return TheStructure.Common.NumberOfThreadsInGpgpuThreadGroup;
}
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
TheStructure.Common.SharedLocalMemorySize = value;
}
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
inline uint32_t getSharedLocalMemorySize() const { // patched
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
}
inline void setRoundingMode(const ROUNDING_MODE value) {
TheStructure.Common.RoundingMode = value;
@@ -5808,6 +5808,7 @@ typedef struct tagCOMPUTE_WALKER {
inline uint32_t *getInlineDataPointer() {
return reinterpret_cast<uint32_t *>(&TheStructure.Common.InlineData);
}
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
} COMPUTE_WALKER;
STATIC_ASSERT(156 == sizeof(COMPUTE_WALKER));

View File

@@ -5149,11 +5149,11 @@ typedef struct tagINTERFACE_DESCRIPTOR_DATA {
inline uint32_t getNumberOfThreadsInGpgpuThreadGroup() const {
return TheStructure.Common.NumberOfThreadsInGpgpuThreadGroup;
}
inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) {
inline void setSharedLocalMemorySize(const uint32_t value) { // patched
TheStructure.Common.SharedLocalMemorySize = value;
}
inline SHARED_LOCAL_MEMORY_SIZE getSharedLocalMemorySize() const {
return static_cast<SHARED_LOCAL_MEMORY_SIZE>(TheStructure.Common.SharedLocalMemorySize);
inline uint32_t getSharedLocalMemorySize() const { // patched
return static_cast<uint32_t>(TheStructure.Common.SharedLocalMemorySize);
}
inline void setRoundingMode(const ROUNDING_MODE value) {
TheStructure.Common.RoundingMode = value;
@@ -5579,6 +5579,7 @@ typedef struct tagCOMPUTE_WALKER {
inline uint32_t *getInlineDataPointer() {
return reinterpret_cast<uint32_t *>(&TheStructure.Common.InlineData);
}
using InterfaceDescriptorType = INTERFACE_DESCRIPTOR_DATA; // patched
} COMPUTE_WALKER;
STATIC_ASSERT(156 == sizeof(COMPUTE_WALKER));

View File

@@ -16,7 +16,6 @@ enum class MemoryCompressionState;
class GmmHelper;
class IndirectHeap;
class LinearStream;
struct DispatchFlags;
struct HardwareInfo;
struct StateBaseAddressProperties;

View File

@@ -11,6 +11,7 @@
#include "shared/source/gmm_helper/cache_settings_helper.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/cache_policy.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/state_base_address.h"
#include "shared/source/indirect_heap/indirect_heap.h"
@@ -21,6 +22,7 @@ namespace NEO {
template <typename GfxFamily>
void StateBaseAddressHelper<GfxFamily>::programStateBaseAddressIntoCommandStream(StateBaseAddressHelperArgs<GfxFamily> &args, NEO::LinearStream &commandStream) {
StateBaseAddressHelper<GfxFamily>::programStateBaseAddress(args);
auto cmdSpace = StateBaseAddressHelper<GfxFamily>::getSpaceForSbaCmd(commandStream);
*cmdSpace = *args.stateBaseAddressCmd;

View File

@@ -86,7 +86,6 @@ class ProductHelper {
virtual uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const = 0;
virtual void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
virtual void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
virtual void updateIddCommand(void *const commandPtr, uint32_t numGrf, int32_t threadArbitrationPolicy) const = 0;
virtual bool obtainBlitterPreference(const HardwareInfo &hwInfo) const = 0;
virtual bool isBlitterFullySupported(const HardwareInfo &hwInfo) const = 0;
virtual bool isPageTableManagerSupported(const HardwareInfo &hwInfo) const = 0;

View File

@@ -203,9 +203,6 @@ void ProductHelperHw<gfxProduct>::setForceNonCoherent(void *const commandPtr, co
template <PRODUCT_FAMILY gfxProduct>
void ProductHelperHw<gfxProduct>::updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const {}
template <PRODUCT_FAMILY gfxProduct>
void ProductHelperHw<gfxProduct>::updateIddCommand(void *const commandPtr, uint32_t numGrf, int32_t threadArbitrationPolicy) const {}
template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::isPageTableManagerSupported(const HardwareInfo &hwInfo) const {
return false;

View File

@@ -38,7 +38,6 @@ class ProductHelperHw : public ProductHelper {
uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const override;
void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const override;
void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const override;
void updateIddCommand(void *const commandPtr, uint32_t numGrf, int32_t threadArbitrationPolicy) const override;
bool obtainBlitterPreference(const HardwareInfo &hwInfo) const override;
bool isBlitterFullySupported(const HardwareInfo &hwInfo) const override;
bool isPageTableManagerSupported(const HardwareInfo &hwInfo) const override;

View File

@@ -27,12 +27,14 @@ using Family = NEO::XeHpcCoreFamily;
namespace NEO {
template <>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {
template <typename WalkerType>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo) {
walkerCmd.getPostSync().setDataportSubsliceCacheFlush(true);
}
template <>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd) {
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
const auto &productHelper = device.getProductHelper();
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
@@ -255,6 +257,7 @@ inline void EncodeMiFlushDW<Family>::adjust(MI_FLUSH_DW *miFlushDwCmd, const Pro
miFlushDwCmd->setFlushLlc(1);
}
template <>
template <>
void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor,
uint32_t value,
@@ -273,7 +276,8 @@ void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT
}
template <>
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
const auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
auto programGlobalFenceAsPostSyncOperationInComputeWalker = productHelper.isGlobalFenceInCommandStreamRequired(hwInfo) &&
@@ -297,8 +301,9 @@ void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDevice
}
template <>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
using PREFERRED_SLM_ALLOCATION_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
const uint32_t workGroupCountPerDss = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));
@@ -367,6 +372,13 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
}
template struct EncodeDispatchKernel<Family>;
// Explicit instantiations of EncodeDispatchKernel<Family> member function templates
// for this family's Family::WALKER_TYPE and Family::INTERFACE_DESCRIPTOR_DATA.
// NOTE(review): presumably needed because these member templates are defined outside
// the class header and must be emitted from this translation unit - confirm.
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
template struct EncodeStates<Family>;
template struct EncodeMath<Family>;
template struct EncodeMathMMIO<Family>;

View File

@@ -133,6 +133,26 @@ struct XeHpcCoreFamily : public XeHpcCore {
// Compile-time check of command-set compatibility: true only when the queried
// base family is IGFX_XE_HP_CORE, false for every other value.
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
    constexpr auto supportedBaseFamily = IGFX_XE_HP_CORE;
    return supportedBaseFamily == cmdSetBaseFamily;
}
// Returns cmdInitGpgpuWalker as a WalkerType value (WalkerType defaults to WALKER_TYPE).
// The return statement copy-initializes the result, so cmdInitGpgpuWalker must be
// implicitly convertible to the requested WalkerType.
// NOTE(review): cmdInitGpgpuWalker is declared outside this view - presumably the
// family's initial walker command; confirm at its declaration.
template <typename WalkerType = WALKER_TYPE>
static WalkerType getInitGpuWalker() {
return cmdInitGpgpuWalker;
}
// Size in bytes of INTERFACE_DESCRIPTOR_DATA, available at compile time.
// WalkerType (default WALKER_TYPE) is not used by the body, so every
// instantiation yields the same value.
template <typename WalkerType = WALKER_TYPE>
static constexpr size_t getInterfaceDescriptorSize() {
    constexpr size_t interfaceDescriptorSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
    return interfaceDescriptorSize;
}
// Returns cmdInitInterfaceDescriptorData as an InterfaceDescriptorType value.
// InterfaceDescriptorType has no default and cannot be deduced (there are no
// function parameters), so callers must name it explicitly. The return
// copy-initializes the result, so cmdInitInterfaceDescriptorData must be
// implicitly convertible to InterfaceDescriptorType.
template <typename InterfaceDescriptorType>
static InterfaceDescriptorType getInitInterfaceDescriptor() {
return cmdInitInterfaceDescriptorData;
}
// Compile-time heapless-mode query. The answer is false for every WalkerType;
// the template argument is not inspected by the body.
template <typename WalkerType>
static constexpr bool isHeaplessMode() {
    constexpr bool heaplessMode = false;
    return heaplessMode;
}
};
} // namespace NEO

View File

@@ -30,4 +30,7 @@ bool ImplicitScalingDispatch<Family>::platformSupportsImplicitScaling(const Root
}
// Explicit instantiation of ImplicitScalingDispatch for this family, followed by
// explicit instantiations of its member function templates dispatchCommands and
// getSize for Family::WALKER_TYPE.
template struct ImplicitScalingDispatch<Family>;
template void ImplicitScalingDispatch<Family>::dispatchCommands<Family::WALKER_TYPE>(LinearStream &commandStream, Family::WALKER_TYPE &walkerCmd, void **outWalkerPtr, const DeviceBitfield &devices, uint32_t &partitionCount, bool useSecondaryBatchBuffer, bool apiSelfCleanup, bool usesImages, bool dcFlush, bool forceExecutionOnSingleTile, uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo);
template size_t ImplicitScalingDispatch<Family>::getSize<Family::WALKER_TYPE>(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount);
} // namespace NEO

View File

@@ -26,15 +26,17 @@ using Family = NEO::XeHpgCoreFamily;
namespace NEO {
template <>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {
template <typename WalkerType>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const HardwareInfo &hwInfo) {
auto &postSyncData = walkerCmd.getPostSync();
postSyncData.setDataportSubsliceCacheFlush(true);
}
template <>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
using PREFERRED_SLM_ALLOCATION_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
const uint32_t workGroupCountPerDss = threadsPerDssCount / threadsPerThreadGroup;
@@ -92,7 +94,8 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTO
}
template <>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd) {
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
const auto &productHelper = device.getProductHelper();
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
if (interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup() == 1) {
@@ -108,6 +111,7 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
}
}
template <>
template <>
void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo) {
using BARRIERS = INTERFACE_DESCRIPTOR_DATA::NUMBER_OF_BARRIERS;
@@ -118,7 +122,8 @@ void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT
}
template <>
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
auto *releaseHelper = rootDeviceEnvironment.getReleaseHelper();
bool l3PrefetchDisable = releaseHelper->isPrefetchDisablingRequired();
int32_t overrideL3PrefetchDisable = DebugManager.flags.ForceL3PrefetchForComputeWalker.get();
@@ -217,6 +222,13 @@ size_t EncodeMiFlushDW<Family>::getWaSize(const EncodeDummyBlitWaArgs &waArgs) {
template void flushGpuCache<Family>(LinearStream *commandStream, const Range<L3Range> &ranges, uint64_t postSyncAddress, const HardwareInfo &hwInfo);
template struct EncodeDispatchKernel<Family>;
// Explicit instantiations of EncodeDispatchKernel<Family> member function templates
// for this family's Family::WALKER_TYPE and Family::INTERFACE_DESCRIPTOR_DATA.
// NOTE(review): presumably needed because these member templates are defined outside
// the class header and must be emitted from this translation unit - confirm.
template void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::WALKER_TYPE>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::WALKER_TYPE &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void EncodeDispatchKernel<Family>::adjustTimestampPacket<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
template void EncodeDispatchKernel<Family>::setGrfInfo<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
template void EncodeDispatchKernel<Family>::appendAdditionalIDDFields<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
template void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData<Family::WALKER_TYPE, Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, Family::WALKER_TYPE &walkerCmd);
template void EncodeDispatchKernel<Family>::setupPostSyncMocs<Family::WALKER_TYPE>(Family::WALKER_TYPE &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush);
template struct EncodeStates<Family>;
template struct EncodeMath<Family>;
template struct EncodeMathMMIO<Family>;

View File

@@ -149,6 +149,26 @@ struct XeHpgCoreFamily : public XeHpgCore {
// Compile-time check of command-set compatibility: true only when the queried
// base family is IGFX_XE_HP_CORE, false for every other value.
static constexpr bool supportsCmdSet(GFXCORE_FAMILY cmdSetBaseFamily) {
    constexpr auto supportedBaseFamily = IGFX_XE_HP_CORE;
    return supportedBaseFamily == cmdSetBaseFamily;
}
// Returns cmdInitGpgpuWalker as a WalkerType value (WalkerType defaults to WALKER_TYPE).
// The return statement copy-initializes the result, so cmdInitGpgpuWalker must be
// implicitly convertible to the requested WalkerType.
// NOTE(review): cmdInitGpgpuWalker is declared outside this view - presumably the
// family's initial walker command; confirm at its declaration.
template <typename WalkerType = WALKER_TYPE>
static WalkerType getInitGpuWalker() {
return cmdInitGpgpuWalker;
}
// Size in bytes of INTERFACE_DESCRIPTOR_DATA, available at compile time.
// WalkerType (default WALKER_TYPE) is not used by the body, so every
// instantiation yields the same value.
template <typename WalkerType = WALKER_TYPE>
static constexpr size_t getInterfaceDescriptorSize() {
    constexpr size_t interfaceDescriptorSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
    return interfaceDescriptorSize;
}
// Returns cmdInitInterfaceDescriptorData as an InterfaceDescriptorType value.
// InterfaceDescriptorType has no default and cannot be deduced (there are no
// function parameters), so callers must name it explicitly. The return
// copy-initializes the result, so cmdInitInterfaceDescriptorData must be
// implicitly convertible to InterfaceDescriptorType.
template <typename InterfaceDescriptorType>
static InterfaceDescriptorType getInitInterfaceDescriptor() {
return cmdInitInterfaceDescriptorData;
}
// Compile-time heapless-mode query. The answer is false for every WalkerType;
// the template argument is not inspected by the body.
template <typename WalkerType>
static constexpr bool isHeaplessMode() {
    constexpr bool heaplessMode = false;
    return heaplessMode;
}
};
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
* Copyright (C) 2021-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -17,4 +17,7 @@ template <>
bool ImplicitScalingDispatch<Family>::pipeControlStallRequired = true;
// Explicit instantiation of ImplicitScalingDispatch for this family, followed by
// explicit instantiations of its member function templates dispatchCommands and
// getSize for Family::WALKER_TYPE.
template struct ImplicitScalingDispatch<Family>;
template void ImplicitScalingDispatch<Family>::dispatchCommands<Family::WALKER_TYPE>(LinearStream &commandStream, Family::WALKER_TYPE &walkerCmd, void **outWalkerPtr, const DeviceBitfield &devices, uint32_t &partitionCount, bool useSecondaryBatchBuffer, bool apiSelfCleanup, bool usesImages, bool dcFlush, bool forceExecutionOnSingleTile, uint64_t workPartitionAllocationGpuVa, const HardwareInfo &hwInfo);
template size_t ImplicitScalingDispatch<Family>::getSize<Family::WALKER_TYPE>(bool apiSelfCleanup, bool preferStaticPartitioning, const DeviceBitfield &devices, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount);
} // namespace NEO

View File

@@ -319,10 +319,6 @@ template <>
void ProductHelperHw<IGFX_UNKNOWN>::updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const {
    // ProductHelperHw<IGFX_UNKNOWN> variant: empty body, commandPtr and properties are not touched.
}
template <>
void ProductHelperHw<IGFX_UNKNOWN>::updateIddCommand(void *const commandPtr, uint32_t numGrf, int32_t threadArbitrationPolicy) const {
}
// ProductHelperHw<IGFX_UNKNOWN> specialization of enableCompression.
// The body is empty, so the HardwareInfo pointed to by hwInfo is left unmodified.
template <>
void ProductHelperHw<IGFX_UNKNOWN>::enableCompression(HardwareInfo *hwInfo) const {
}

View File

@@ -33,9 +33,9 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenOverrideSlmTotalSizeDebugVari
bool requiresUncachedMocs = false;
int32_t maxValueToProgram = 0xC;
uint32_t maxValueToProgram = 0xC;
for (int32_t valueToProgram = 0x0; valueToProgram < maxValueToProgram; valueToProgram++) {
for (uint32_t valueToProgram = 0x0; valueToProgram < maxValueToProgram; valueToProgram++) {
DebugManager.flags.OverrideSlmAllocationSize.set(valueToProgram);
cmdContainer->reset();
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);

View File

@@ -176,9 +176,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenOverrideSlmTotalSizeD
dispatchInterface->getSlmTotalSizeResult = slmTotalSize;
bool requiresUncachedMocs = false;
int32_t maxValueToProgram = 0x8;
uint32_t maxValueToProgram = 0x8;
for (int32_t valueToProgram = 0x0; valueToProgram < maxValueToProgram; valueToProgram++) {
for (uint32_t valueToProgram = 0x0; valueToProgram < maxValueToProgram; valueToProgram++) {
DebugManager.flags.OverrideSlmAllocationSize.set(valueToProgram);
cmdContainer->reset();
@@ -1164,7 +1164,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
size_t total = cmdContainer->getCommandStream()->getUsed();
size_t partitionedWalkerSize = total - containerUsedAfterBase;
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
GenCmdList partitionedWalkerList;
@@ -1215,7 +1215,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
EXPECT_EQ(2u, dispatchArgs.partitionCount);
size_t partitionedWalkerSize = cmdContainer->getCommandStream()->getUsed();
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
GenCmdList partitionedWalkerList;
@@ -1314,7 +1314,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling,
EXPECT_EQ(2u, dispatchArgs.partitionCount);
size_t partitionedWalkerSize = cmdContainer->getCommandStream()->getUsed();
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
GenCmdList partitionedWalkerList;

View File

@@ -33,7 +33,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenGetSizeWhenDispatchingCm
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
@@ -76,7 +76,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndNoPartiti
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, false, false, false, dcFlushFlag,
@@ -120,7 +120,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenWorkgroupOneAndPartition
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
@@ -167,7 +167,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenDi
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
@@ -219,7 +219,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningWhenPa
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
@@ -273,7 +273,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
@@ -324,7 +324,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
@@ -361,7 +361,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
@@ -398,7 +398,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPrefe
size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;
expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
expectedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));
uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, nullptr, twoTile, partitionCount, true, false, false, dcFlushFlag,
@@ -445,7 +445,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
@@ -513,7 +513,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
@@ -573,7 +573,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
@@ -633,7 +633,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
@@ -700,7 +700,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
@@ -763,7 +763,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
@@ -828,7 +828,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
@@ -896,7 +896,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(true, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;
@@ -963,7 +963,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
size_t estimatedSize = 0;
size_t totalBytesProgrammed = 0;
estimatedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
estimatedSize = ImplicitScalingDispatch<FamilyType>::template getSize<WALKER_TYPE>(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(32, 1, 1));
EXPECT_EQ(expectedSize, estimatedSize);
uint32_t partitionCount = 0;

View File

@@ -53,7 +53,9 @@ class CommandEncodeStatesFixture : public DeviceFixture {
false, // useGlobalAtomics
false, // multiOsContextCapable
false, // isRcs
container->doubleSbaWaRef()}; // doubleSbaWa
container->doubleSbaWaRef(), // doubleSbaWa
false // heaplessModeEnabled
};
return args;
}