Refactor the HardwareInterface API: pass walker arguments through a HardwareInterfaceWalkerArgs struct

Related-To: NEO-6959

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2022-07-01 18:03:54 +00:00
committed by Compute-Runtime-Automation
parent 789dd1900e
commit 461a2eb8c7
13 changed files with 244 additions and 396 deletions

View File

@@ -440,16 +440,19 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
hwPerfCounter = event->getHwPerfCounterNode();
}
HardwareInterfaceWalkerArgs dispatchWalkerArgs = {};
dispatchWalkerArgs.blockedCommandsData = blockedCommandsData;
dispatchWalkerArgs.hwTimeStamps = hwTimeStamps;
dispatchWalkerArgs.hwPerfCounter = hwPerfCounter;
dispatchWalkerArgs.timestampPacketDependencies = &timestampPacketDependencies;
dispatchWalkerArgs.currentTimestampPacketNodes = timestampPacketContainer.get();
dispatchWalkerArgs.commandType = commandType;
HardwareInterface<GfxFamily>::dispatchWalker(
*this,
multiDispatchInfo,
csrDeps,
blockedCommandsData,
hwTimeStamps,
hwPerfCounter,
&timestampPacketDependencies,
timestampPacketContainer.get(),
commandType);
dispatchWalkerArgs);
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
for (auto &dispatchInfo : multiDispatchInfo) {

View File

@@ -27,6 +27,24 @@ struct MultiDispatchInfo;
template <class T>
class TagNode;
// Bundle of per-enqueue and per-dispatch state handed by reference through
// HardwareInterface::dispatchWalker -> dispatchKernelCommands -> programWalker,
// replacing the long positional parameter lists those functions used to take.
// Member order is kept stable; only the initializer spelling differs.
struct HardwareInterfaceWalkerArgs {
    // Filled by dispatchKernelCommands from dispatchInfo.getGWS() (x, y, z).
    size_t globalWorkSizes[3]{};
    // Filled by dispatchKernelCommands from dispatchInfo.getLocalWorkgroupSize() (x, y, z).
    size_t localWorkSizes[3]{};

    // Forwarded to dispatchProfilingPerfStartCommands / dispatchProfilingPerfEndCommands.
    TagNodeBase *hwTimeStamps{nullptr};
    TagNodeBase *hwPerfCounter{nullptr};

    // Forwarded to DispatchInfo::dispatchInitCommands / dispatchEpilogueCommands.
    TimestampPacketDependencies *timestampPacketDependencies{nullptr};
    // Indexed with currentDispatchIndex via peekNodes().at(...); programWalker
    // null-checks it before setting up the timestamp packet.
    TimestampPacketContainer *currentTimestampPacketNodes{nullptr};

    // Non-owning pointers; dispatchKernelCommands points them at the current
    // DispatchInfo's getNumberOfWorkgroups() / getStartOfWorkgroups() results.
    const Vec3<size_t> *numberOfWorkgroups{nullptr};
    const Vec3<size_t> *startOfWorkgroups{nullptr};

    // When non-null, dispatchWalker treats the queue as blocked and takes the
    // command stream and heaps from this object.
    KernelOperation *blockedCommandsData{nullptr};

    // Reset to 0 by dispatchWalker and incremented after each dispatched kernel.
    size_t currentDispatchIndex{0};
    // Set by dispatchWalker to dsh->getUsed() after aligning the DSH.
    size_t offsetInterfaceDescriptorTable{0};
    // Set by dispatchWalker from ClPreemptionHelper::taskPreemptionMode(...).
    PreemptionMode preemptionMode{PreemptionMode::Initial};
    // Compared against CL_COMMAND_NDRANGE_KERNEL in dispatchKernelCommands.
    uint32_t commandType{0};
    // Reset to 0 by dispatchWalker; forwarded by programWalker.
    uint32_t interfaceDescriptorIndex{0};
    // True when the current DispatchInfo's kernel is multiDispatchInfo.peekMainKernel().
    bool isMainKernel{false};
};
template <typename GfxFamily>
class HardwareInterface {
public:
@@ -37,12 +55,7 @@ class HardwareInterface {
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
const CsrDependencies &csrDependencies,
KernelOperation *blockedCommandsData,
TagNodeBase *hwTimeStamps,
TagNodeBase *hwPerfCounter,
TimestampPacketDependencies *timestampPacketDependencies,
TimestampPacketContainer *currentTimestampPacketNodes,
uint32_t commandType);
HardwareInterfaceWalkerArgs &walkerArgs);
static void getDefaultDshSpace(
const size_t &offsetInterfaceDescriptorTable,
@@ -81,19 +94,11 @@ class HardwareInterface {
LinearStream &commandStream,
Kernel &kernel,
CommandQueue &commandQueue,
TimestampPacketContainer *currentTimestampPacketNodes,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
size_t globalWorkSizes[3],
size_t localWorkSizes[3],
PreemptionMode preemptionMode,
size_t currentDispatchIndex,
uint32_t &interfaceDescriptorIndex,
const DispatchInfo &dispatchInfo,
size_t offsetInterfaceDescriptorTable,
const Vec3<size_t> &numberOfWorkgroups,
const Vec3<size_t> &startOfWorkgroups);
HardwareInterfaceWalkerArgs &walkerArgs);
static WALKER_TYPE *allocateWalkerSpace(LinearStream &commandStream,
const Kernel &kernel);
@@ -101,11 +106,9 @@ class HardwareInterface {
static void obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh);
static void dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, uint32_t commandType,
LinearStream &commandStream, bool isMainKernel, size_t currentDispatchIndex,
TimestampPacketContainer *currentTimestampPacketNodes, PreemptionMode preemptionMode,
uint32_t &interfaceDescriptorIndex, size_t offsetInterfaceDescriptorTable,
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh);
static void dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh,
HardwareInterfaceWalkerArgs &walkerArgs);
};
} // namespace NEO

View File

@@ -64,17 +64,12 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
const CsrDependencies &csrDependencies,
KernelOperation *blockedCommandsData,
TagNodeBase *hwTimeStamps,
TagNodeBase *hwPerfCounter,
TimestampPacketDependencies *timestampPacketDependencies,
TimestampPacketContainer *currentTimestampPacketNodes,
uint32_t commandType) {
HardwareInterfaceWalkerArgs &walkerArgs) {
LinearStream *commandStream = nullptr;
IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
auto mainKernel = multiDispatchInfo.peekMainKernel();
auto preemptionMode = ClPreemptionHelper::taskPreemptionMode(commandQueue.getDevice(), multiDispatchInfo);
walkerArgs.preemptionMode = ClPreemptionHelper::taskPreemptionMode(commandQueue.getDevice(), multiDispatchInfo);
for (auto &dispatchInfo : multiDispatchInfo) {
// Compute local workgroup sizes
@@ -85,11 +80,11 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
}
// Allocate command stream and indirect heaps
bool blockedQueue = (blockedCommandsData != nullptr);
bool blockedQueue = (walkerArgs.blockedCommandsData != nullptr);
obtainIndirectHeaps(commandQueue, multiDispatchInfo, blockedQueue, dsh, ioh, ssh);
if (blockedQueue) {
blockedCommandsData->setHeaps(dsh, ioh, ssh);
commandStream = blockedCommandsData->commandStream.get();
walkerArgs.blockedCommandsData->setHeaps(dsh, ioh, ssh);
commandStream = walkerArgs.blockedCommandsData->commandStream.get();
} else {
commandStream = &commandQueue.getCS(0);
}
@@ -119,22 +114,22 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
dsh->align(EncodeStates<GfxFamily>::alignInterfaceDescriptorData);
uint32_t interfaceDescriptorIndex = 0;
const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
walkerArgs.interfaceDescriptorIndex = 0;
walkerArgs.offsetInterfaceDescriptorTable = dsh->getUsed();
size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, dsh, commandStream);
getDefaultDshSpace(walkerArgs.offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, dsh, commandStream);
// Program media interface descriptor load
HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
*commandStream,
offsetInterfaceDescriptorTable,
walkerArgs.offsetInterfaceDescriptorTable,
totalInterfaceDescriptorTableSize);
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
DEBUG_BREAK_IF(walkerArgs.offsetInterfaceDescriptorTable % 64 != 0);
dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
dispatchProfilingPerfStartCommands(walkerArgs.hwTimeStamps, walkerArgs.hwPerfCounter, commandStream, commandQueue);
const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnEnqueue.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount(), PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
@@ -146,25 +141,23 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
multiDispatchInfo.begin()->getLocalWorkgroupSize(),
multiDispatchInfo.begin()->getActualWorkgroupSize(),
multiDispatchInfo.begin()->getOffset(),
currentTimestampPacketNodes);
walkerArgs.currentTimestampPacketNodes);
size_t currentDispatchIndex = 0;
walkerArgs.currentDispatchIndex = 0;
for (auto &dispatchInfo : multiDispatchInfo) {
dispatchInfo.dispatchInitCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo());
bool isMainKernel = (dispatchInfo.getKernel() == mainKernel);
dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo());
walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel);
dispatchKernelCommands(commandQueue, dispatchInfo, commandType, *commandStream, isMainKernel,
currentDispatchIndex, currentTimestampPacketNodes, preemptionMode, interfaceDescriptorIndex,
offsetInterfaceDescriptorTable, *dsh, *ioh, *ssh);
dispatchKernelCommands(commandQueue, dispatchInfo, *commandStream, *dsh, *ioh, *ssh, walkerArgs);
currentDispatchIndex++;
dispatchInfo.dispatchEpilogueCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo());
walkerArgs.currentDispatchIndex++;
dispatchInfo.dispatchEpilogueCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo());
}
if (mainKernel->requiresCacheFlushCommand(commandQueue)) {
uint64_t postSyncAddress = 0;
if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
auto timestampPacketNodeForPostSync = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
auto timestampPacketNodeForPostSync = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
timestampPacketNodeForPostSync->setProfilingCapable(false);
postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync);
}
@@ -182,15 +175,13 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
DebugPauseState::hasUserEndConfirmation, hwInfo);
}
dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
dispatchProfilingPerfEndCommands(walkerArgs.hwTimeStamps, walkerArgs.hwPerfCounter, commandStream, commandQueue);
}
template <typename GfxFamily>
void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, uint32_t commandType,
LinearStream &commandStream, bool isMainKernel, size_t currentDispatchIndex,
TimestampPacketContainer *currentTimestampPacketNodes, PreemptionMode preemptionMode,
uint32_t &interfaceDescriptorIndex, size_t offsetInterfaceDescriptorTable,
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh) {
void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh,
HardwareInterfaceWalkerArgs &walkerArgs) {
auto &kernel = *dispatchInfo.getKernel();
DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
@@ -199,7 +190,7 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
// If we don't have a required WGS, compute one opportunistically
if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
if (walkerArgs.commandType == CL_COMMAND_NDRANGE_KERNEL) {
provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), dispatchInfo);
}
@@ -207,7 +198,7 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
auto dim = dispatchInfo.getDim();
const auto &gws = dispatchInfo.getGWS();
const auto &offset = dispatchInfo.getOffset();
const auto &startOfWorkgroups = dispatchInfo.getStartOfWorkgroups();
walkerArgs.startOfWorkgroups = &dispatchInfo.getStartOfWorkgroups();
// Compute local workgroup sizes
const auto &lws = dispatchInfo.getLocalWorkgroupSize();
@@ -215,37 +206,39 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
// Compute number of work groups
const auto &totalNumberOfWorkgroups = dispatchInfo.getTotalNumberOfWorkgroups();
const auto &numberOfWorkgroups = dispatchInfo.getNumberOfWorkgroups();
walkerArgs.numberOfWorkgroups = &dispatchInfo.getNumberOfWorkgroups();
UNRECOVERABLE_IF(totalNumberOfWorkgroups.x == 0);
UNRECOVERABLE_IF(numberOfWorkgroups.x == 0);
UNRECOVERABLE_IF(walkerArgs.numberOfWorkgroups->x == 0);
size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
walkerArgs.globalWorkSizes[0] = gws.x;
walkerArgs.globalWorkSizes[1] = gws.y;
walkerArgs.globalWorkSizes[2] = gws.z;
// Patch our kernel constants
kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));
if (isMainKernel || (!kernel.isLocalWorkSize2Patchable())) {
if (walkerArgs.isMainKernel || (!kernel.isLocalWorkSize2Patchable())) {
kernel.setLocalWorkSizeValues(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
}
kernel.setLocalWorkSize2Values(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
kernel.setEnqueuedLocalWorkSizeValues(static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));
if (isMainKernel) {
if (walkerArgs.isMainKernel) {
kernel.setNumWorkGroupsValues(static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
}
kernel.setWorkDim(dim);
// Send our indirect object data
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
walkerArgs.localWorkSizes[0] = lws.x;
walkerArgs.localWorkSizes[1] = lws.y;
walkerArgs.localWorkSizes[2] = lws.z;
dispatchWorkarounds(&commandStream, commandQueue, kernel, true);
programWalker(commandStream, kernel, commandQueue, currentTimestampPacketNodes, dsh, ioh, ssh, globalWorkSizes,
localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo,
offsetInterfaceDescriptorTable, numberOfWorkgroups, startOfWorkgroups);
programWalker(commandStream, kernel, commandQueue, dsh, ioh, ssh, dispatchInfo, walkerArgs);
dispatchWorkarounds(&commandStream, commandQueue, kernel, false);
}

View File

@@ -51,19 +51,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
LinearStream &commandStream,
Kernel &kernel,
CommandQueue &commandQueue,
TimestampPacketContainer *currentTimestampPacketNodes,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
size_t globalWorkSizes[3],
size_t localWorkSizes[3],
PreemptionMode preemptionMode,
size_t currentDispatchIndex,
uint32_t &interfaceDescriptorIndex,
const DispatchInfo &dispatchInfo,
size_t offsetInterfaceDescriptorTable,
const Vec3<size_t> &numberOfWorkgroups,
const Vec3<size_t> &startOfWorkgroups) {
HardwareInterfaceWalkerArgs &walkerArgs) {
auto walkerCmdBuf = allocateWalkerSpace(commandStream, kernel);
WALKER_TYPE walkerCmd = GfxFamily::cmdInitGpgpuWalker;
@@ -71,11 +63,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
if (currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
if (walkerArgs.currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
auto timestampPacketNode = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacketNode, commandQueue.getDevice().getRootDeviceEnvironment());
}
@@ -90,10 +82,10 @@ inline void HardwareInterface<GfxFamily>::programWalker(
kernel,
kernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
interfaceDescriptorIndex,
preemptionMode,
walkerArgs.localWorkSizes,
walkerArgs.offsetInterfaceDescriptorTable,
walkerArgs.interfaceDescriptorIndex,
walkerArgs.preemptionMode,
&walkerCmd,
nullptr,
true,
@@ -101,11 +93,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernel.getKernelInfo().kernelDescriptor,
globalOffsets, startWorkGroups,
numWorkGroups, localWorkSizes, simd, dim,
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
false, false, 0u);
EncodeWalkerArgs walkerArgs{kernel.getExecutionType(), false};
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(commandQueue.getDevice().getHardwareInfo(), walkerCmd, walkerArgs);
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), false};
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(commandQueue.getDevice().getHardwareInfo(), walkerCmd, encodeWalkerArgs);
*walkerCmdBuf = walkerCmd;
}
} // namespace NEO

View File

@@ -41,19 +41,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
LinearStream &commandStream,
Kernel &kernel,
CommandQueue &commandQueue,
TimestampPacketContainer *currentTimestampPacketNodes,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
size_t globalWorkSizes[3],
size_t localWorkSizes[3],
PreemptionMode preemptionMode,
size_t currentDispatchIndex,
uint32_t &interfaceDescriptorIndex,
const DispatchInfo &dispatchInfo,
size_t offsetInterfaceDescriptorTable,
const Vec3<size_t> &numberOfWorkgroups,
const Vec3<size_t> &startOfWorkgroups) {
HardwareInterfaceWalkerArgs &walkerArgs) {
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
@@ -66,13 +58,13 @@ inline void HardwareInterface<GfxFamily>::programWalker(
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
uint32_t requiredWalkOrder = 0u;
bool localIdsGenerationByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
numChannels,
localWorkSizes,
walkerArgs.localWorkSizes,
std::array<uint8_t, 3>{{kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
@@ -84,8 +76,8 @@ inline void HardwareInterface<GfxFamily>::programWalker(
auto idd = &walkerCmd.getInterfaceDescriptor();
auto &queueCsr = commandQueue.getGpgpuCommandStreamReceiver();
if (currentTimestampPacketNodes && queueCsr.peekTimestampPacketWriteEnabled()) {
auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
if (walkerArgs.currentTimestampPacketNodes && queueCsr.peekTimestampPacketWriteEnabled()) {
auto timestampPacket = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacket, commandQueue.getDevice().getRootDeviceEnvironment());
}
@@ -105,21 +97,21 @@ inline void HardwareInterface<GfxFamily>::programWalker(
kernel,
kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, false),
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
interfaceDescriptorIndex,
preemptionMode,
walkerArgs.localWorkSizes,
walkerArgs.offsetInterfaceDescriptorTable,
walkerArgs.interfaceDescriptorIndex,
walkerArgs.preemptionMode,
&walkerCmd,
idd,
localIdsGenerationByRuntime,
commandQueue.getDevice());
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,
numWorkGroups, localWorkSizes, simd, dim,
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);
EncodeWalkerArgs walkerArgs{kernel.getExecutionType(), true};
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, walkerArgs);
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), true};
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, encodeWalkerArgs);
auto devices = queueCsr.getOsContext().getDeviceBitfield();
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, !kernel.isSingleSubdevicePreferred());
@@ -139,7 +131,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
if (queueCsr.isStaticWorkPartitioningEnabled()) {
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
}
auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
auto timestampPacket = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
timestampPacket->setPacketsUsed(partitionCount);
} else {
auto computeWalkerOnStream = commandStream.getSpaceForCmd<typename GfxFamily::COMPUTE_WALKER>();