mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-30 01:35:20 +08:00
Refactor interface to hardware interface
Related-To: NEO-6959 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
789dd1900e
commit
461a2eb8c7
@@ -440,16 +440,19 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
|
||||
hwPerfCounter = event->getHwPerfCounterNode();
|
||||
}
|
||||
|
||||
HardwareInterfaceWalkerArgs dispatchWalkerArgs = {};
|
||||
dispatchWalkerArgs.blockedCommandsData = blockedCommandsData;
|
||||
dispatchWalkerArgs.hwTimeStamps = hwTimeStamps;
|
||||
dispatchWalkerArgs.hwPerfCounter = hwPerfCounter;
|
||||
dispatchWalkerArgs.timestampPacketDependencies = ×tampPacketDependencies;
|
||||
dispatchWalkerArgs.currentTimestampPacketNodes = timestampPacketContainer.get();
|
||||
dispatchWalkerArgs.commandType = commandType;
|
||||
|
||||
HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
*this,
|
||||
multiDispatchInfo,
|
||||
csrDeps,
|
||||
blockedCommandsData,
|
||||
hwTimeStamps,
|
||||
hwPerfCounter,
|
||||
×tampPacketDependencies,
|
||||
timestampPacketContainer.get(),
|
||||
commandType);
|
||||
dispatchWalkerArgs);
|
||||
|
||||
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
|
||||
@@ -27,6 +27,24 @@ struct MultiDispatchInfo;
|
||||
template <class T>
|
||||
class TagNode;
|
||||
|
||||
struct HardwareInterfaceWalkerArgs {
|
||||
size_t globalWorkSizes[3] = {};
|
||||
size_t localWorkSizes[3] = {};
|
||||
TagNodeBase *hwTimeStamps = nullptr;
|
||||
TagNodeBase *hwPerfCounter = nullptr;
|
||||
TimestampPacketDependencies *timestampPacketDependencies = nullptr;
|
||||
TimestampPacketContainer *currentTimestampPacketNodes = nullptr;
|
||||
const Vec3<size_t> *numberOfWorkgroups = nullptr;
|
||||
const Vec3<size_t> *startOfWorkgroups = nullptr;
|
||||
KernelOperation *blockedCommandsData = nullptr;
|
||||
size_t currentDispatchIndex = 0;
|
||||
size_t offsetInterfaceDescriptorTable = 0;
|
||||
PreemptionMode preemptionMode = PreemptionMode::Initial;
|
||||
uint32_t commandType = 0;
|
||||
uint32_t interfaceDescriptorIndex = 0;
|
||||
bool isMainKernel = false;
|
||||
};
|
||||
|
||||
template <typename GfxFamily>
|
||||
class HardwareInterface {
|
||||
public:
|
||||
@@ -37,12 +55,7 @@ class HardwareInterface {
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
const CsrDependencies &csrDependencies,
|
||||
KernelOperation *blockedCommandsData,
|
||||
TagNodeBase *hwTimeStamps,
|
||||
TagNodeBase *hwPerfCounter,
|
||||
TimestampPacketDependencies *timestampPacketDependencies,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes,
|
||||
uint32_t commandType);
|
||||
HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
|
||||
static void getDefaultDshSpace(
|
||||
const size_t &offsetInterfaceDescriptorTable,
|
||||
@@ -81,19 +94,11 @@ class HardwareInterface {
|
||||
LinearStream &commandStream,
|
||||
Kernel &kernel,
|
||||
CommandQueue &commandQueue,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
size_t globalWorkSizes[3],
|
||||
size_t localWorkSizes[3],
|
||||
PreemptionMode preemptionMode,
|
||||
size_t currentDispatchIndex,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
const DispatchInfo &dispatchInfo,
|
||||
size_t offsetInterfaceDescriptorTable,
|
||||
const Vec3<size_t> &numberOfWorkgroups,
|
||||
const Vec3<size_t> &startOfWorkgroups);
|
||||
HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
|
||||
static WALKER_TYPE *allocateWalkerSpace(LinearStream &commandStream,
|
||||
const Kernel &kernel);
|
||||
@@ -101,11 +106,9 @@ class HardwareInterface {
|
||||
static void obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
|
||||
bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh);
|
||||
|
||||
static void dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, uint32_t commandType,
|
||||
LinearStream &commandStream, bool isMainKernel, size_t currentDispatchIndex,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes, PreemptionMode preemptionMode,
|
||||
uint32_t &interfaceDescriptorIndex, size_t offsetInterfaceDescriptorTable,
|
||||
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh);
|
||||
static void dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
|
||||
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh,
|
||||
HardwareInterfaceWalkerArgs &walkerArgs);
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -64,17 +64,12 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
const CsrDependencies &csrDependencies,
|
||||
KernelOperation *blockedCommandsData,
|
||||
TagNodeBase *hwTimeStamps,
|
||||
TagNodeBase *hwPerfCounter,
|
||||
TimestampPacketDependencies *timestampPacketDependencies,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes,
|
||||
uint32_t commandType) {
|
||||
HardwareInterfaceWalkerArgs &walkerArgs) {
|
||||
|
||||
LinearStream *commandStream = nullptr;
|
||||
IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
|
||||
auto mainKernel = multiDispatchInfo.peekMainKernel();
|
||||
auto preemptionMode = ClPreemptionHelper::taskPreemptionMode(commandQueue.getDevice(), multiDispatchInfo);
|
||||
walkerArgs.preemptionMode = ClPreemptionHelper::taskPreemptionMode(commandQueue.getDevice(), multiDispatchInfo);
|
||||
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
// Compute local workgroup sizes
|
||||
@@ -85,11 +80,11 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
}
|
||||
|
||||
// Allocate command stream and indirect heaps
|
||||
bool blockedQueue = (blockedCommandsData != nullptr);
|
||||
bool blockedQueue = (walkerArgs.blockedCommandsData != nullptr);
|
||||
obtainIndirectHeaps(commandQueue, multiDispatchInfo, blockedQueue, dsh, ioh, ssh);
|
||||
if (blockedQueue) {
|
||||
blockedCommandsData->setHeaps(dsh, ioh, ssh);
|
||||
commandStream = blockedCommandsData->commandStream.get();
|
||||
walkerArgs.blockedCommandsData->setHeaps(dsh, ioh, ssh);
|
||||
commandStream = walkerArgs.blockedCommandsData->commandStream.get();
|
||||
} else {
|
||||
commandStream = &commandQueue.getCS(0);
|
||||
}
|
||||
@@ -119,22 +114,22 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
|
||||
dsh->align(EncodeStates<GfxFamily>::alignInterfaceDescriptorData);
|
||||
|
||||
uint32_t interfaceDescriptorIndex = 0;
|
||||
const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
|
||||
walkerArgs.interfaceDescriptorIndex = 0;
|
||||
walkerArgs.offsetInterfaceDescriptorTable = dsh->getUsed();
|
||||
|
||||
size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
|
||||
getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, dsh, commandStream);
|
||||
getDefaultDshSpace(walkerArgs.offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, dsh, commandStream);
|
||||
|
||||
// Program media interface descriptor load
|
||||
HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
||||
*commandStream,
|
||||
offsetInterfaceDescriptorTable,
|
||||
walkerArgs.offsetInterfaceDescriptorTable,
|
||||
totalInterfaceDescriptorTableSize);
|
||||
|
||||
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
|
||||
DEBUG_BREAK_IF(walkerArgs.offsetInterfaceDescriptorTable % 64 != 0);
|
||||
|
||||
dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
|
||||
dispatchProfilingPerfStartCommands(walkerArgs.hwTimeStamps, walkerArgs.hwPerfCounter, commandStream, commandQueue);
|
||||
|
||||
const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
|
||||
if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnEnqueue.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount(), PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
|
||||
@@ -146,25 +141,23 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
multiDispatchInfo.begin()->getLocalWorkgroupSize(),
|
||||
multiDispatchInfo.begin()->getActualWorkgroupSize(),
|
||||
multiDispatchInfo.begin()->getOffset(),
|
||||
currentTimestampPacketNodes);
|
||||
walkerArgs.currentTimestampPacketNodes);
|
||||
|
||||
size_t currentDispatchIndex = 0;
|
||||
walkerArgs.currentDispatchIndex = 0;
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
dispatchInfo.dispatchInitCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo());
|
||||
bool isMainKernel = (dispatchInfo.getKernel() == mainKernel);
|
||||
dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo());
|
||||
walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel);
|
||||
|
||||
dispatchKernelCommands(commandQueue, dispatchInfo, commandType, *commandStream, isMainKernel,
|
||||
currentDispatchIndex, currentTimestampPacketNodes, preemptionMode, interfaceDescriptorIndex,
|
||||
offsetInterfaceDescriptorTable, *dsh, *ioh, *ssh);
|
||||
dispatchKernelCommands(commandQueue, dispatchInfo, *commandStream, *dsh, *ioh, *ssh, walkerArgs);
|
||||
|
||||
currentDispatchIndex++;
|
||||
dispatchInfo.dispatchEpilogueCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo());
|
||||
walkerArgs.currentDispatchIndex++;
|
||||
dispatchInfo.dispatchEpilogueCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo());
|
||||
}
|
||||
|
||||
if (mainKernel->requiresCacheFlushCommand(commandQueue)) {
|
||||
uint64_t postSyncAddress = 0;
|
||||
if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNodeForPostSync = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
auto timestampPacketNodeForPostSync = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
|
||||
timestampPacketNodeForPostSync->setProfilingCapable(false);
|
||||
postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync);
|
||||
}
|
||||
@@ -182,15 +175,13 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
DebugPauseState::hasUserEndConfirmation, hwInfo);
|
||||
}
|
||||
|
||||
dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
|
||||
dispatchProfilingPerfEndCommands(walkerArgs.hwTimeStamps, walkerArgs.hwPerfCounter, commandStream, commandQueue);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, uint32_t commandType,
|
||||
LinearStream &commandStream, bool isMainKernel, size_t currentDispatchIndex,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes, PreemptionMode preemptionMode,
|
||||
uint32_t &interfaceDescriptorIndex, size_t offsetInterfaceDescriptorTable,
|
||||
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh) {
|
||||
void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
|
||||
IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh,
|
||||
HardwareInterfaceWalkerArgs &walkerArgs) {
|
||||
auto &kernel = *dispatchInfo.getKernel();
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
|
||||
@@ -199,7 +190,7 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
|
||||
|
||||
// If we don't have a required WGS, compute one opportunistically
|
||||
if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
|
||||
if (walkerArgs.commandType == CL_COMMAND_NDRANGE_KERNEL) {
|
||||
provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), dispatchInfo);
|
||||
}
|
||||
|
||||
@@ -207,7 +198,7 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
|
||||
auto dim = dispatchInfo.getDim();
|
||||
const auto &gws = dispatchInfo.getGWS();
|
||||
const auto &offset = dispatchInfo.getOffset();
|
||||
const auto &startOfWorkgroups = dispatchInfo.getStartOfWorkgroups();
|
||||
walkerArgs.startOfWorkgroups = &dispatchInfo.getStartOfWorkgroups();
|
||||
|
||||
// Compute local workgroup sizes
|
||||
const auto &lws = dispatchInfo.getLocalWorkgroupSize();
|
||||
@@ -215,37 +206,39 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
|
||||
|
||||
// Compute number of work groups
|
||||
const auto &totalNumberOfWorkgroups = dispatchInfo.getTotalNumberOfWorkgroups();
|
||||
const auto &numberOfWorkgroups = dispatchInfo.getNumberOfWorkgroups();
|
||||
walkerArgs.numberOfWorkgroups = &dispatchInfo.getNumberOfWorkgroups();
|
||||
UNRECOVERABLE_IF(totalNumberOfWorkgroups.x == 0);
|
||||
UNRECOVERABLE_IF(numberOfWorkgroups.x == 0);
|
||||
UNRECOVERABLE_IF(walkerArgs.numberOfWorkgroups->x == 0);
|
||||
|
||||
size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
|
||||
walkerArgs.globalWorkSizes[0] = gws.x;
|
||||
walkerArgs.globalWorkSizes[1] = gws.y;
|
||||
walkerArgs.globalWorkSizes[2] = gws.z;
|
||||
|
||||
// Patch our kernel constants
|
||||
kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
|
||||
kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));
|
||||
|
||||
if (isMainKernel || (!kernel.isLocalWorkSize2Patchable())) {
|
||||
if (walkerArgs.isMainKernel || (!kernel.isLocalWorkSize2Patchable())) {
|
||||
kernel.setLocalWorkSizeValues(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
|
||||
}
|
||||
|
||||
kernel.setLocalWorkSize2Values(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
|
||||
kernel.setEnqueuedLocalWorkSizeValues(static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));
|
||||
|
||||
if (isMainKernel) {
|
||||
if (walkerArgs.isMainKernel) {
|
||||
kernel.setNumWorkGroupsValues(static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
|
||||
}
|
||||
|
||||
kernel.setWorkDim(dim);
|
||||
|
||||
// Send our indirect object data
|
||||
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
|
||||
walkerArgs.localWorkSizes[0] = lws.x;
|
||||
walkerArgs.localWorkSizes[1] = lws.y;
|
||||
walkerArgs.localWorkSizes[2] = lws.z;
|
||||
|
||||
dispatchWorkarounds(&commandStream, commandQueue, kernel, true);
|
||||
|
||||
programWalker(commandStream, kernel, commandQueue, currentTimestampPacketNodes, dsh, ioh, ssh, globalWorkSizes,
|
||||
localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo,
|
||||
offsetInterfaceDescriptorTable, numberOfWorkgroups, startOfWorkgroups);
|
||||
programWalker(commandStream, kernel, commandQueue, dsh, ioh, ssh, dispatchInfo, walkerArgs);
|
||||
|
||||
dispatchWorkarounds(&commandStream, commandQueue, kernel, false);
|
||||
}
|
||||
|
||||
@@ -51,19 +51,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
LinearStream &commandStream,
|
||||
Kernel &kernel,
|
||||
CommandQueue &commandQueue,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
size_t globalWorkSizes[3],
|
||||
size_t localWorkSizes[3],
|
||||
PreemptionMode preemptionMode,
|
||||
size_t currentDispatchIndex,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
const DispatchInfo &dispatchInfo,
|
||||
size_t offsetInterfaceDescriptorTable,
|
||||
const Vec3<size_t> &numberOfWorkgroups,
|
||||
const Vec3<size_t> &startOfWorkgroups) {
|
||||
HardwareInterfaceWalkerArgs &walkerArgs) {
|
||||
|
||||
auto walkerCmdBuf = allocateWalkerSpace(commandStream, kernel);
|
||||
WALKER_TYPE walkerCmd = GfxFamily::cmdInitGpgpuWalker;
|
||||
@@ -71,11 +63,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
|
||||
|
||||
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
|
||||
size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
|
||||
size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};
|
||||
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
|
||||
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
|
||||
|
||||
if (currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
if (walkerArgs.currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNode = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
|
||||
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacketNode, commandQueue.getDevice().getRootDeviceEnvironment());
|
||||
}
|
||||
|
||||
@@ -90,10 +82,10 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
kernel,
|
||||
kernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
simd,
|
||||
localWorkSizes,
|
||||
offsetInterfaceDescriptorTable,
|
||||
interfaceDescriptorIndex,
|
||||
preemptionMode,
|
||||
walkerArgs.localWorkSizes,
|
||||
walkerArgs.offsetInterfaceDescriptorTable,
|
||||
walkerArgs.interfaceDescriptorIndex,
|
||||
walkerArgs.preemptionMode,
|
||||
&walkerCmd,
|
||||
nullptr,
|
||||
true,
|
||||
@@ -101,11 +93,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernel.getKernelInfo().kernelDescriptor,
|
||||
globalOffsets, startWorkGroups,
|
||||
numWorkGroups, localWorkSizes, simd, dim,
|
||||
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
|
||||
false, false, 0u);
|
||||
|
||||
EncodeWalkerArgs walkerArgs{kernel.getExecutionType(), false};
|
||||
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(commandQueue.getDevice().getHardwareInfo(), walkerCmd, walkerArgs);
|
||||
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), false};
|
||||
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(commandQueue.getDevice().getHardwareInfo(), walkerCmd, encodeWalkerArgs);
|
||||
*walkerCmdBuf = walkerCmd;
|
||||
}
|
||||
} // namespace NEO
|
||||
|
||||
@@ -41,19 +41,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
LinearStream &commandStream,
|
||||
Kernel &kernel,
|
||||
CommandQueue &commandQueue,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
size_t globalWorkSizes[3],
|
||||
size_t localWorkSizes[3],
|
||||
PreemptionMode preemptionMode,
|
||||
size_t currentDispatchIndex,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
const DispatchInfo &dispatchInfo,
|
||||
size_t offsetInterfaceDescriptorTable,
|
||||
const Vec3<size_t> &numberOfWorkgroups,
|
||||
const Vec3<size_t> &startOfWorkgroups) {
|
||||
HardwareInterfaceWalkerArgs &walkerArgs) {
|
||||
|
||||
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
|
||||
|
||||
@@ -66,13 +58,13 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||
|
||||
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
|
||||
size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
|
||||
size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};
|
||||
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
|
||||
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
|
||||
uint32_t requiredWalkOrder = 0u;
|
||||
|
||||
bool localIdsGenerationByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
|
||||
numChannels,
|
||||
localWorkSizes,
|
||||
walkerArgs.localWorkSizes,
|
||||
std::array<uint8_t, 3>{{kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
|
||||
@@ -84,8 +76,8 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
auto idd = &walkerCmd.getInterfaceDescriptor();
|
||||
auto &queueCsr = commandQueue.getGpgpuCommandStreamReceiver();
|
||||
|
||||
if (currentTimestampPacketNodes && queueCsr.peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
if (walkerArgs.currentTimestampPacketNodes && queueCsr.peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacket = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
|
||||
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacket, commandQueue.getDevice().getRootDeviceEnvironment());
|
||||
}
|
||||
|
||||
@@ -105,21 +97,21 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
kernel,
|
||||
kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, false),
|
||||
simd,
|
||||
localWorkSizes,
|
||||
offsetInterfaceDescriptorTable,
|
||||
interfaceDescriptorIndex,
|
||||
preemptionMode,
|
||||
walkerArgs.localWorkSizes,
|
||||
walkerArgs.offsetInterfaceDescriptorTable,
|
||||
walkerArgs.interfaceDescriptorIndex,
|
||||
walkerArgs.preemptionMode,
|
||||
&walkerCmd,
|
||||
idd,
|
||||
localIdsGenerationByRuntime,
|
||||
commandQueue.getDevice());
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,
|
||||
numWorkGroups, localWorkSizes, simd, dim,
|
||||
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
|
||||
localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);
|
||||
|
||||
EncodeWalkerArgs walkerArgs{kernel.getExecutionType(), true};
|
||||
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, walkerArgs);
|
||||
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), true};
|
||||
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, encodeWalkerArgs);
|
||||
|
||||
auto devices = queueCsr.getOsContext().getDeviceBitfield();
|
||||
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, !kernel.isSingleSubdevicePreferred());
|
||||
@@ -139,7 +131,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
if (queueCsr.isStaticWorkPartitioningEnabled()) {
|
||||
queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
|
||||
}
|
||||
auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
auto timestampPacket = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
|
||||
timestampPacket->setPacketsUsed(partitionCount);
|
||||
} else {
|
||||
auto computeWalkerOnStream = commandStream.getSpaceForCmd<typename GfxFamily::COMPUTE_WALKER>();
|
||||
|
||||
Reference in New Issue
Block a user