Fix execution of cooperative kernels on multi-tile device

Add flag for forcing execution of kernels on single tile
Force cooperative kernels to use only single tile

Related-to: NEO-6729
Signed-off-by: Naklicki, Mateusz <mateusz.naklicki@intel.com>
This commit is contained in:
Naklicki, Mateusz
2022-11-15 13:48:45 +00:00
committed by Compute-Runtime-Automation
parent 359b9278b8
commit 914939c377
15 changed files with 182 additions and 62 deletions

View File

@ -799,7 +799,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
!eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed
false, // epilogueRequired
false, // usePerDssBackedBuffer
kernel->isSingleSubdevicePreferred(), // useSingleSubdevice
false, // useSingleSubdevice
useGlobalAtomics, // useGlobalAtomics
kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext
kernel->requiresMemoryMigration(), // memoryMigrationRequired

View File

@ -138,8 +138,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilin
HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS() +
EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(pKernel->getKernelInfo().heapInfo.KernelHeapSize, commandQueue.getDevice().getHardwareInfo());
auto devices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getDeviceBitfield();
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices,
!pKernel->isSingleSubdevicePreferred());
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
if (partitionWalker) {
Vec3<size_t> groupStart = dispatchInfo.getStartOfWorkgroups();
Vec3<size_t> groupCount = dispatchInfo.getNumberOfWorkgroups();

View File

@ -122,7 +122,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, encodeWalkerArgs);
auto devices = queueCsr.getOsContext().getDeviceBitfield();
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, !kernel.isSingleSubdevicePreferred());
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
if (partitionWalker) {
const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
@ -135,6 +135,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
false,
kernel.usesImages(),
queueCsr.getDcFlushSupport(),
kernel.isSingleSubdevicePreferred(),
workPartitionAllocationGpuVa,
hwInfo);
if (queueCsr.isStaticWorkPartitioningEnabled()) {

View File

@ -205,7 +205,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed
false, // epilogueRequired
false, // usePerDssBackedBuffer
kernel->isSingleSubdevicePreferred(), // useSingleSubdevice
false, // useSingleSubdevice
kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, // useGlobalAtomics
kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext
kernel->requiresMemoryMigration(), // memoryMigrationRequired

View File

@ -1319,29 +1319,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenProgramWal
EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType());
}
// Enqueues a kernel that reports a single-subdevice preference and checks that
// the parsed COMPUTE_WALKER has its partition type set to DISABLED.
// Skipped when local memory is not enabled in the OS interface.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenKernelThatPrefersSingleSubdeviceWhenProgramWalkerThenPartitioningIsNotUsed) {
    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;

    // Kernel stub that unconditionally reports a preference for a single subdevice.
    struct SingleSubdeviceKernel : public MockKernel {
        using MockKernel::MockKernel;
        bool isSingleSubdevicePreferred() const override {
            return true;
        }
    };

    if (!OSInterface::osEnableLocalMemory) {
        GTEST_SKIP();
    }

    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
    SingleSubdeviceKernel singleTileKernel(kernel->mockProgram, kernel->kernelInfo, *device);

    // One-dimensional dispatch: two work-groups of a single work-item each.
    size_t globalWorkSize[] = {2, 1, 1};
    size_t localWorkSize[] = {1, 1, 1};
    mockCmdQ->enqueueKernel(&singleTileKernel, 1, nullptr, globalWorkSize, localWorkSize, 0, nullptr, nullptr);

    ClHardwareParse parser;
    parser.parseCommands<FamilyType>(*mockCmdQ);

    auto walker = reinterpret_cast<COMPUTE_WALKER *>(parser.cmdWalker);
    ASSERT_NE(nullptr, walker);
    EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, walker->getPartitionType());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenProgramWalkerIsCalledWithPartitionLogicDisabledThenWalkerPartitionLogicIsNotExecuted) {
if (!OSInterface::osEnableLocalMemory) {
GTEST_SKIP();
@ -1914,3 +1891,69 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenSimdSize1TWhenCheckToGener
EXPECT_TRUE(EncodeDispatchKernel<FamilyType>::isRuntimeLocalIdsGenerationRequired(
workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd));
}
struct XeHPAndLaterDispatchWalkerTestMultiTileDevice : public XeHPAndLaterDispatchWalkerBasicTest {
void SetUp() override {
DebugManager.flags.CreateMultipleSubDevices.set(2u);
XeHPAndLaterDispatchWalkerBasicTest::SetUp();
}
void TearDown() override {
XeHPAndLaterDispatchWalkerBasicTest::TearDown();
}
};
// Mock kernel whose answer to isSingleSubdevicePreferred() is controlled by a
// public flag, so a test can toggle the preference per instance.
struct KernelWithSingleSubdevicePreferences : public MockKernel {
    using MockKernel::MockKernel;

    // Value returned by isSingleSubdevicePreferred(); defaults to true.
    bool singleSubdevicePreferred = true;

    bool isSingleSubdevicePreferred() const override {
        return singleSubdevicePreferred;
    }
};
// Enqueues a kernel with the single-subdevice preference switched on and
// checks the parsed COMPUTE_WALKER: partition type X with a partition size of 2
// for a dispatch of two work-groups.
// NOTE(review): a partition size equal to the whole work-group count presumably
// means the work is placed on one tile - confirm against the walker partitioning code.
// Skipped when local memory is not enabled in the OS interface.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerTestMultiTileDevice, givenKernelThatPrefersSingleSubdeviceWhenProgramWalkerThenKernelIsExecutedOnSingleTile) {
    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;

    if (!OSInterface::osEnableLocalMemory) {
        GTEST_SKIP();
    }

    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);

    // One-dimensional dispatch: two work-groups of a single work-item each.
    size_t globalWorkSize[] = {2, 1, 1};
    size_t localWorkSize[] = {1, 1, 1};

    // A preemption allocation is created up front for mid-thread preemption or an active debugger.
    auto &ultCsr = mockCmdQ->getUltCommandStreamReceiver();
    const bool preemptionAllocationNeeded = device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive();
    if (preemptionAllocationNeeded) {
        ultCsr.createPreemptionAllocation();
    }

    KernelWithSingleSubdevicePreferences singleTileKernel(kernel->mockProgram, kernel->kernelInfo, *device);
    singleTileKernel.singleSubdevicePreferred = true;
    mockCmdQ->enqueueKernel(&singleTileKernel, 1, nullptr, globalWorkSize, localWorkSize, 0, nullptr, nullptr);

    ClHardwareParse parser;
    parser.parseCommands<FamilyType>(*mockCmdQ);

    auto walker = reinterpret_cast<COMPUTE_WALKER *>(parser.cmdWalker);
    ASSERT_NE(nullptr, walker);
    EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker->getPartitionType());
    EXPECT_EQ(2u, walker->getPartitionSize());
}
// Enqueues a kernel with the single-subdevice preference switched off and
// checks the parsed COMPUTE_WALKER: partition type X with a partition size of 1
// for a dispatch of two work-groups.
// NOTE(review): a partition size of one work-group presumably means the two
// work-groups are split across both tiles - confirm against the walker partitioning code.
// Skipped when local memory is not enabled in the OS interface.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerTestMultiTileDevice, givenKernelThatDoesntPreferSingleSubdeviceWhenProgramWalkerThenKernelIsExecutedOnAllTiles) {
    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;

    if (!OSInterface::osEnableLocalMemory) {
        GTEST_SKIP();
    }

    auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);

    // One-dimensional dispatch: two work-groups of a single work-item each.
    size_t globalWorkSize[] = {2, 1, 1};
    size_t localWorkSize[] = {1, 1, 1};

    // A preemption allocation is created up front for mid-thread preemption or an active debugger.
    auto &ultCsr = mockCmdQ->getUltCommandStreamReceiver();
    const bool preemptionAllocationNeeded = device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive();
    if (preemptionAllocationNeeded) {
        ultCsr.createPreemptionAllocation();
    }

    KernelWithSingleSubdevicePreferences multiTileKernel(kernel->mockProgram, kernel->kernelInfo, *device);
    multiTileKernel.singleSubdevicePreferred = false;
    mockCmdQ->enqueueKernel(&multiTileKernel, 1, nullptr, globalWorkSize, localWorkSize, 0, nullptr, nullptr);

    ClHardwareParse parser;
    parser.parseCommands<FamilyType>(*mockCmdQ);

    auto walker = reinterpret_cast<COMPUTE_WALKER *>(parser.cmdWalker);
    ASSERT_NE(nullptr, walker);
    EXPECT_EQ(COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, walker->getPartitionType());
    EXPECT_EQ(1u, walker->getPartitionSize());
}