mirror of
https://github.com/intel/compute-runtime.git
synced 2025-09-10 12:53:42 +08:00
Fix execution of cooperative kernels on multi-tile device
Add flag for forcing execution of kernels on a single tile.
Force cooperative kernels to use only a single tile.
Related-to: NEO-6729
Signed-off-by: Naklicki, Mateusz <mateusz.naklicki@intel.com>
This commit is contained in:

committed by
Compute-Runtime-Automation

parent
359b9278b8
commit
914939c377
@ -799,7 +799,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
|
||||
!eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed
|
||||
false, // epilogueRequired
|
||||
false, // usePerDssBackedBuffer
|
||||
kernel->isSingleSubdevicePreferred(), // useSingleSubdevice
|
||||
false, // useSingleSubdevice
|
||||
useGlobalAtomics, // useGlobalAtomics
|
||||
kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext
|
||||
kernel->requiresMemoryMigration(), // memoryMigrationRequired
|
||||
|
@ -138,8 +138,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilin
|
||||
HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS() +
|
||||
EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(pKernel->getKernelInfo().heapInfo.KernelHeapSize, commandQueue.getDevice().getHardwareInfo());
|
||||
auto devices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getDeviceBitfield();
|
||||
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices,
|
||||
!pKernel->isSingleSubdevicePreferred());
|
||||
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
|
||||
if (partitionWalker) {
|
||||
Vec3<size_t> groupStart = dispatchInfo.getStartOfWorkgroups();
|
||||
Vec3<size_t> groupCount = dispatchInfo.getNumberOfWorkgroups();
|
||||
|
@ -122,7 +122,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, encodeWalkerArgs);
|
||||
|
||||
auto devices = queueCsr.getOsContext().getDeviceBitfield();
|
||||
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, !kernel.isSingleSubdevicePreferred());
|
||||
auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);
|
||||
|
||||
if (partitionWalker) {
|
||||
const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
|
||||
@ -135,6 +135,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
false,
|
||||
kernel.usesImages(),
|
||||
queueCsr.getDcFlushSupport(),
|
||||
kernel.isSingleSubdevicePreferred(),
|
||||
workPartitionAllocationGpuVa,
|
||||
hwInfo);
|
||||
if (queueCsr.isStaticWorkPartitioningEnabled()) {
|
||||
|
@ -205,7 +205,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
|
||||
commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), // outOfOrderExecutionAllowed
|
||||
false, // epilogueRequired
|
||||
false, // usePerDssBackedBuffer
|
||||
kernel->isSingleSubdevicePreferred(), // useSingleSubdevice
|
||||
false, // useSingleSubdevice
|
||||
kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, // useGlobalAtomics
|
||||
kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext
|
||||
kernel->requiresMemoryMigration(), // memoryMigrationRequired
|
||||
|
@ -1319,29 +1319,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenProgramWal
|
||||
EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType());
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenKernelThatPrefersSingleSubdeviceWhenProgramWalkerThenPartitioningIsNotUsed) {
|
||||
if (!OSInterface::osEnableLocalMemory) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
|
||||
struct SingleSubdeviceKernel : public MockKernel {
|
||||
using MockKernel::MockKernel;
|
||||
bool isSingleSubdevicePreferred() const override { return true; }
|
||||
};
|
||||
|
||||
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
|
||||
size_t gws[] = {2, 1, 1};
|
||||
size_t lws[] = {1, 1, 1};
|
||||
SingleSubdeviceKernel subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device);
|
||||
cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
|
||||
|
||||
ClHardwareParse hwParser;
|
||||
hwParser.parseCommands<FamilyType>(*cmdQ);
|
||||
auto computeWalker = reinterpret_cast<typename FamilyType::COMPUTE_WALKER *>(hwParser.cmdWalker);
|
||||
ASSERT_NE(nullptr, computeWalker);
|
||||
EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED, computeWalker->getPartitionType());
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenProgramWalkerIsCalledWithPartitionLogicDisabledThenWalkerPartitionLogicIsNotExecuted) {
|
||||
if (!OSInterface::osEnableLocalMemory) {
|
||||
GTEST_SKIP();
|
||||
@ -1914,3 +1891,69 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerDispatchTest, givenSimdSize1TWhenCheckToGener
|
||||
EXPECT_TRUE(EncodeDispatchKernel<FamilyType>::isRuntimeLocalIdsGenerationRequired(
|
||||
workDim, lws.data(), walkOrder, false, requiredWalkOrder, simd));
|
||||
}
|
||||
|
||||
// Test fixture layered on XeHPAndLaterDispatchWalkerBasicTest that sets the
// CreateMultipleSubDevices debug flag to 2 before running the base fixture's
// setup.
// NOTE(review): presumably this makes the base fixture create a device with two
// sub-devices (tiles) — confirm against the base SetUp().
// NOTE(review): the flag is not restored in this struct; verify that the base
// fixture restores debug-manager state so the value does not leak to other tests.
struct XeHPAndLaterDispatchWalkerTestMultiTileDevice : public XeHPAndLaterDispatchWalkerBasicTest {
|
||||
void SetUp() override {
|
||||
// Set the flag first, then delegate; the order is deliberate-looking since the
// base SetUp() runs after the flag is in place.
DebugManager.flags.CreateMultipleSubDevices.set(2u);
|
||||
|
||||
XeHPAndLaterDispatchWalkerBasicTest::SetUp();
|
||||
}
|
||||
// Pure delegation to the base fixture's teardown.
void TearDown() override {
|
||||
XeHPAndLaterDispatchWalkerBasicTest::TearDown();
|
||||
}
|
||||
};
|
||||
|
||||
// MockKernel variant whose answer to isSingleSubdevicePreferred() is controlled
// by a public, test-settable member instead of by kernel properties.
struct KernelWithSingleSubdevicePreferences : public MockKernel {
|
||||
// Reuse MockKernel's constructors unchanged.
using MockKernel::MockKernel;
|
||||
// Returns the current value of singleSubdevicePreferred.
bool isSingleSubdevicePreferred() const override { return singleSubdevicePreferred; }
|
||||
|
||||
// Value reported by isSingleSubdevicePreferred(); defaults to true.
bool singleSubdevicePreferred = true;
|
||||
};
|
||||
|
||||
// Enqueues a kernel that reports isSingleSubdevicePreferred() == true on the
// multi-tile fixture, with a global work size of 2 and a local work size of 1,
// and checks the programmed COMPUTE_WALKER: partition type X with a partition
// size of 2.
// NOTE(review): a partition size equal to the full work-group count presumably
// means all work lands in one partition, i.e. a single tile — confirm against
// the walker-partitioning implementation.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerTestMultiTileDevice, givenKernelThatPrefersSingleSubdeviceWhenProgramWalkerThenKernelIsExecutedOnSingleTile) {
|
||||
// Skipped when OSInterface::osEnableLocalMemory is false.
if (!OSInterface::osEnableLocalMemory) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
|
||||
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
|
||||
size_t gws[] = {2, 1, 1};
|
||||
size_t lws[] = {1, 1, 1};
|
||||
auto &commandStreamReceiver = cmdQ->getUltCommandStreamReceiver();
|
||||
// Create the preemption allocation up front for mid-thread preemption or an
// active debugger.
if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) {
|
||||
commandStreamReceiver.createPreemptionAllocation();
|
||||
}
|
||||
KernelWithSingleSubdevicePreferences subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device);
|
||||
// Explicit even though true is already the member's default.
subdeviceKernel.singleSubdevicePreferred = true;
|
||||
cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
|
||||
|
||||
// Parse the command stream and inspect the walker command that was programmed.
ClHardwareParse hwParser;
|
||||
hwParser.parseCommands<FamilyType>(*cmdQ);
|
||||
auto computeWalker = reinterpret_cast<typename FamilyType::COMPUTE_WALKER *>(hwParser.cmdWalker);
|
||||
ASSERT_NE(nullptr, computeWalker);
|
||||
EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType());
|
||||
EXPECT_EQ(2u, computeWalker->getPartitionSize());
|
||||
}
|
||||
|
||||
// Enqueues a kernel that reports isSingleSubdevicePreferred() == false on the
// multi-tile fixture, with a global work size of 2 and a local work size of 1,
// and checks the programmed COMPUTE_WALKER: partition type X with a partition
// size of 1.
// NOTE(review): a partition size of 1 for 2 work groups presumably means the
// work is split one group per tile across both tiles — confirm against the
// walker-partitioning implementation.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerTestMultiTileDevice, givenKernelThatDoesntPreferSingleSubdeviceWhenProgramWalkerThenKernelIsExecutedOnAllTiles) {
|
||||
// Skipped when OSInterface::osEnableLocalMemory is false.
if (!OSInterface::osEnableLocalMemory) {
|
||||
GTEST_SKIP();
|
||||
}
|
||||
|
||||
auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
|
||||
size_t gws[] = {2, 1, 1};
|
||||
size_t lws[] = {1, 1, 1};
|
||||
auto &commandStreamReceiver = cmdQ->getUltCommandStreamReceiver();
|
||||
// Create the preemption allocation up front for mid-thread preemption or an
// active debugger.
if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) {
|
||||
commandStreamReceiver.createPreemptionAllocation();
|
||||
}
|
||||
KernelWithSingleSubdevicePreferences subdeviceKernel(kernel->mockProgram, kernel->kernelInfo, *device);
|
||||
// Override the mock's default (true) so the kernel does not ask for a single sub-device.
subdeviceKernel.singleSubdevicePreferred = false;
|
||||
cmdQ->enqueueKernel(&subdeviceKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr);
|
||||
|
||||
// Parse the command stream and inspect the walker command that was programmed.
ClHardwareParse hwParser;
|
||||
hwParser.parseCommands<FamilyType>(*cmdQ);
|
||||
auto computeWalker = reinterpret_cast<typename FamilyType::COMPUTE_WALKER *>(hwParser.cmdWalker);
|
||||
ASSERT_NE(nullptr, computeWalker);
|
||||
EXPECT_EQ(FamilyType::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_X, computeWalker->getPartitionType());
|
||||
EXPECT_EQ(1u, computeWalker->getPartitionSize());
|
||||
}
|
Reference in New Issue
Block a user