fix(ocl) use correct lws and groupCount to disable eu fusion check

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka 2023-02-14 14:32:36 +00:00 committed by Compute-Runtime-Automation
parent 90cd433766
commit 75a3f99685
2 changed files with 7 additions and 12 deletions

View File

@ -855,9 +855,10 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired;
dispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode = systolicPipelineSelectMode;
uint32_t lws[3] = {static_cast<uint32_t>(multiDispatchInfo.begin()->getLocalWorkgroupSize().x), static_cast<uint32_t>(multiDispatchInfo.begin()->getLocalWorkgroupSize().y), static_cast<uint32_t>(multiDispatchInfo.begin()->getLocalWorkgroupSize().z)};
uint32_t groupCount[3] = {static_cast<uint32_t>(multiDispatchInfo.begin()->getNumberOfWorkgroups().x), static_cast<uint32_t>(multiDispatchInfo.begin()->getNumberOfWorkgroups().y), static_cast<uint32_t>(multiDispatchInfo.begin()->getNumberOfWorkgroups().z)};
dispatchFlags.disableEUFusion = kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion ||
device->getProductHelper().isFusedEuDisabledForDpas(kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode, *kernel->getLocalWorkSizeValues().data(), *kernel->getNumWorkGroupsValues().data());
device->getProductHelper().isFusedEuDisabledForDpas(systolicPipelineSelectMode, lws, groupCount);
const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();

View File

@ -73,18 +73,12 @@ DG2TEST_F(CommandQueueHwTest, GivenKernelWithDpasAndOddWorkGroupWhenenqueueNonBl
EventsRequest eventsRequest(0, nullptr, nullptr);
EventBuilder eventBuilder;
LinearStream commandStream;
const_cast<NEO::KernelDescriptor &>(pKernel->getDescriptor()).payloadMappings.dispatchTraits.localWorkSize[0] = 0;
const_cast<NEO::KernelDescriptor &>(pKernel->getDescriptor()).payloadMappings.dispatchTraits.localWorkSize[1] = 4;
const_cast<NEO::KernelDescriptor &>(pKernel->getDescriptor()).payloadMappings.dispatchTraits.localWorkSize[2] = 8;
const_cast<NEO::KernelDescriptor &>(pKernel->getDescriptor()).payloadMappings.dispatchTraits.numWorkGroups[0] = 12;
const_cast<NEO::KernelDescriptor &>(pKernel->getDescriptor()).payloadMappings.dispatchTraits.numWorkGroups[1] = 16;
const_cast<NEO::KernelDescriptor &>(pKernel->getDescriptor()).payloadMappings.dispatchTraits.numWorkGroups[2] = 20;
pKernel->setLocalWorkSizeValues(3, 7, 1);
pKernel->setNumWorkGroupsValues(5, 1, 1);
DispatchInfo &dispatchInfo = *multiDispatchInfo.begin();
dispatchInfo.setLWS({3, 7, 1});
dispatchInfo.setNumberOfWorkgroups({5, 1, 1});
bool blocking = false;
const_cast<NEO::KernelDescriptor &>(pKernel->getDescriptor()).kernelAttributes.flags.usesSystolicPipelineSelectMode = true;
pKernel->setSystolicPipelineSelectMode(true);
cmdQ.template enqueueNonBlocked<CL_COMMAND_NDRANGE_KERNEL>(nullptr, 0, commandStream, commandStream.getUsed(), blocking, true, multiDispatchInfo, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, nullptr);
EXPECT_TRUE(csr->disableEuFusionPassed);
}