fix: Few enqueue handler fixes

-do not wait for event TS under cmdQ's lock
-determine latest enqueue operation in correct order
-do not recognize marker as a barrier in some cases
-fix mutex order in enqueue blit

Related-To: HSD-16027856705

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2025-11-24 10:18:22 +00:00
committed by Compute-Runtime-Automation
parent 56b30d1803
commit 9228fa1251
4 changed files with 24 additions and 22 deletions

View File

@@ -189,7 +189,8 @@ bool CommandQueueHw<Family>::waitForTimestamps(std::span<CopyEngineState> copyEn
if (isWaitForTimestampsEnabled()) {
{
TakeOwnershipWrapper<CommandQueue> queueOwnership(*this);
// mainContainer == this->timestampPacketContainer.get() means wait is called from command queue on its TS. Lock is needed, because another enqueue might generate TS and modify container
TakeOwnershipWrapper<CommandQueue> queueOwnership(*this, mainContainer == this->timestampPacketContainer.get());
waited = waitForTimestampsWithinContainer<TSPacketType>(mainContainer, getGpgpuCommandStreamReceiver(), status);
}

View File

@@ -198,7 +198,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
const auto &hwInfo = this->getDevice().getHardwareInfo();
auto &productHelper = getDevice().getProductHelper();
bool canUsePipeControlInsteadOfSemaphoresForOnCsrDependencies = false;
bool isNonStallingIoqBarrier = isFlushForProfilingRequired(commandType) && !isOOQEnabled() && (debugManager.flags.OptimizeIoqBarriersHandling.get() != 0);
bool isNonStallingIoqBarrier = commandType == CL_COMMAND_BARRIER && !isOOQEnabled() && (debugManager.flags.OptimizeIoqBarriersHandling.get() != 0);
const bool isNonStallingIoqBarrierWithDependencies = isNonStallingIoqBarrier && (eventsRequest.numEventsInWaitList > 0);
if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
@@ -1550,17 +1550,17 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
if (deferredMultiRootSyncNodes.get()) {
csrDeps.copyRootDeviceSyncNodesToNewContainer(*deferredMultiRootSyncNodes);
}
if (debugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) {
commandStreamReceiverOwnership.unlock();
}
queueOwnership.unlock();
if (migratedMemory) {
bcsCsr.flushBatchedSubmissions();
bcsCsr.flushTagUpdate();
}
if (debugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) {
commandStreamReceiverOwnership.unlock();
}
bcsCommandStreamReceiverOwnership.unlock();
queueOwnership.unlock();
if (blocking) {
const auto waitStatus = waitForAllEngines(blockQueue, nullptr, false);
if (waitStatus == WaitStatus::gpuHang) {

View File

@@ -41,13 +41,13 @@ struct EnqueueProperties {
return;
}
if (flushDependenciesOnly) {
operation = Operation::dependencyResolveOnGpu;
if (isFlushWithEvent) {
operation = Operation::profilingOnly;
return;
}
if (isFlushWithEvent) {
operation = Operation::profilingOnly;
if (flushDependenciesOnly) {
operation = Operation::dependencyResolveOnGpu;
return;
}

View File

@@ -37,7 +37,7 @@ using MultiIoqCmdQSynchronizationTest = CommandQueueHwBlitTest<false>;
HWTEST_F(MultiIoqCmdQSynchronizationTest, givenTwoIoqCmdQsWhenEnqueuesSynchronizedWithMarkersThenCorrectSynchronizationIsApplied) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
if (pCmdQ->getTimestampPacketContainer() == nullptr) {
GTEST_SKIP();
@@ -73,7 +73,11 @@ HWTEST_F(MultiIoqCmdQSynchronizationTest, givenTwoIoqCmdQsWhenEnqueuesSynchroniz
LinearStream &bcsStream = pCmdQ2->getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS)->getCS(0);
HardwareParse bcsHwParser;
bcsHwParser.parseCommands<FamilyType>(bcsStream, bcsStart);
auto semaphoreCmdBcs = genCmdCast<MI_SEMAPHORE_WAIT *>(*bcsHwParser.cmdList.begin());
auto semaphoreBcsItor = find<MI_SEMAPHORE_WAIT *>(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end());
if (pClDevice->getProductHelper().isDcFlushAllowed()) {
++semaphoreBcsItor;
}
auto semaphoreCmdBcs = genCmdCast<MI_SEMAPHORE_WAIT *>(*semaphoreBcsItor);
EXPECT_NE(nullptr, semaphoreCmdBcs);
EXPECT_EQ(1u, semaphoreCmdBcs->getSemaphoreDataDword());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreCmdBcs->getCompareOperation());
@@ -91,18 +95,15 @@ HWTEST_F(MultiIoqCmdQSynchronizationTest, givenTwoIoqCmdQsWhenEnqueuesSynchroniz
EXPECT_EQ(nodeGpuAddress, semaphoreCmd->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreCmd->getCompareOperation());
bool pipeControlForBcsSemaphoreFound = false;
auto pipeControlsAfterSemaphore = findAll<PIPE_CONTROL *>(semaphoreCcsItor, ccsHwParser.cmdList.end());
for (auto pipeControlIter : pipeControlsAfterSemaphore) {
auto pipeControlCmd = genCmdCast<PIPE_CONTROL *>(*pipeControlIter);
if (0u == pipeControlCmd->getImmediateData() &&
PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA == pipeControlCmd->getPostSyncOperation() &&
NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControlCmd) == bcsSemaphoreAddress) {
pipeControlForBcsSemaphoreFound = true;
break;
bool storeRegmemForBcsSemaphoreFound = false;
auto storeRegMems = findAll<MI_STORE_REGISTER_MEM *>(semaphoreCcsItor, ccsHwParser.cmdList.end());
for (auto storeRegMemIter : storeRegMems) {
auto storeRegMemCmd = genCmdCast<MI_STORE_REGISTER_MEM *>(*storeRegMemIter);
if (bcsSemaphoreAddress == storeRegMemCmd->getMemoryAddress()) {
storeRegmemForBcsSemaphoreFound = true;
}
}
EXPECT_TRUE(pipeControlForBcsSemaphoreFound);
EXPECT_TRUE(storeRegmemForBcsSemaphoreFound);
}
EXPECT_EQ(CL_SUCCESS, pCmdQ->finish(false));