OCL: Allow for RelaxedOrdering barriers in IOQ mode

Related-To: NEO-7458

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-03-24 13:21:15 +00:00
committed by Compute-Runtime-Automation
parent e49e245bec
commit f522744886
4 changed files with 79 additions and 19 deletions

View File

@@ -407,7 +407,8 @@ class CommandQueueHw : public CommandQueue {
EventBuilder &eventBuilder,
TaskCountType taskLevel,
CsrDependencies &csrDeps,
CommandStreamReceiver *bcsCsr);
CommandStreamReceiver *bcsCsr,
bool hasRelaxedOrderingDependencies);
void processDispatchForCacheFlush(Surface **surfaces,
size_t numSurfaces,
LinearStream *commandStream,

View File

@@ -279,7 +279,15 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
}
if (flushDependenciesForNonKernelCommand) {
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, csrDeps, false);
if (isNonStallingIoqBarrierWithDependencies) {
relaxedOrderingEnabled = relaxedOrderingForGpgpuAllowed(static_cast<uint32_t>(csrDeps.timestampPacketContainer.size()));
}
if (relaxedOrderingEnabled) {
RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers<GfxFamily>(commandStream);
}
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, csrDeps, relaxedOrderingEnabled);
}
if (isNonStallingIoqBarrierWithDependencies) {
@@ -357,7 +365,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
eventBuilder,
taskLevel,
csrDeps,
nullptr);
nullptr,
relaxedOrderingEnabled);
} else {
UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::EnqueueWithoutSubmission);
@@ -1061,7 +1070,8 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
EventBuilder &eventBuilder,
TaskCountType taskLevel,
CsrDependencies &csrDeps,
CommandStreamReceiver *bcsCsr) {
CommandStreamReceiver *bcsCsr,
bool hasRelaxedOrderingDependencies) {
CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
bool flushGpgpuCsr = true;
@@ -1122,8 +1132,8 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
context->containsMultipleSubDevices(rootDeviceIndex), // areMultipleSubDevicesInContext
false, // memoryMigrationRequired
false, // textureCacheFlush
true, // hasStallingCmds
false, // hasRelaxedOrderingDependencies
!hasRelaxedOrderingDependencies, // hasStallingCmds
hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies
stateCacheInvalidationNeeded, // stateCacheInvalidation
isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired
@@ -1401,7 +1411,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
if (!blockQueue) {
completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
enqueueProperties, timestampPacketDependencies, eventsRequest,
eventBuilder, taskLevel, csrDeps, &bcsCsr);
eventBuilder, taskLevel, csrDeps, &bcsCsr, false);
if (completionStamp.taskCount > CompletionStamp::notReady) {
return CommandQueue::getErrorCodeFromTaskCount(completionStamp.taskCount);
}

View File

@@ -60,7 +60,7 @@ HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelWhenCommandEnqueuedT
EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr);
mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
EXPECT_EQ(allocation->getTaskCount(mockCmdQ->getGpgpuCommandStreamReceiver().getOsContext().getContextId()), 1u);
}
@@ -88,7 +88,7 @@ HWTEST_F(EnqueueHandlerTest, givenLogicalStateHelperWhenDispatchingCommandsThenA
EXPECT_EQ(0u, logicalStateHelper->writeStreamInlineCalledCounter);
mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
EXPECT_EQ(1u, logicalStateHelper->writeStreamInlineCalledCounter);
@@ -164,7 +164,7 @@ HWTEST_F(EnqueueHandlerTimestampEnabledTest, givenProflingAndTimeStampPacketsEna
EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u);
mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
EXPECT_NE(ev->submitTimeStamp.CPUTimeinNS, 0u);
EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u);
@@ -202,7 +202,7 @@ HWTEST_F(EnqueueHandlerTimestampDisabledTest, givenProflingEnabledTimeStampPacke
EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u);
mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
EXPECT_NE(ev->submitTimeStamp.CPUTimeinNS, 0u);
EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u);
@@ -289,7 +289,7 @@ HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectDispa
EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr);
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
EXPECT_EQ(blocking, mockCsr->passedDispatchFlags.blocking);
EXPECT_FALSE(mockCsr->passedDispatchFlags.implicitFlush);
@@ -317,7 +317,7 @@ HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectThrot
bool blocking = true;
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
EXPECT_EQ(mockCmdQ->throttle, mockCsr->passedDispatchFlags.throttle);
}
@@ -359,7 +359,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitEnqueueWhenDispatchingCommandsWithoutK
EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
EXPECT_TRUE(mockCsr->passedDispatchFlags.implicitFlush);
EXPECT_TRUE(mockCsr->passedDispatchFlags.guardCommandBufferWithPipeControl);
@@ -398,7 +398,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitOperationWhenEnqueueCommandWithoutKern
EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
auto expectedValue = mockCmdQ->getGpgpuCommandStreamReceiver().getDcFlushSupport();
EXPECT_EQ(expectedValue, mockCsr->passedDispatchFlags.stateCacheInvalidation);
@@ -423,7 +423,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitOperationWhenEnqueueCommandWithoutKern
EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
EXPECT_TRUE(mockCsr->passedDispatchFlags.isStallingCommandsOnNextFlushRequired);
}
@@ -465,12 +465,12 @@ HWTEST_F(DispatchFlagsBlitTests, givenN1EnabledWhenDispatchingWithoutKernelThenA
mockCsr->nTo1SubmissionModelEnabled = false;
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocked, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
EXPECT_FALSE(mockCsr->passedDispatchFlags.outOfOrderExecutionAllowed);
mockCsr->nTo1SubmissionModelEnabled = true;
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocked, enqueueProperties, timestampPacketDependencies,
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
EXPECT_TRUE(mockCsr->passedDispatchFlags.outOfOrderExecutionAllowed);
}

View File

@@ -1083,9 +1083,9 @@ struct RelaxedOrderingEnqueueKernelTests : public EnqueueKernelTest {
};
HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenEnqueueKernelWhenProgrammingDependenciesThenUseConditionalBbStarts, IsAtLeastXeHpcCore) {
DebugManager.flags.OptimizeIoqBarriersHandling.set(0);
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
auto directSubmission = new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(ultCsr);
@@ -1126,12 +1126,61 @@ HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenEnqueueKernelWhenProgrammingDe
mockCmdQueueHw.enqueueBarrierWithWaitList(1, &outEvent, nullptr);
// OptimizeIoqBarriersHandling disabled by debug flag
EXPECT_TRUE(ultCsr.recordedDispatchFlags.hasStallingCmds);
EXPECT_FALSE(ultCsr.recordedDispatchFlags.hasRelaxedOrderingDependencies);
clReleaseEvent(outEvent);
}
// Verifies that an in-order-queue barrier with an event dependency is flushed
// with relaxed-ordering dispatch flags (no stalling commands) and that the
// barrier's command stream starts with the relaxed-ordering dependency
// sequence: two register copies, two conditional batch-buffer starts on the
// dependency's timestamp node, then an MI_STORE_DATA_IMM.
HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenBarrierWithDependenciesWhenFlushingThenAllowForRelaxedOrdering, IsAtLeastXeHpcCore) {
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
// Install a mock direct submission on the CSR; the pointer is handed over via reset().
auto directSubmission = new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(ultCsr);
ultCsr.directSubmission.reset(directSubmission);
// NOTE(review): two clients are registered, presumably because relaxed ordering
// is only allowed with more than one CSR client - confirm against relaxedOrderingForGpgpuAllowed().
ultCsr.registerClient();
ultCsr.registerClient();
MockCommandQueueHw<FamilyType> mockCmdQueueHw{context, pClDevice, nullptr};
cl_event outEvent;
MockKernelWithInternals mockKernel(*pClDevice);
// Enqueue a kernel whose output event becomes the barrier's dependency.
mockCmdQueueHw.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &outEvent);
// Remember where the barrier's commands will begin in the queue's command stream.
auto &cmdStream = mockCmdQueueHw.getCS(0);
auto cmdsOffset = cmdStream.getUsed();
mockCmdQueueHw.enqueueBarrierWithWaitList(1, &outEvent, nullptr);
// The barrier must be flushed as a relaxed-ordering, non-stalling submission.
EXPECT_FALSE(ultCsr.recordedDispatchFlags.hasStallingCmds);
EXPECT_TRUE(ultCsr.recordedDispatchFlags.hasRelaxedOrderingDependencies);
// First two commands: MI_LOAD_REGISTER_REG copying R4 -> R0 and R4+4 -> R0+4.
auto lrrCmd = reinterpret_cast<MI_LOAD_REGISTER_REG *>(ptrOffset(cmdStream.getCpuBase(), cmdsOffset));
EXPECT_EQ(lrrCmd->getSourceRegisterAddress(), CS_GPR_R4);
EXPECT_EQ(lrrCmd->getDestinationRegisterAddress(), CS_GPR_R0);
lrrCmd++;
EXPECT_EQ(lrrCmd->getSourceRegisterAddress(), CS_GPR_R4 + 4);
EXPECT_EQ(lrrCmd->getDestinationRegisterAddress(), CS_GPR_R0 + 4);
// Address the dependency checkers compare against: the context-end field of
// the kernel event's first timestamp packet node.
auto eventNode = castToObject<Event>(outEvent)->getTimestampPacketNodes()->peekNodes()[0];
auto compareAddress = eventNode->getGpuAddress() + eventNode->getContextEndOffset();
// ++lrrCmd steps past the second register copy, so lrrCmd now points at the
// first conditional batch-buffer start.
EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart<FamilyType>(++lrrCmd, 0, compareAddress, 1, CompareOperation::Equal, true));
// A second conditional batch-buffer start with the same parameters follows immediately.
auto conditionalBbStart2 = reinterpret_cast<void *>(ptrOffset(lrrCmd, EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataMemBatchBufferStart()));
EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart<FamilyType>(conditionalBbStart2, 0, compareAddress, 1, CompareOperation::Equal, true));
// The command right after the second conditional start must be an MI_STORE_DATA_IMM.
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(ptrOffset(conditionalBbStart2, EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataMemBatchBufferStart()));
EXPECT_NE(nullptr, sdiCmd);
clReleaseEvent(outEvent);
}
HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenPipeControlForIoqDependencyResolvingEnabledWhenDispatchingRelaxedOrderingThenThrow, IsAtLeastXeHpcCore) {
DebugManager.flags.ResolveDependenciesViaPipeControls.set(1);