mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-19 06:24:51 +08:00
OCL: Allow for RelaxedOrdering barriers in IOQ mode
Related-To: NEO-7458
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
e49e245bec
commit
f522744886
@@ -407,7 +407,8 @@ class CommandQueueHw : public CommandQueue {
|
||||
EventBuilder &eventBuilder,
|
||||
TaskCountType taskLevel,
|
||||
CsrDependencies &csrDeps,
|
||||
CommandStreamReceiver *bcsCsr);
|
||||
CommandStreamReceiver *bcsCsr,
|
||||
bool hasRelaxedOrderingDependencies);
|
||||
void processDispatchForCacheFlush(Surface **surfaces,
|
||||
size_t numSurfaces,
|
||||
LinearStream *commandStream,
|
||||
|
||||
@@ -279,7 +279,15 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
}
|
||||
|
||||
if (flushDependenciesForNonKernelCommand) {
|
||||
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, csrDeps, false);
|
||||
if (isNonStallingIoqBarrierWithDependencies) {
|
||||
relaxedOrderingEnabled = relaxedOrderingForGpgpuAllowed(static_cast<uint32_t>(csrDeps.timestampPacketContainer.size()));
|
||||
}
|
||||
|
||||
if (relaxedOrderingEnabled) {
|
||||
RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers<GfxFamily>(commandStream);
|
||||
}
|
||||
|
||||
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, csrDeps, relaxedOrderingEnabled);
|
||||
}
|
||||
|
||||
if (isNonStallingIoqBarrierWithDependencies) {
|
||||
@@ -357,7 +365,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
eventBuilder,
|
||||
taskLevel,
|
||||
csrDeps,
|
||||
nullptr);
|
||||
nullptr,
|
||||
relaxedOrderingEnabled);
|
||||
} else {
|
||||
UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::EnqueueWithoutSubmission);
|
||||
|
||||
@@ -1061,7 +1070,8 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
|
||||
EventBuilder &eventBuilder,
|
||||
TaskCountType taskLevel,
|
||||
CsrDependencies &csrDeps,
|
||||
CommandStreamReceiver *bcsCsr) {
|
||||
CommandStreamReceiver *bcsCsr,
|
||||
bool hasRelaxedOrderingDependencies) {
|
||||
|
||||
CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
|
||||
bool flushGpgpuCsr = true;
|
||||
@@ -1122,8 +1132,8 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
|
||||
context->containsMultipleSubDevices(rootDeviceIndex), // areMultipleSubDevicesInContext
|
||||
false, // memoryMigrationRequired
|
||||
false, // textureCacheFlush
|
||||
true, // hasStallingCmds
|
||||
false, // hasRelaxedOrderingDependencies
|
||||
!hasRelaxedOrderingDependencies, // hasStallingCmds
|
||||
hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies
|
||||
stateCacheInvalidationNeeded, // stateCacheInvalidation
|
||||
isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired
|
||||
|
||||
@@ -1401,7 +1411,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
|
||||
if (!blockQueue) {
|
||||
completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
|
||||
enqueueProperties, timestampPacketDependencies, eventsRequest,
|
||||
eventBuilder, taskLevel, csrDeps, &bcsCsr);
|
||||
eventBuilder, taskLevel, csrDeps, &bcsCsr, false);
|
||||
if (completionStamp.taskCount > CompletionStamp::notReady) {
|
||||
return CommandQueue::getErrorCodeFromTaskCount(completionStamp.taskCount);
|
||||
}
|
||||
|
||||
@@ -60,7 +60,7 @@ HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelWhenCommandEnqueuedT
|
||||
EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr);
|
||||
|
||||
mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
|
||||
EXPECT_EQ(allocation->getTaskCount(mockCmdQ->getGpgpuCommandStreamReceiver().getOsContext().getContextId()), 1u);
|
||||
}
|
||||
|
||||
@@ -88,7 +88,7 @@ HWTEST_F(EnqueueHandlerTest, givenLogicalStateHelperWhenDispatchingCommandsThenA
|
||||
EXPECT_EQ(0u, logicalStateHelper->writeStreamInlineCalledCounter);
|
||||
|
||||
mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
|
||||
|
||||
EXPECT_EQ(1u, logicalStateHelper->writeStreamInlineCalledCounter);
|
||||
|
||||
@@ -164,7 +164,7 @@ HWTEST_F(EnqueueHandlerTimestampEnabledTest, givenProflingAndTimeStampPacketsEna
|
||||
EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u);
|
||||
|
||||
mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
|
||||
|
||||
EXPECT_NE(ev->submitTimeStamp.CPUTimeinNS, 0u);
|
||||
EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u);
|
||||
@@ -202,7 +202,7 @@ HWTEST_F(EnqueueHandlerTimestampDisabledTest, givenProflingEnabledTimeStampPacke
|
||||
EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u);
|
||||
|
||||
mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
|
||||
|
||||
EXPECT_NE(ev->submitTimeStamp.CPUTimeinNS, 0u);
|
||||
EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u);
|
||||
@@ -289,7 +289,7 @@ HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectDispa
|
||||
|
||||
EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr);
|
||||
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
|
||||
|
||||
EXPECT_EQ(blocking, mockCsr->passedDispatchFlags.blocking);
|
||||
EXPECT_FALSE(mockCsr->passedDispatchFlags.implicitFlush);
|
||||
@@ -317,7 +317,7 @@ HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectThrot
|
||||
bool blocking = true;
|
||||
|
||||
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, nullptr, false);
|
||||
|
||||
EXPECT_EQ(mockCmdQ->throttle, mockCsr->passedDispatchFlags.throttle);
|
||||
}
|
||||
@@ -359,7 +359,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitEnqueueWhenDispatchingCommandsWithoutK
|
||||
|
||||
EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);
|
||||
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
|
||||
|
||||
EXPECT_TRUE(mockCsr->passedDispatchFlags.implicitFlush);
|
||||
EXPECT_TRUE(mockCsr->passedDispatchFlags.guardCommandBufferWithPipeControl);
|
||||
@@ -398,7 +398,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitOperationWhenEnqueueCommandWithoutKern
|
||||
|
||||
EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);
|
||||
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
|
||||
|
||||
auto expectedValue = mockCmdQ->getGpgpuCommandStreamReceiver().getDcFlushSupport();
|
||||
EXPECT_EQ(expectedValue, mockCsr->passedDispatchFlags.stateCacheInvalidation);
|
||||
@@ -423,7 +423,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitOperationWhenEnqueueCommandWithoutKern
|
||||
|
||||
EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);
|
||||
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
|
||||
EXPECT_TRUE(mockCsr->passedDispatchFlags.isStallingCommandsOnNextFlushRequired);
|
||||
}
|
||||
|
||||
@@ -465,12 +465,12 @@ HWTEST_F(DispatchFlagsBlitTests, givenN1EnabledWhenDispatchingWithoutKernelThenA
|
||||
|
||||
mockCsr->nTo1SubmissionModelEnabled = false;
|
||||
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocked, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
|
||||
EXPECT_FALSE(mockCsr->passedDispatchFlags.outOfOrderExecutionAllowed);
|
||||
|
||||
mockCsr->nTo1SubmissionModelEnabled = true;
|
||||
mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocked, enqueueProperties, timestampPacketDependencies,
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr);
|
||||
eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false);
|
||||
EXPECT_TRUE(mockCsr->passedDispatchFlags.outOfOrderExecutionAllowed);
|
||||
}
|
||||
|
||||
|
||||
@@ -1083,9 +1083,9 @@ struct RelaxedOrderingEnqueueKernelTests : public EnqueueKernelTest {
|
||||
};
|
||||
|
||||
HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenEnqueueKernelWhenProgrammingDependenciesThenUseConditionalBbStarts, IsAtLeastXeHpcCore) {
|
||||
DebugManager.flags.OptimizeIoqBarriersHandling.set(0);
|
||||
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
|
||||
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
|
||||
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
|
||||
|
||||
auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
|
||||
auto directSubmission = new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(ultCsr);
|
||||
@@ -1126,12 +1126,61 @@ HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenEnqueueKernelWhenProgrammingDe
|
||||
|
||||
mockCmdQueueHw.enqueueBarrierWithWaitList(1, &outEvent, nullptr);
|
||||
|
||||
// OptimizeIoqBarriersHandling disabled by debug flag
|
||||
EXPECT_TRUE(ultCsr.recordedDispatchFlags.hasStallingCmds);
|
||||
EXPECT_FALSE(ultCsr.recordedDispatchFlags.hasRelaxedOrderingDependencies);
|
||||
|
||||
clReleaseEvent(outEvent);
|
||||
}
|
||||
|
||||
// Verifies that a barrier enqueued with an event dependency is flushed with
// hasStallingCmds == false and hasRelaxedOrderingDependencies == true, and that
// the commands written to the queue's stream from the recorded offset are, in
// order: two MI_LOAD_REGISTER_REG (CS_GPR_R4 -> CS_GPR_R0), two conditional
// data-mem batch-buffer starts comparing against the event's timestamp node,
// and an MI_STORE_DATA_IMM.
HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenBarrierWithDependenciesWhenFlushingThenAllowForRelaxedOrdering, IsAtLeastXeHpcCore) {
|
||||
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
|
||||
auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
|
||||
// Mock direct submission is handed to the CSR below via directSubmission.reset().
auto directSubmission = new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(ultCsr);
|
||||
ultCsr.directSubmission.reset(directSubmission);
|
||||
// NOTE(review): registerClient() is deliberately called twice - presumably relaxed
// ordering is only allowed with more than one registered client; confirm against
// relaxedOrderingForGpgpuAllowed().
ultCsr.registerClient();
|
||||
ultCsr.registerClient();
|
||||
|
||||
MockCommandQueueHw<FamilyType> mockCmdQueueHw{context, pClDevice, nullptr};
|
||||
|
||||
cl_event outEvent;
|
||||
|
||||
MockKernelWithInternals mockKernel(*pClDevice);
|
||||
|
||||
// Enqueue a kernel to obtain outEvent, which the barrier below waits on.
mockCmdQueueHw.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &outEvent);
|
||||
|
||||
auto &cmdStream = mockCmdQueueHw.getCS(0);
|
||||
// Remember how much of the stream is already used so the barrier's commands can be located.
auto cmdsOffset = cmdStream.getUsed();
|
||||
|
||||
mockCmdQueueHw.enqueueBarrierWithWaitList(1, &outEvent, nullptr);
|
||||
|
||||
// The barrier must be dispatched as non-stalling with relaxed ordering dependencies.
EXPECT_FALSE(ultCsr.recordedDispatchFlags.hasStallingCmds);
|
||||
EXPECT_TRUE(ultCsr.recordedDispatchFlags.hasRelaxedOrderingDependencies);
|
||||
|
||||
// First command at the recorded offset: register copy CS_GPR_R4 -> CS_GPR_R0.
auto lrrCmd = reinterpret_cast<MI_LOAD_REGISTER_REG *>(ptrOffset(cmdStream.getCpuBase(), cmdsOffset));
|
||||
EXPECT_EQ(lrrCmd->getSourceRegisterAddress(), CS_GPR_R4);
|
||||
EXPECT_EQ(lrrCmd->getDestinationRegisterAddress(), CS_GPR_R0);
|
||||
|
||||
// Second command: same copy at register offset +4
// (presumably the upper dword of a 64-bit GPR - TODO confirm).
lrrCmd++;
|
||||
EXPECT_EQ(lrrCmd->getSourceRegisterAddress(), CS_GPR_R4 + 4);
|
||||
EXPECT_EQ(lrrCmd->getDestinationRegisterAddress(), CS_GPR_R0 + 4);
|
||||
|
||||
// Expected compare address: GPU address of the event's first timestamp node plus its context-end offset.
auto eventNode = castToObject<Event>(outEvent)->getTimestampPacketNodes()->peekNodes()[0];
|
||||
auto compareAddress = eventNode->getGpuAddress() + eventNode->getContextEndOffset();
|
||||
|
||||
// ++lrrCmd steps past the second register copy to the first conditional BB start.
EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart<FamilyType>(++lrrCmd, 0, compareAddress, 1, CompareOperation::Equal, true));
|
||||
|
||||
// The second conditional BB start is expected directly after the first one.
auto conditionalBbStart2 = reinterpret_cast<void *>(ptrOffset(lrrCmd, EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataMemBatchBufferStart()));
|
||||
EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart<FamilyType>(conditionalBbStart2, 0, compareAddress, 1, CompareOperation::Equal, true));
|
||||
|
||||
// An MI_STORE_DATA_IMM must follow the second conditional BB start.
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(ptrOffset(conditionalBbStart2, EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataMemBatchBufferStart()));
|
||||
EXPECT_NE(nullptr, sdiCmd);
|
||||
|
||||
clReleaseEvent(outEvent);
|
||||
}
|
||||
|
||||
HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenPipeControlForIoqDependencyResolvingEnabledWhenDispatchingRelaxedOrderingThenThrow, IsAtLeastXeHpcCore) {
|
||||
DebugManager.flags.ResolveDependenciesViaPipeControls.set(1);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user