Add new debug key to control dependency resolution.

Currently the only supported scenario is a single in-order queue.
Instead of resolving dependencies via semaphores, resolve them with pipe controls.

Signed-off-by: Michal Mrozek <michal.mrozek@intel.com>
This commit is contained in:
Michal Mrozek
2021-12-09 16:18:18 +00:00
committed by Compute-Runtime-Automation
parent 34d9d9b0d3
commit ebb16c8b74
7 changed files with 145 additions and 3 deletions

View File

@@ -324,6 +324,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
uint32_t peekBcsTaskCount(aub_stream::EngineType bcsEngineType) const;
// Stores newEnqueueType in latestSentEnqueueType, overwriting the previously recorded value.
void updateLatestSentEnqueueType(EnqueueProperties::Operation newEnqueueType) { this->latestSentEnqueueType = newEnqueueType; }
// Returns the value currently held in latestSentEnqueueType without modifying it.
// NOTE(review): this accessor does not mutate the object and could likely be const-qualified - confirm against callers.
EnqueueProperties::Operation peekLatestSentEnqueueOperation() { return this->latestSentEnqueueType; }
void setupBarrierTimestampForBcsEngines(aub_stream::EngineType engineType, TimestampPacketDependencies &timestampPacketDependencies);
void processBarrierTimestampForBcsEngine(aub_stream::EngineType bcsEngineType, TimestampPacketDependencies &blitDependencies);

View File

@@ -106,7 +106,18 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
mainKernel->areMultipleSubDevicesInContext());
}
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDependencies);
bool programDependencies = true;
if (DebugManager.flags.ResolveDependenciesViaPipeControls.get() == 1) {
//only optimize kernel after kernel
if (commandQueue.peekLatestSentEnqueueOperation() == EnqueueProperties::Operation::GpuKernel) {
programDependencies = false;
}
}
if (programDependencies) {
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDependencies);
}
dsh->align(EncodeStates<GfxFamily>::alignInterfaceDescriptorData);

View File

@@ -379,6 +379,28 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterTests, Gi
EXPECT_EQ(sizeUsed, 0u);
}
// Checks that, with ResolveDependenciesViaPipeControls set to 1 and timestamp packet writes
// enabled, flushing a task at a raised task level leaves a PIPE_CONTROL in the CSR command stream.
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterTests, givenHigherTaskLevelWhenFlushTaskCalledThenPipeControlEmittedWhenDebugFlagSet) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.ResolveDependenciesViaPipeControls.set(1);

    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();

    // Start from a non-dirty CSR so no state or commands need to be submitted.
    configureCSRtoNonDirtyState<FamilyType>(true);
    csr.timestampPacketWriteEnabled = true;

    // Raise the task level before flushing.
    this->taskLevel++;
    flushTask(csr);

    EXPECT_EQ(taskLevel, csr.taskLevel);

    parseCommands<FamilyType>(csr.commandStream, 0);

    auto pipeControlIt = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
    EXPECT_NE(cmdList.end(), pipeControlIt);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterTests, givenDeviceWithThreadGroupPreemptionSupportThenDontSendMediaVfeStateIfNotDirty) {
DebugManagerStateRestore dbgRestore;
DebugManager.flags.ForcePreemptionMode.set(static_cast<int32_t>(PreemptionMode::ThreadGroup));

View File

@@ -1318,6 +1318,106 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledOnDifferentCSRsFr
EXPECT_EQ(3u, semaphoresFound); // total number of semaphores found in cmdList
}
// With ResolveDependenciesViaPipeControls set to 1, a walker dispatched on a queue whose latest
// sent enqueue operation is Blit must still get its waitlist dependency programmed:
// exactly one MI_SEMAPHORE_WAIT is expected in the command stream.
HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndDependenciesResolvedViaPipeControlsIfPreviousOperationIsBlitThenStillProgramSemaphores) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.ResolveDependenciesViaPipeControls.set(1);

    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;

    MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector<Kernel *>({kernel->mockKernel}));

    auto &cmdStream = mockCmdQ->getCS(0);

    // The previous operation was not a GPU kernel, so the optimization must not apply.
    mockCmdQ->updateLatestSentEnqueueType(NEO::EnqueueProperties::Operation::Blit);

    // Build a waitlist holding a single event that carries one timestamp packet node.
    const cl_uint eventsOnWaitlist = 1;
    MockTimestampPacketContainer timestamp(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1);

    Event event(mockCmdQ, 0, 0, 0);
    event.addTimestampPacketNodes(timestamp);

    cl_event waitlist[] = {&event};

    EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr);
    CsrDependencies csrDeps;
    eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);

    HardwareInterface<FamilyType>::dispatchWalker(
        *mockCmdQ,
        multiDispatchInfo,
        csrDeps,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        CL_COMMAND_NDRANGE_KERNEL);

    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(cmdStream, 0);

    // Count every MI_SEMAPHORE_WAIT in the parsed command list.
    uint32_t semaphoresFound = 0;
    for (const auto &cmd : hwParser.cmdList) {
        if (genCmdCast<MI_SEMAPHORE_WAIT *>(cmd)) {
            semaphoresFound++;
        }
    }

    EXPECT_EQ(1u, semaphoresFound); // total number of semaphores found in cmdList
}
// With ResolveDependenciesViaPipeControls set to 1, a walker dispatched on a queue whose latest
// sent enqueue operation is GpuKernel must not program semaphores for its waitlist dependency:
// no MI_SEMAPHORE_WAIT is expected in the command stream.
HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndDependenciesResolvedViaPipeControlsIfPreviousOperationIsGPUKernelThenDoNotProgramSemaphores) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.ResolveDependenciesViaPipeControls.set(1);

    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;

    device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;

    MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector<Kernel *>({kernel->mockKernel}));

    auto &cmdStream = mockCmdQ->getCS(0);

    // Kernel-after-kernel is the case in which dependency programming is skipped.
    mockCmdQ->updateLatestSentEnqueueType(NEO::EnqueueProperties::Operation::GpuKernel);

    // Build a waitlist holding a single event that carries one timestamp packet node.
    const cl_uint eventsOnWaitlist = 1;
    MockTimestampPacketContainer timestamp(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1);

    Event event(mockCmdQ, 0, 0, 0);
    event.addTimestampPacketNodes(timestamp);

    cl_event waitlist[] = {&event};

    EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr);
    CsrDependencies csrDeps;
    eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);

    HardwareInterface<FamilyType>::dispatchWalker(
        *mockCmdQ,
        multiDispatchInfo,
        csrDeps,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        CL_COMMAND_NDRANGE_KERNEL);

    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(cmdStream, 0);

    // Count every MI_SEMAPHORE_WAIT in the parsed command list.
    uint32_t semaphoresFound = 0;
    for (const auto &cmd : hwParser.cmdList) {
        if (genCmdCast<MI_SEMAPHORE_WAIT *>(cmd)) {
            semaphoresFound++;
        }
    }

    EXPECT_EQ(0u, semaphoresFound);
}
HWTEST_F(TimestampPacketTests, givenAlreadyAssignedNodeWhenEnqueueingNonBlockedThenMakeItResident) {
auto mockTagAllocator = new MockTagAllocator<>(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1);

View File

@@ -362,4 +362,5 @@ ClosEnabled = -1
EngineUsageHint = -1
AddStatePrefetchCmdToMemoryPrefetchAPI = -1
UpdateCrossThreadDataSize = 0
ForceBcsEngineIndex = -1
ForceBcsEngineIndex = -1
ResolveDependenciesViaPipeControls = -1