Enable task count update from wait

Signed-off-by: Jobczyk, Lukasz <lukasz.jobczyk@intel.com>
This commit is contained in:
Jobczyk, Lukasz 2022-03-25 13:00:53 +00:00 committed by Compute-Runtime-Automation
parent 022eb054e6
commit d77a6cbe4b
14 changed files with 116 additions and 52 deletions

View File

@ -771,7 +771,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
blocking, //blocking
shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC, //dcFlush
multiDispatchInfo.usesSlm(), //useSLM
true, //guardCommandBufferWithPipeControl
!getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl
commandType == CL_COMMAND_NDRANGE_KERNEL, //GSBA32BitRequired
requiresCoherency, //requiresCoherency
(QueuePriority::LOW == priority), //lowPriority
@ -1008,7 +1008,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
blocking, //blocking
false, //dcFlush
false, //useSLM
true, //guardCommandBufferWithPipeControl
!getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl
false, //GSBA32BitRequired
false, //requiresCoherency
false, //lowPriority

View File

@ -70,7 +70,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
true, //blocking
true, //dcFlush
false, //useSLM
true, //guardCommandBufferWithPipeControl
!commandQueue.getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl
false, //GSBA32BitRequired
false, //requiresCoherency
commandQueue.getPriority() == QueuePriority::LOW, //lowPriority
@ -199,7 +199,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
true, //blocking
flushDC, //dcFlush
slmUsed, //useSLM
true, //guardCommandBufferWithPipeControl
!commandQueue.getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl
commandType == CL_COMMAND_NDRANGE_KERNEL, //GSBA32BitRequired
requiresCoherency, //requiresCoherency
commandQueue.getPriority() == QueuePriority::LOW, //lowPriority
@ -357,7 +357,7 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
true, //blocking
false, //dcFlush
false, //useSLM
true, //guardCommandBufferWithPipeControl
!commandStreamReceiver.isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl
false, //GSBA32BitRequired
false, //requiresCoherency
commandQueue.getPriority() == QueuePriority::LOW, //lowPriority

View File

@ -102,24 +102,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenEnableUpdateTaskFromWaitWhenN
// Parse command list
parseCommands<FamilyType>(commandStreamTask, 0);
auto pipeControlExpected = MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, *defaultHwInfo);
auto itorPC = find<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
if (pipeControlExpected) {
EXPECT_NE(cmdList.end(), itorPC);
if (UnitTestHelper<FamilyType>::isPipeControlWArequired(pDevice->getHardwareInfo())) {
itorPC++;
itorPC = find<PIPE_CONTROL *>(itorPC, cmdList.end());
EXPECT_NE(cmdList.end(), itorPC);
}
// Verify that the dcFlushEnabled bit is set in PC
auto pCmdWA = reinterpret_cast<PIPE_CONTROL *>(*itorPC);
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, *defaultHwInfo), pCmdWA->getDcFlushEnable());
} else {
EXPECT_EQ(cmdList.end(), itorPC);
}
EXPECT_EQ(cmdList.end(), itorPC);
buffer->release();
}

View File

@ -1032,7 +1032,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCsrInBatchingModeWithOutOfOrd
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, *defaultHwInfo), pipeControl->getDcFlushEnable());
}
HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhenFlushTaskThenThereIsNoPipeControlForUpdateTaskCount) {
HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetAndGuardCommandBufferWithPipeControlWhenFlushTaskThenThereIsPipeControlForUpdateTaskCount) {
DebugManagerStateRestore restorer;
DebugManager.flags.UpdateTaskCountFromWait.set(3);
@ -1061,6 +1061,37 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhe
parseCommands<FamilyType>(commandStream);
auto itorPipeControl = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
EXPECT_NE(itorPipeControl, cmdList.end());
}
// Checks that, with the UpdateTaskCountFromWait debug flag set to 3 and otherwise default
// dispatch flags, flushTask leaves no PIPE_CONTROL in the command stream passed to it.
HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhenFlushTaskThenThereIsNoPipeControlForUpdateTaskCount) {
// NOTE(review): restorer presumably puts the debug flags back when the test scope ends - confirm.
DebugManagerStateRestore restorer;
DebugManager.flags.UpdateTaskCountFromWait.set(3);
CommandQueueHw<FamilyType> commandQueue(nullptr, pClDevice, 0, false);
auto &commandStream = commandQueue.getCS(4096u);
// Replace the device's CSR with a mock: both implicit-flush switches off, batched dispatch selected.
auto mockCsr = new MockCsrHw2<FamilyType>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
pDevice->resetCommandStreamReceiver(mockCsr);
mockCsr->useNewResourceImplicitFlush = false;
mockCsr->useGpuIdleImplicitFlush = false;
mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
// Only preemptionMode is changed from the helper-provided default dispatch flags.
DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo());
mockCsr->flushTask(commandStream,
0,
dsh,
ioh,
ssh,
taskLevel,
dispatchFlags,
*pDevice);
// The parsed command stream must not contain any PIPE_CONTROL command.
parseCommands<FamilyType>(commandStream);
auto itorPipeControl = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(itorPipeControl, cmdList.end());
}

View File

@ -86,6 +86,12 @@ HWTEST2_F(HwHelperTestPvcAndLater, GivenVariousValuesWhenCallingGetBarriersCount
EXPECT_EQ(32u, hwHelper.getBarriersCountFromHasBarriers(7u));
}
// For platforms matching IsAtLeastXeHpcCore, the HW helper selected by the fixture's render
// core family must report that updating the task count from wait is supported.
HWTEST2_F(HwHelperTestPvcAndLater, givenHwHelperWhenCheckIsUpdateTaskCountFromWaitSupportedThenReturnsTrue, IsAtLeastXeHpcCore) {
    const auto coreFamily = hardwareInfo.platform.eRenderCoreFamily;
    EXPECT_TRUE(HwHelper::get(coreFamily).isUpdateTaskCountFromWaitSupported());
}
HWTEST2_F(HwHelperTestPvcAndLater, givenCooperativeContextSupportedWhenGetEngineInstancesThenReturnCorrectAmountOfCooperativeCcs, IsAtLeastXeHpcCore) {
HardwareInfo hwInfo = *defaultHwInfo;
hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled = 2;

View File

@ -178,6 +178,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
bool checkPlatformSupportsNewResourceImplicitFlush() const;
bool checkPlatformSupportsGpuIdleImplicitFlush() const;
void configurePostSyncWriteOffset();
void unregisterDirectSubmissionFromController();
HeapDirtyState dshState;
HeapDirtyState iohState;

View File

@ -43,10 +43,7 @@ namespace NEO {
// Destructor: detaches this command stream receiver from the direct submission controller.
// The null check and the unregisterDirectSubmission(this) call are performed by
// unregisterDirectSubmissionFromController(), so they are not repeated inline here.
template <typename GfxFamily>
CommandStreamReceiverHw<GfxFamily>::~CommandStreamReceiverHw() {
    this->unregisterDirectSubmissionFromController();
}
template <typename GfxFamily>
@ -197,7 +194,6 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
const auto &hwInfo = peekHwInfo();
auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
bool updateTag = false;
if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl) {
if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
//for ImmediateDispatch we will send this right away, therefore this pipe control will close the level
@ -221,29 +217,20 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
auto address = getTagAllocation()->getGpuAddress();
updateTag = !isUpdateTagFromWaitEnabled();
updateTag |= dispatchFlags.blocking;
updateTag |= dispatchFlags.dcFlush;
PipeControlArgs args;
args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(dispatchFlags.dcFlush, hwInfo);
args.notifyEnable = isUsedNotifyEnableForPostSync();
args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired;
args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush;
args.workloadPartitionOffset = isMultiTileOperationEnabled();
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
commandStreamTask,
PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
address,
taskCount + 1,
hwInfo,
args);
if (updateTag) {
PipeControlArgs args;
args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(dispatchFlags.dcFlush, hwInfo);
args.notifyEnable = isUsedNotifyEnableForPostSync();
args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired;
args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush;
args.workloadPartitionOffset = isMultiTileOperationEnabled();
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
commandStreamTask,
PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
address,
taskCount + 1,
hwInfo,
args);
} else {
currentPipeControlForNooping = nullptr;
}
this->latestSentTaskCount = taskCount + 1;
DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskCount", peekTaskCount());
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u,
@ -258,6 +245,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
PatchInfoAllocationType::Default));
}
}
this->latestSentTaskCount = taskCount + 1;
if (DebugManager.flags.ForceSLML3Config.get()) {
dispatchFlags.useSLM = true;
@ -585,7 +573,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
if (submitCSR | submitTask) {
if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
flushHandler(batchBuffer, this->getResidencyAllocations());
if (updateTag) {
if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl) {
this->latestFlushedTaskCount = this->taskCount + 1;
}
} else {
@ -1004,6 +992,14 @@ bool CommandStreamReceiverHw<GfxFamily>::detectInitProgrammingFlagsRequired(cons
return DebugManager.flags.ForceCsrReprogramming.get();
}
// Unregisters this command stream receiver from the execution environment's direct
// submission controller. Does nothing when the execution environment holds no controller.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::unregisterDirectSubmissionFromController() {
    if (auto controller = executionEnvironment.directSubmissionController.get()) {
        controller->unregisterDirectSubmission(this);
    }
}
template <typename GfxFamily>
uint32_t CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled, Device &device) {
using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
@ -1324,7 +1320,9 @@ inline void CommandStreamReceiverHw<GfxFamily>::flushHandler(BatchBuffer &batchB
template <typename GfxFamily>
inline bool CommandStreamReceiverHw<GfxFamily>::isUpdateTagFromWaitEnabled() {
bool enabled = false;
auto &hwHelper = HwHelper::get(peekHwInfo().platform.eRenderCoreFamily);
auto enabled = hwHelper.isUpdateTaskCountFromWaitSupported();
enabled &= this->isAnyDirectSubmissionEnabled();
switch (DebugManager.flags.UpdateTaskCountFromWait.get()) {
case 0:
@ -1430,6 +1428,9 @@ inline bool CommandStreamReceiverHw<GfxFamily>::initDirectSubmission(Device &dev
if (directSubmissionController) {
directSubmissionController->registerDirectSubmission(this);
}
if (this->isUpdateTagFromWaitEnabled()) {
this->overrideDispatchPolicy(DispatchMode::ImmediateDispatch);
}
}
osContext.setDirectSubmissionActive();
}

View File

@ -64,6 +64,7 @@ class HwHelper {
static bool cacheFlushAfterWalkerSupported(const HardwareInfo &hwInfo);
virtual bool timestampPacketWriteSupported() const = 0;
virtual bool isTimestampWaitSupported() const = 0;
virtual bool isUpdateTaskCountFromWaitSupported() const = 0;
virtual size_t getRenderSurfaceStateSize() const = 0;
virtual void setRenderSurfaceStateForBuffer(const RootDeviceEnvironment &rootDeviceEnvironment,
void *surfaceStateBuffer,
@ -248,6 +249,8 @@ class HwHelperHw : public HwHelper {
bool isTimestampWaitSupported() const override;
bool isUpdateTaskCountFromWaitSupported() const override;
bool is1MbAlignmentSupported(const HardwareInfo &hwInfo, bool isCompressionEnabled) const override;
bool isFenceAllocationRequired(const HardwareInfo &hwInfo) const override;

View File

@ -45,6 +45,11 @@ bool HwHelperHw<GfxFamily>::isTimestampWaitSupported() const {
return false;
}
// Reports whether updating the task count from wait is supported for GfxFamily.
// This generic definition always answers false.
template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isUpdateTaskCountFromWaitSupported() const {
return false;
}
template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isAssignEngineRoundRobinSupported() const {
return false;

View File

@ -41,6 +41,11 @@ bool HwHelperHw<Family>::isTimestampWaitSupported() const {
return true;
}
// Explicit specialization for Family (the alias is defined outside this view):
// reports that updating the task count from wait is supported.
template <>
bool HwHelperHw<Family>::isUpdateTaskCountFromWaitSupported() const {
return true;
}
template <>
uint32_t HwHelperHw<Family>::adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
const HardwareInfo &hwInfo, bool isEngineInstanced) const {

View File

@ -60,6 +60,11 @@ bool HwHelperHw<GfxFamily>::isTimestampWaitSupported() const {
return false;
}
// Reports whether updating the task count from wait is supported for GfxFamily.
// This generic definition always answers false.
template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isUpdateTaskCountFromWaitSupported() const {
return false;
}
template <typename GfxFamily>
const EngineInstancesContainer HwHelperHw<GfxFamily>::getGpgpuEngineInstances(const HardwareInfo &hwInfo) const {
auto defaultEngine = getChosenEngineType(hwInfo);

View File

@ -43,6 +43,7 @@ class DrmCommandStreamReceiver : public DeviceCommandStreamReceiver<GfxFamily> {
uint32_t rootDeviceIndex,
const DeviceBitfield deviceBitfield,
gemCloseWorkerMode mode = gemCloseWorkerMode::gemCloseWorkerActive);
~DrmCommandStreamReceiver();
SubmissionStatus flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override;
MOCKABLE_VIRTUAL void processResidency(const ResidencyContainer &allocationsForResidency, uint32_t handleId) override;

View File

@ -80,6 +80,13 @@ DrmCommandStreamReceiver<GfxFamily>::DrmCommandStreamReceiver(ExecutionEnvironme
kmdWaitTimeout = DebugManager.flags.SetKmdWaitTimeout.get();
}
// Destructor: when update-tag-from-wait is enabled for this receiver, calls
// waitForCompletionWithTimeout for the current task count before destruction proceeds.
// NOTE(review): the meaning of the WaitParams{false, false, 0} fields is defined elsewhere - confirm.
template <typename GfxFamily>
inline DrmCommandStreamReceiver<GfxFamily>::~DrmCommandStreamReceiver() {
    if (!this->isUpdateTagFromWaitEnabled()) {
        return;
    }
    this->waitForCompletionWithTimeout(WaitParams{false, false, 0}, this->peekTaskCount());
}
template <typename GfxFamily>
SubmissionStatus DrmCommandStreamReceiver<GfxFamily>::flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) {
this->printDeviceIndex();

View File

@ -561,6 +561,21 @@ HWTEST_F(CommandStreamReceiverTest, givenUpdateTaskCountFromWaitWhenCheckTaskCou
}
}
// Verifies that update-tag-from-wait can only be enabled while direct submission is available.
HWTEST_F(CommandStreamReceiverTest, givenUpdateTaskCountFromWaitWhenCheckIfEnabledThenCanBeEnabledOnlyWithDirectSubmission) {
    auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    auto &helper = HwHelper::get(ultCsr.peekHwInfo().platform.eRenderCoreFamily);

    // Direct submission available: enablement must match what the HW helper reports as supported.
    ultCsr.directSubmissionAvailable = true;
    EXPECT_EQ(ultCsr.isUpdateTagFromWaitEnabled(), helper.isUpdateTaskCountFromWaitSupported());

    // Direct submission unavailable: the feature must be reported as disabled.
    ultCsr.directSubmissionAvailable = false;
    EXPECT_FALSE(ultCsr.isUpdateTagFromWaitEnabled());
}
struct InitDirectSubmissionFixture {
void SetUp() {
DebugManager.flags.EnableDirectSubmission.set(1);