Enable flushTask path for BCS

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2022-11-30 14:57:18 +00:00
committed by Compute-Runtime-Automation
parent 99655d34f9
commit 85da0ee184
13 changed files with 370 additions and 43 deletions

View File

@@ -11,7 +11,9 @@
namespace NEO {
struct SvmAllocationData;
}
struct CompletionStamp;
class LinearStream;
} // namespace NEO
namespace L0 {
@@ -25,6 +27,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
using BaseClass = CommandListCoreFamily<gfxCoreFamily>;
using BaseClass::BaseClass;
using BaseClass::executeCommandListImmediate;
using BaseClass::isCopyOnly;
ze_result_t appendLaunchKernel(ze_kernel_handle_t kernelHandle,
const ze_group_count_t *threadGroupDimensions,
@@ -126,6 +129,9 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
// Submits the commands recorded since the last submission through the CSR flush-task path.
// Handles residency/migration first, then calls flushBcsTask() for copy-only lists and
// flushRegularTask() otherwise.
MOCKABLE_VIRTUAL ze_result_t executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies);
// Builds NEO::DispatchFlags for this command list and submits cmdStreamTask via csr->flushTask().
NEO::CompletionStamp flushRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies);
// Builds NEO::DispatchBcsFlags (task-count flush requested only for synchronous-mode queues)
// and submits cmdStreamTask via csr->flushBcsTask().
NEO::CompletionStamp flushBcsTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies);
void checkAvailableSpace();
void updateDispatchFlagsWithRequiredStreamState(NEO::DispatchFlags &dispatchFlags);

View File

@@ -69,7 +69,18 @@ void CommandListCoreFamilyImmediate<gfxCoreFamily>::updateDispatchFlagsWithRequi
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies) {
NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushBcsTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies) {
    // Copy-engine submission: a task-count flush is requested only when the
    // immediate command list operates in synchronous mode.
    const bool flushTaskCount = this->isSyncModeQueue;
    NEO::DispatchBcsFlags bcsFlags(flushTaskCount, hasStallingCmds, hasRelaxedOrderingDependencies);

    // Hand the recorded range [taskStartOffset, used) of the task stream to the CSR.
    const auto &hwInfo = this->device->getHwInfo();
    return this->csr->flushBcsTask(cmdStreamTask, taskStartOffset, bcsFlags, hwInfo);
}
template <GFXCORE_FAMILY gfxCoreFamily>
NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies) {
NEO::DispatchFlags dispatchFlags(
{}, // csrDependencies
nullptr, // barrierTimestampPacketNodes
@@ -103,41 +114,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImm
hasStallingCmds, // hasStallingCmds
hasRelaxedOrderingDependencies // hasRelaxedOrderingDependencies
);
this->updateDispatchFlagsWithRequiredStreamState(dispatchFlags);
this->commandContainer.removeDuplicatesFromResidencyContainer();
auto commandStream = this->commandContainer.getCommandStream();
size_t commandStreamStart = this->cmdListCurrentStartOffset;
auto lockCSR = this->csr->obtainUniqueOwnership();
std::unique_lock<std::mutex> lockForIndirect;
if (this->hasIndirectAllocationsAllowed()) {
this->cmdQImmediate->handleIndirectAllocationResidency(this->getUnifiedMemoryControls(), lockForIndirect, performMigration);
}
this->csr->setRequiredScratchSizes(this->getCommandListPerThreadScratchSize(), this->getCommandListPerThreadPrivateScratchSize());
if (performMigration) {
auto deviceImp = static_cast<DeviceImp *>(this->device);
auto pageFaultManager = deviceImp->getDriverHandle()->getMemoryManager()->getPageFaultManager();
if (pageFaultManager == nullptr) {
performMigration = false;
}
}
this->cmdQImmediate->makeResidentAndMigrate(performMigration, this->commandContainer.getResidencyContainer());
if (performMigration) {
this->migrateSharedAllocations();
}
if (this->performMemoryPrefetch) {
auto prefetchManager = this->device->getDriverHandle()->getMemoryManager()->getPrefetchManager();
prefetchManager->migrateAllocationsToGpu(this->getPrefetchContext(), *this->device->getDriverHandle()->getSvmAllocsManager(), *this->device->getNEODevice());
}
auto ioh = (this->commandContainer.getIndirectHeap(NEO::IndirectHeap::Type::INDIRECT_OBJECT));
NEO::IndirectHeap *dsh = nullptr;
NEO::IndirectHeap *ssh = nullptr;
@@ -182,15 +162,56 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImm
}
}
auto completionStamp = this->csr->flushTask(
*commandStream,
commandStreamStart,
return this->csr->flushTask(
cmdStreamTask,
taskStartOffset,
dsh,
ioh,
ssh,
this->csr->peekTaskLevel(),
dispatchFlags,
*(this->device->getNEODevice()));
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies) {
this->commandContainer.removeDuplicatesFromResidencyContainer();
auto commandStream = this->commandContainer.getCommandStream();
size_t commandStreamStart = this->cmdListCurrentStartOffset;
auto lockCSR = this->csr->obtainUniqueOwnership();
std::unique_lock<std::mutex> lockForIndirect;
if (this->hasIndirectAllocationsAllowed()) {
this->cmdQImmediate->handleIndirectAllocationResidency(this->getUnifiedMemoryControls(), lockForIndirect, performMigration);
}
if (performMigration) {
auto deviceImp = static_cast<DeviceImp *>(this->device);
auto pageFaultManager = deviceImp->getDriverHandle()->getMemoryManager()->getPageFaultManager();
if (pageFaultManager == nullptr) {
performMigration = false;
}
}
this->cmdQImmediate->makeResidentAndMigrate(performMigration, this->commandContainer.getResidencyContainer());
if (performMigration) {
this->migrateSharedAllocations();
}
if (this->performMemoryPrefetch) {
auto prefetchManager = this->device->getDriverHandle()->getMemoryManager()->getPrefetchManager();
prefetchManager->migrateAllocationsToGpu(this->getPrefetchContext(), *this->device->getDriverHandle()->getSvmAllocsManager(), *this->device->getNEODevice());
}
NEO::CompletionStamp completionStamp;
if (isCopyOnly()) {
completionStamp = flushBcsTask(*commandStream, commandStreamStart, hasStallingCmds, hasRelaxedOrderingDependencies);
} else {
completionStamp = flushRegularTask(*commandStream, commandStreamStart, hasStallingCmds, hasRelaxedOrderingDependencies);
}
if (completionStamp.taskCount > NEO::CompletionStamp::notReady) {
if (completionStamp.taskCount == NEO::CompletionStamp::outOfHostMemory) {

View File

@@ -133,7 +133,7 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
commandList->internalUsage = internalUsage;
commandList->cmdListType = CommandListType::TYPE_IMMEDIATE;
commandList->isSyncModeQueue = (desc->mode == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS);
if ((!NEO::EngineHelper::isCopyOnlyEngineType(engineGroupType)) && !internalUsage) {
if (!internalUsage) {
commandList->isFlushTaskSubmissionEnabled = hwHelper.isPlatformFlushTaskEnabled(hwInfo);
if (NEO::DebugManager.flags.EnableFlushTaskSubmission.get() != -1) {
commandList->isFlushTaskSubmissionEnabled = !!NEO::DebugManager.flags.EnableFlushTaskSubmission.get();

View File

@@ -1289,7 +1289,7 @@ HWTEST2_F(CommandListCreateWithBcs,
EXPECT_TRUE(commandList->isCopyOnly());
}
HWTEST2_F(CommandListCreateWithBcs, givenForceFlushTaskEnabledWhenCreatingCommandListUsingLinkedCopyThenFlushTaskModeNotUsed, IsAtLeastXeHpCore) {
HWTEST2_F(CommandListCreateWithBcs, givenForceFlushTaskEnabledWhenCreatingCommandListUsingLinkedCopyThenFlushTaskModeUsed, IsAtLeastXeHpCore) {
DebugManagerStateRestore restorer;
NEO::DebugManager.flags.EnableFlushTaskSubmission.set(1);
@@ -1306,7 +1306,7 @@ HWTEST2_F(CommandListCreateWithBcs, givenForceFlushTaskEnabledWhenCreatingComman
ASSERT_NE(nullptr, commandList);
EXPECT_TRUE(commandList->isCopyOnly());
EXPECT_FALSE(commandList->isFlushTaskSubmissionEnabled);
EXPECT_TRUE(commandList->isFlushTaskSubmissionEnabled);
}
HWTEST2_F(CommandListCreate, whenGettingCommandsToPatchThenCorrectValuesAreReturned, IsAtLeastSkl) {

View File

@@ -861,7 +861,7 @@ HWTEST_F(CommandListCreate, givenAsyncCmdQueueAndTbxCsrWithCopyOnlyImmediateComm
EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
}
HWTEST_F(CommandListCreate, givenFlushTaskFlagEnabledAndAsyncCmdQueueWithCopyOnlyImmediateCommandListCreatedThenSlushTaskSubmissionIsSetToFalse) {
HWTEST_F(CommandListCreate, givenFlushTaskFlagEnabledAndAsyncCmdQueueWithCopyOnlyImmediateCommandListCreatedThenFlushTaskSubmissionIsSetToTrue) {
DebugManagerStateRestore restorer;
NEO::DebugManager.flags.EnableFlushTaskSubmission.set(true);
@@ -871,7 +871,7 @@ HWTEST_F(CommandListCreate, givenFlushTaskFlagEnabledAndAsyncCmdQueueWithCopyOnl
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::Copy, returnValue));
ASSERT_NE(nullptr, commandList);
EXPECT_EQ(false, commandList->isFlushTaskSubmissionEnabled);
EXPECT_TRUE(commandList->isFlushTaskSubmissionEnabled);
}
HWTEST2_F(CommandListCreate, givenAllValuesTbxAndSyncModeFlagsWhenCheckingWaitlistEventSyncRequiredThenExpectTrueOnlyForTbxTrueAndAsyncMode, IsAtLeastSkl) {

View File

@@ -182,6 +182,191 @@ HWTEST2_F(AppendMemoryCopy, givenAsyncImmediateCommandListWhenAppendingMemoryCop
commandList->cmdQImmediate = nullptr;
}
HWTEST2_F(AppendMemoryCopy, givenAsyncImmediateCommandListWhenAppendingMemoryCopyWithCopyEngineThenProgramCmdStreamWithFlushTask, IsAtLeastSkl) {
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;

    // Parses [streamBase, streamBase + sizeUsed) and reports whether it contains
    // an MI_FLUSH_DW whose destination address equals tagAddress.
    auto containsTagUpdate = [](void *streamBase, size_t sizeUsed, uint64_t tagAddress) -> bool {
        GenCmdList parsedCmds;
        EXPECT_TRUE(FamilyType::PARSE::parseCommandBuffer(parsedCmds, streamBase, sizeUsed));
        for (auto it = find<MI_FLUSH_DW *>(parsedCmds.begin(), parsedCmds.end()); it != parsedCmds.end(); ++it) {
            auto flushDw = genCmdCast<MI_FLUSH_DW *>(*it);
            if (flushDw != nullptr && flushDw->getDestinationAddress() == tagAddress) {
                return true;
            }
        }
        return false;
    };

    DebugManagerStateRestore restore;
    NEO::DebugManager.flags.EnableFlushTaskSubmission.set(1);

    auto &&defaultEngine = device->getNEODevice()->getDefaultEngine();
    auto csrUnderTest = static_cast<UltCommandStreamReceiver<FamilyType> *>(defaultEngine.commandStreamReceiver);

    auto mockQueue = std::make_unique<Mock<CommandQueue>>();
    mockQueue->csr = csrUnderTest;

    void *copySrc = reinterpret_cast<void *>(0x1234);
    void *copyDst = reinterpret_cast<void *>(0x2345);

    // Asynchronous copy-only immediate command list that submits through flush task.
    auto immCmdList = std::make_unique<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>>();
    ASSERT_NE(nullptr, immCmdList);
    immCmdList->isFlushTaskSubmissionEnabled = true;
    ASSERT_EQ(ZE_RESULT_SUCCESS, immCmdList->initialize(device, NEO::EngineGroupType::Copy, 0u));
    immCmdList->cmdListType = CommandList::CommandListType::TYPE_IMMEDIATE;
    immCmdList->isSyncModeQueue = false;
    immCmdList->device = device;
    immCmdList->csr = csrUnderTest;
    immCmdList->cmdQImmediate = mockQueue.get();

    // Both streams are re-queried on every use, exactly as a direct call would do.
    auto csrUsed = [&]() { return csrUnderTest->getCS(0).getUsed(); };
    auto listStream = [&]() { return immCmdList->commandContainer.getCommandStream(); };

    // First submission: the CSR stream starts empty and may receive hardware context programming.
    EXPECT_EQ(0u, csrUsed());

    const size_t hwContextCmdsSize = csrUnderTest->getCmdsSizeForHardwareContext();
    const bool expectHwContextProgramming = (hwContextCmdsSize > 0);
    const size_t expectedCsrUsage = expectHwContextProgramming
                                        ? alignUp(hwContextCmdsSize + sizeof(MI_BATCH_BUFFER_START), MemoryConstants::cacheLineSize)
                                        : size_t{0};

    ASSERT_EQ(ZE_RESULT_SUCCESS, immCmdList->appendMemoryCopy(copyDst, copySrc, 8, nullptr, 0, nullptr));
    EXPECT_EQ(expectedCsrUsage, csrUsed());

    // Walk the CSR stream: optional system memory fence first, then the jump to the task stream.
    size_t csrParseOffset = 0;
    if constexpr (FamilyType::isUsingMiMemFence) {
        if (csrUnderTest->globalFenceAllocation) {
            using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;

            auto memFenceCmd = genCmdCast<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(csrUnderTest->getCS(0).getCpuBase());
            ASSERT_NE(nullptr, memFenceCmd);
            EXPECT_EQ(csrUnderTest->globalFenceAllocation->getGpuAddress(), memFenceCmd->getSystemMemoryFenceAddress());

            csrParseOffset += sizeof(STATE_SYSTEM_MEM_FENCE_ADDRESS);
        }
    }

    if (expectHwContextProgramming) {
        auto jumpToTaskStream = genCmdCast<MI_BATCH_BUFFER_START *>(ptrOffset(csrUnderTest->getCS(0).getCpuBase(), csrParseOffset));
        ASSERT_NE(nullptr, jumpToTaskStream);
        EXPECT_EQ(listStream()->getGpuBase(), jumpToTaskStream->getBatchBufferStartAddress());
    }

    // Asynchronous mode: no tag update is expected in the task stream.
    EXPECT_FALSE(containsTagUpdate(listStream()->getCpuBase(),
                                   listStream()->getUsed(),
                                   csrUnderTest->getTagAllocation()->getGpuAddress()));

    // Don't program CSR state on next submit
    const size_t csrUsedBefore = csrUsed();
    const size_t listUsedBefore = listStream()->getUsed();

    ASSERT_EQ(ZE_RESULT_SUCCESS, immCmdList->appendMemoryCopy(copyDst, copySrc, 8, nullptr, 0, nullptr));
    EXPECT_EQ(csrUsedBefore, csrUsed());

    // Only the commands appended by the second copy are inspected.
    EXPECT_FALSE(containsTagUpdate(ptrOffset(listStream()->getCpuBase(), listUsedBefore),
                                   listStream()->getUsed() - listUsedBefore,
                                   csrUnderTest->getTagAllocation()->getGpuAddress()));
}
HWTEST2_F(AppendMemoryCopy, givenSyncImmediateCommandListWhenAppendingMemoryCopyWithCopyEngineThenProgramCmdStreamWithFlushTask, IsAtLeastSkl) {
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;

    // Parses [streamBase, streamBase + sizeUsed) and reports whether it contains
    // an MI_FLUSH_DW whose destination address equals tagAddress.
    auto containsTagUpdate = [](void *streamBase, size_t sizeUsed, uint64_t tagAddress) -> bool {
        GenCmdList parsedCmds;
        EXPECT_TRUE(FamilyType::PARSE::parseCommandBuffer(parsedCmds, streamBase, sizeUsed));
        for (auto it = find<MI_FLUSH_DW *>(parsedCmds.begin(), parsedCmds.end()); it != parsedCmds.end(); ++it) {
            auto flushDw = genCmdCast<MI_FLUSH_DW *>(*it);
            if (flushDw != nullptr && flushDw->getDestinationAddress() == tagAddress) {
                return true;
            }
        }
        return false;
    };

    DebugManagerStateRestore restore;
    NEO::DebugManager.flags.EnableFlushTaskSubmission.set(1);

    auto &&defaultEngine = device->getNEODevice()->getDefaultEngine();
    auto csrUnderTest = static_cast<UltCommandStreamReceiver<FamilyType> *>(defaultEngine.commandStreamReceiver);

    auto mockQueue = std::make_unique<Mock<CommandQueue>>();
    mockQueue->csr = csrUnderTest;

    void *copySrc = reinterpret_cast<void *>(0x1234);
    void *copyDst = reinterpret_cast<void *>(0x2345);

    // Synchronous copy-only immediate command list that submits through flush task.
    auto immCmdList = std::make_unique<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>>();
    ASSERT_NE(nullptr, immCmdList);
    immCmdList->isFlushTaskSubmissionEnabled = true;
    ASSERT_EQ(ZE_RESULT_SUCCESS, immCmdList->initialize(device, NEO::EngineGroupType::Copy, 0u));
    immCmdList->cmdListType = CommandList::CommandListType::TYPE_IMMEDIATE;
    immCmdList->isSyncModeQueue = true;
    immCmdList->device = device;
    immCmdList->csr = csrUnderTest;
    immCmdList->cmdQImmediate = mockQueue.get();

    // Both streams are re-queried on every use, exactly as a direct call would do.
    auto csrUsed = [&]() { return csrUnderTest->getCS(0).getUsed(); };
    auto listStream = [&]() { return immCmdList->commandContainer.getCommandStream(); };

    // First submission: the CSR stream starts empty and may receive hardware context programming.
    EXPECT_EQ(0u, csrUsed());

    const size_t hwContextCmdsSize = csrUnderTest->getCmdsSizeForHardwareContext();
    const bool expectHwContextProgramming = (hwContextCmdsSize > 0);
    const size_t expectedCsrUsage = expectHwContextProgramming
                                        ? alignUp(hwContextCmdsSize + sizeof(MI_BATCH_BUFFER_START), MemoryConstants::cacheLineSize)
                                        : size_t{0};

    ASSERT_EQ(ZE_RESULT_SUCCESS, immCmdList->appendMemoryCopy(copyDst, copySrc, 8, nullptr, 0, nullptr));
    EXPECT_EQ(expectedCsrUsage, csrUsed());

    // Walk the CSR stream: optional system memory fence first, then the jump to the task stream.
    size_t csrParseOffset = 0;
    if constexpr (FamilyType::isUsingMiMemFence) {
        if (csrUnderTest->globalFenceAllocation) {
            using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;

            auto memFenceCmd = genCmdCast<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(csrUnderTest->getCS(0).getCpuBase());
            ASSERT_NE(nullptr, memFenceCmd);
            EXPECT_EQ(csrUnderTest->globalFenceAllocation->getGpuAddress(), memFenceCmd->getSystemMemoryFenceAddress());

            csrParseOffset += sizeof(STATE_SYSTEM_MEM_FENCE_ADDRESS);
        }
    }

    if (expectHwContextProgramming) {
        auto jumpToTaskStream = genCmdCast<MI_BATCH_BUFFER_START *>(ptrOffset(csrUnderTest->getCS(0).getCpuBase(), csrParseOffset));
        ASSERT_NE(nullptr, jumpToTaskStream);
        EXPECT_EQ(listStream()->getGpuBase(), jumpToTaskStream->getBatchBufferStartAddress());
    }

    // Synchronous mode: the task stream must carry a tag update.
    EXPECT_TRUE(containsTagUpdate(listStream()->getCpuBase(),
                                  listStream()->getUsed(),
                                  csrUnderTest->getTagAllocation()->getGpuAddress()));

    // Don't program CSR state on next submit
    const size_t csrUsedBefore = csrUsed();
    const size_t listUsedBefore = listStream()->getUsed();

    ASSERT_EQ(ZE_RESULT_SUCCESS, immCmdList->appendMemoryCopy(copyDst, copySrc, 8, nullptr, 0, nullptr));
    EXPECT_EQ(csrUsedBefore, csrUsed());

    // Only the commands appended by the second copy are inspected.
    EXPECT_TRUE(containsTagUpdate(ptrOffset(listStream()->getCpuBase(), listUsedBefore),
                                  listStream()->getUsed() - listUsedBefore,
                                  csrUnderTest->getTagAllocation()->getGpuAddress()));
}
HWTEST2_F(AppendMemoryCopy, givenSyncModeImmediateCommandListWhenAppendingMemoryCopyWithCopyEngineThenSuccessIsReturned, IsAtLeastSkl) {
Mock<CommandQueue> cmdQueue;
void *srcPtr = reinterpret_cast<void *>(0x1234);

View File

@@ -558,6 +558,12 @@ class CommandStreamReceiverMock : public CommandStreamReceiver {
return cs;
}
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
                             const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override {
    // Stub: ignores every argument and reports an empty-initialized completion stamp.
    return {};
}
bool flushBatchedSubmissions() override { return true; }
CommandStreamReceiverType getType() const override {

View File

@@ -86,9 +86,10 @@ class CommandStreamReceiver {
virtual SubmissionStatus flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) = 0;
virtual CompletionStamp flushTask(LinearStream &commandStream, size_t commandStreamStart,
virtual CompletionStamp flushTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) = 0;
// Submits a task command stream through the copy-engine (BCS) flush path.
//   commandStreamTask      - stream holding the task's commands
//   commandStreamTaskStart - offset within commandStreamTask where this task begins
//   dispatchBcsFlags       - per-submission options (task-count flush, stalling commands, relaxed ordering)
//   hwInfo                 - hardware description forwarded to command encoding
// Returns the completion stamp describing the submission. Parameter names match the
// overrides and the flushTask() declaration.
virtual CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart, const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) = 0;
virtual bool flushBatchedSubmissions() = 0;
MOCKABLE_VIRTUAL SubmissionStatus submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);

View File

@@ -45,6 +45,8 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) override;
// Hardware implementation of the copy-engine flush path: optionally appends a tag update to
// commandStreamTask, programs hardware context into the CSR stream when needed and submits.
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart, const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override;
void forcePipeControl(NEO::LinearStream &commandStreamCSR);
bool flushBatchedSubmissions() override;
@@ -58,6 +60,8 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
size_t getRequiredStateBaseAddressSize(const Device &device) const;
size_t getRequiredCmdStreamSize(const DispatchFlags &dispatchFlags, Device &device);
size_t getRequiredCmdStreamSizeAligned(const DispatchFlags &dispatchFlags, Device &device);
// BCS flush-task overloads: CSR stream space for hardware context commands plus one
// MI_BATCH_BUFFER_START; the "Aligned" variant rounds that size up to a cache line.
size_t getRequiredCmdStreamSize(const DispatchBcsFlags &dispatchBcsFlags);
size_t getRequiredCmdStreamSizeAligned(const DispatchBcsFlags &dispatchBcsFlags);
size_t getRequiredCmdSizeForPreamble(Device &device) const;
size_t getCmdSizeForPreemption(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForEpilogue(const DispatchFlags &dispatchFlags) const;

View File

@@ -179,6 +179,79 @@ size_t CommandStreamReceiverHw<GfxFamily>::getCmdsSizeForHardwareContext() const
return getCmdSizeForPrologue();
}
// Submits a task command stream through the copy-engine (BCS) flush path and returns the
// resulting completion stamp.
//   commandStreamTask      - stream holding the task's commands; a tag update (optional) and
//                            the ending command are appended to it here
//   commandStreamTaskStart - offset in commandStreamTask where this task begins
//   dispatchBcsFlags       - flushTaskCount requests a tag update; the other flags are
//                            forwarded to the ending command / batch buffer description
//   hwInfo                 - forwarded to MI_FLUSH_DW encoding
// On submission failure the returned stamp carries an error task count derived from the
// submission status and taskCount is not incremented.
template <typename GfxFamily>
CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) {
// Only immediate dispatch mode is supported on this path.
UNRECOVERABLE_IF(this->dispatchMode != DispatchMode::ImmediateDispatch);
// GPU address at which this task's commands begin.
uint64_t taskStartAddress = commandStreamTask.getGpuBase() + commandStreamTaskStart;
// When requested, append an MI_FLUSH_DW with post-sync that writes the next task count
// (current + 1) to the tag allocation's GPU address.
if (dispatchBcsFlags.flushTaskCount) {
uint64_t postSyncAddress = getTagAllocation()->getGpuAddress();
TaskCountType postSyncData = peekTaskCount() + 1;
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
args.notifyEnable = isUsedNotifyEnableForPostSync();
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(commandStreamTask, postSyncAddress, postSyncData, args, hwInfo);
}
// Obtain the CSR stream with enough room for hardware context programming plus a jump,
// and remember its usage so it can be detected whether anything gets written to it.
auto &commandStreamCSR = getCS(getRequiredCmdStreamSizeAligned(dispatchBcsFlags));
size_t commandStreamStartCSR = commandStreamCSR.getUsed();
programHardwareContext(commandStreamCSR);
if (globalFenceAllocation) {
makeResident(*globalFenceAllocation);
}
// The tag allocation is made resident only when a tag update was programmed above.
if (dispatchBcsFlags.flushTaskCount) {
makeResident(*getTagAllocation());
}
// True when programHardwareContext() emitted commands into the CSR stream.
bool submitCSR = (commandStreamStartCSR != commandStreamCSR.getUsed());
void *bbEndLocation = nullptr;
// Terminate the task stream and pad it to a cache line.
// NOTE(review): bbEndLocation is presumably filled with the ending command's location - confirm in programEndingCmd.
programEndingCmd(commandStreamTask, &bbEndLocation, isBlitterDirectSubmissionEnabled(), dispatchBcsFlags.hasRelaxedOrderingDependencies, false);
EncodeNoop<GfxFamily>::alignToCacheLine(commandStreamTask);
// If the CSR stream received commands, chain it to the task stream with a batch buffer start.
if (submitCSR) {
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(commandStreamCSR.getSpace(sizeof(MI_BATCH_BUFFER_START)));
addBatchBufferStart(bbStart, taskStartAddress, false);
EncodeNoop<GfxFamily>::alignToCacheLine(commandStreamCSR);
this->makeResident(*commandStreamCSR.getGraphicsAllocation());
}
// Submit the CSR stream when it has content, otherwise submit the task stream directly.
size_t startOffset = submitCSR ? commandStreamStartCSR : commandStreamTaskStart;
auto &streamToSubmit = submitCSR ? commandStreamCSR : commandStreamTask;
// NOTE(review): BatchBuffer is initialized positionally; the meaning of each slot is defined
// by BatchBuffer itself and is not visible here.
BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, 0, taskStartAddress, nullptr,
false, false, QueueThrottle::MEDIUM, NEO::QueueSliceCount::defaultSliceCount,
streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, false, (submitCSR || dispatchBcsFlags.hasStallingCmds),
dispatchBcsFlags.hasRelaxedOrderingDependencies};
// Stamp the submitted allocation with the task count this submission will become.
streamToSubmit.getGraphicsAllocation()->updateTaskCount(this->taskCount + 1, this->osContext->getContextId());
streamToSubmit.getGraphicsAllocation()->updateResidencyTaskCount(this->taskCount + 1, this->osContext->getContextId());
auto submissionStatus = flushHandler(batchBuffer, this->getResidencyAllocations());
// Failed submission: report the error through the stamp's task count and leave taskCount unchanged.
if (submissionStatus != SubmissionStatus::SUCCESS) {
CompletionStamp completionStamp = {CompletionStamp::getTaskCountFromSubmissionStatusError(submissionStatus)};
return completionStamp;
}
// latestFlushedTaskCount advances only when a tag update was programmed for this submission.
if (dispatchBcsFlags.flushTaskCount) {
this->latestFlushedTaskCount = this->taskCount + 1;
}
++taskCount;
CompletionStamp completionStamp = {taskCount, taskLevel, flushStamp->peekStamp()};
return completionStamp;
}
template <typename GfxFamily>
CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
LinearStream &commandStreamTask,
@@ -856,6 +929,16 @@ inline bool CommandStreamReceiverHw<GfxFamily>::flushBatchedSubmissions() {
return submitResult;
}
template <typename GfxFamily>
size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSize(const DispatchBcsFlags &dispatchBcsFlags) {
    // CSR stream budget for a BCS flush: hardware context commands followed by
    // a single batch buffer start. The flags do not influence the size.
    using BatchBufferStart = typename GfxFamily::MI_BATCH_BUFFER_START;

    const size_t hwContextCmdsSize = getCmdsSizeForHardwareContext();
    return hwContextCmdsSize + sizeof(BatchBufferStart);
}
template <typename GfxFamily>
size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSizeAligned(const DispatchBcsFlags &dispatchBcsFlags) {
    // Round the BCS CSR stream requirement up to a whole cache line.
    const size_t unalignedSize = getRequiredCmdStreamSize(dispatchBcsFlags);
    return alignUp(unalignedSize, MemoryConstants::cacheLineSize);
}
template <typename GfxFamily>
size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSizeAligned(const DispatchFlags &dispatchFlags, Device &device) {
size_t size = getRequiredCmdStreamSize(dispatchFlags, device);

View File

@@ -48,6 +48,17 @@ constexpr uint32_t l3AndL1On = 2u;
constexpr uint32_t NotApplicable = 3u;
} // namespace L3CachingSettings
// Per-submission options for the copy-engine (BCS) flush-task path.
// All three values must be supplied explicitly; there is no default constructor.
struct DispatchBcsFlags {
    // Request a tag update carrying the next task count for this submission.
    bool flushTaskCount = false;
    // The submitted commands contain stalling commands.
    bool hasStallingCmds = false;
    // The submission has relaxed-ordering dependencies.
    bool hasRelaxedOrderingDependencies = false;

    DispatchBcsFlags() = delete;

    DispatchBcsFlags(bool flushTaskCount, bool hasStallingCmds, bool hasRelaxedOrderingDependencies)
        : flushTaskCount{flushTaskCount},
          hasStallingCmds{hasStallingCmds},
          hasRelaxedOrderingDependencies{hasRelaxedOrderingDependencies} {
    }
};
struct DispatchFlags {
DispatchFlags() = delete;
DispatchFlags(CsrDependencies csrDependenciesP, TimestampPacketContainer *barrierTimestampPacketNodesP, PipelineSelectArgs pipelineSelectArgsP,

View File

@@ -26,3 +26,10 @@ CompletionStamp MockCommandStreamReceiver::flushTask(
CompletionStamp stamp = {taskCount, taskLevel, flushStamp->peekStamp()};
return stamp;
}
CompletionStamp MockCommandStreamReceiver::flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
                                                        const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) {
    // Nothing is submitted: the mock only advances its task count and
    // reports the updated count together with the current level and flush stamp.
    ++taskCount;
    return {taskCount, taskLevel, flushStamp->peekStamp()};
}

View File

@@ -104,6 +104,9 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
DispatchFlags &dispatchFlags,
Device &device) override;
// Mock of the copy-engine flush path; the out-of-line definition increments taskCount and
// returns a stamp built from taskCount, taskLevel and the flush stamp.
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override;
bool flushBatchedSubmissions() override {
if (flushBatchedSubmissionsCallCounter) {
(*flushBatchedSubmissionsCallCounter)++;