Avoid implicit sync for async mode immediate copy queue

Related-To: LOCI-1988

Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@intel.com>
This commit is contained in:
Aravind Gopalakrishnan 2022-04-25 23:49:47 +00:00 committed by Compute-Runtime-Automation
parent ced22d45e9
commit 31b2433b2f
10 changed files with 158 additions and 19 deletions

View File

@ -257,6 +257,7 @@ struct CommandList : _ze_command_list_handle_t {
// Partition count; defaults to 1 and may be overwritten in initialize() with the
// bit count of the NEO device bitfield.
uint32_t partitionCount = 1;
// Whether flush-task submission is used. Taken from the platform default
// (isPlatformFlushTaskEnabled) and overridden by the EnableFlushTaskSubmission
// debug flag when that flag is not -1.
bool isFlushTaskSubmissionEnabled = false;
// Set by createImmediate() when the queue descriptor requests
// ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS.
bool isSyncModeQueue = false;
// Set by createImmediate() when the command stream receiver type is CSR_TBX or
// CSR_TBX_WITH_AUB.
bool isTbxMode = false;
bool commandListSLMEnabled = false;
bool requiresQueueUncachedMocs = false;

View File

@ -117,12 +117,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::initialize(Device *device, NEO
this->partitionCount = static_cast<uint32_t>(this->device->getNEODevice()->getDeviceBitfield().count());
}
if (this->cmdListType == CommandListType::TYPE_IMMEDIATE && !isCopyOnly() && !isInternal()) {
const auto &hwInfo = device->getHwInfo();
this->isFlushTaskSubmissionEnabled = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily).isPlatformFlushTaskEnabled(hwInfo);
if (NEO::DebugManager.flags.EnableFlushTaskSubmission.get() != -1) {
this->isFlushTaskSubmissionEnabled = !!NEO::DebugManager.flags.EnableFlushTaskSubmission.get();
}
if (this->isFlushTaskSubmissionEnabled) {
commandContainer.setFlushTaskUsedForImmediate(this->isFlushTaskSubmissionEnabled);
}
@ -149,17 +144,22 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::executeCommandListImmediate(bo
this->close();
ze_command_list_handle_t immediateHandle = this->toHandle();
this->commandContainer.removeDuplicatesFromResidencyContainer();
const auto commandListExecutionResult = this->cmdQImmediate->executeCommandLists(1, &immediateHandle, nullptr, performMigration);
if (commandListExecutionResult == ZE_RESULT_ERROR_DEVICE_LOST) {
return commandListExecutionResult;
}
const auto synchronizationResult = this->cmdQImmediate->synchronize(std::numeric_limits<uint64_t>::max());
if (synchronizationResult == ZE_RESULT_ERROR_DEVICE_LOST) {
return synchronizationResult;
}
if (this->isCopyOnly() && !this->isSyncModeQueue && !this->isTbxMode) {
this->commandContainer.currentLinearStreamStartOffset = this->commandContainer.getCommandStream()->getUsed();
} else {
const auto synchronizationResult = this->cmdQImmediate->synchronize(std::numeric_limits<uint64_t>::max());
if (synchronizationResult == ZE_RESULT_ERROR_DEVICE_LOST) {
return synchronizationResult;
}
this->reset();
this->reset();
}
return ZE_RESULT_SUCCESS;
}

View File

@ -114,6 +114,13 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
commandList->internalUsage = internalUsage;
commandList->cmdListType = CommandListType::TYPE_IMMEDIATE;
commandList->isSyncModeQueue = (desc->mode == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS);
if (!(NEO::EngineGroupType::Copy == engineType) && !internalUsage) {
const auto &hwInfo = device->getHwInfo();
commandList->isFlushTaskSubmissionEnabled = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily).isPlatformFlushTaskEnabled(hwInfo);
if (NEO::DebugManager.flags.EnableFlushTaskSubmission.get() != -1) {
commandList->isFlushTaskSubmissionEnabled = !!NEO::DebugManager.flags.EnableFlushTaskSubmission.get();
}
}
returnValue = commandList->initialize(device, engineType, desc->flags);
if (returnValue != ZE_RESULT_SUCCESS) {
commandList->destroy();
@ -130,6 +137,7 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
commandList->cmdQImmediate = commandQueue;
commandList->csr = csr;
commandList->isTbxMode = (csr->getType() == NEO::CommandStreamReceiverType::CSR_TBX) || (csr->getType() == NEO::CommandStreamReceiverType::CSR_TBX_WITH_AUB);
commandList->commandListPreemptionMode = device->getDevicePreemptionMode();
return commandList;
}

View File

@ -369,6 +369,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
auto commandList = CommandList::fromHandle(phCommandLists[i]);
auto &cmdBufferAllocations = commandList->commandContainer.getCmdBufferAllocations();
auto cmdBufferCount = cmdBufferAllocations.size();
bool immediateMode = (commandList->cmdListType == CommandList::CommandListType::TYPE_IMMEDIATE) ? true : false;
auto commandListPreemption = commandList->getCommandListPreemptionMode();
if (statePreemption != commandListPreemption) {
@ -413,7 +414,11 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
for (size_t iter = 0; iter < cmdBufferCount; iter++) {
auto allocation = cmdBufferAllocations[iter];
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&child, allocation->getGpuAddress(), true);
uint64_t startOffset = allocation->getGpuAddress();
if (immediateMode && (iter == (cmdBufferCount - 1))) {
startOffset = ptrOffset(allocation->getGpuAddress(), commandList->commandContainer.currentLinearStreamStartOffset);
}
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&child, startOffset, true);
}
printfFunctionContainer.insert(printfFunctionContainer.end(),

View File

@ -1694,7 +1694,7 @@ HWTEST_F(CommandListCreate, givenCommandListWithCopyOnlyWhenSetBarrierThenMiFlus
EXPECT_NE(cmdList.end(), itor);
}
HWTEST_F(CommandListCreate, givenImmediateCommandListWithCopyOnlyWhenSetBarrierThenMiFlushCmdIsNotInsertedInTheCmdContainer) {
HWTEST_F(CommandListCreate, givenImmediateCommandListWithCopyOnlyWhenSetBarrierThenMiFlushCmdIsInsertedInTheCmdContainer) {
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
@ -1713,7 +1713,7 @@ HWTEST_F(CommandListCreate, givenImmediateCommandListWithCopyOnlyWhenSetBarrierT
cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed()));
auto itor = find<MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(cmdList.end(), itor);
EXPECT_NE(cmdList.end(), itor);
}
HWTEST_F(CommandListCreate, whenCommandListIsResetThenContainsStatelessUncachedResourceIsSetToFalse) {

View File

@ -731,11 +731,11 @@ HWTEST_F(CommandListCreate, givenFlushTaskFlagEnabledAndAsyncCmdQueueAndCopyOnly
cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed()));
auto itor = find<SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(cmdList.end(), itor);
EXPECT_EQ(used, commandContainer.getCommandStream()->getUsed());
EXPECT_NE(cmdList.end(), itor);
EXPECT_GT(commandContainer.getCommandStream()->getUsed(), used);
}
HWTEST_F(CommandListCreate, givenAsyncCmdQueueAndCopyOnlyImmediateCommandListWhenAppendWaitEventsWithSubdeviceScopeThenMiFlushAndSemWaitAreAddedViaFlushTask) {
HWTEST_F(CommandListCreate, givenAsyncCmdQueueAndCopyOnlyImmediateCommandListWhenAppendWaitEventsWithSubdeviceScopeThenMiFlushAndSemWaitAreAdded) {
using SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
ze_command_queue_desc_t desc = {};
@ -763,8 +763,33 @@ HWTEST_F(CommandListCreate, givenAsyncCmdQueueAndCopyOnlyImmediateCommandListWhe
cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed()));
auto itor = find<SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(cmdList.end(), itor);
EXPECT_EQ(used, commandContainer.getCommandStream()->getUsed());
EXPECT_NE(cmdList.end(), itor);
EXPECT_GT(commandContainer.getCommandStream()->getUsed(), used);
}
// Checks that appendWaitOnEvents() reports success on an asynchronous, copy-only
// immediate command list whose isTbxMode flag is set.
HWTEST_F(CommandListCreate, givenAsyncCmdQueueAndTbxCsrWithCopyOnlyImmediateCommandListWhenAppendWaitEventsReturnsSuccess) {
    ze_command_queue_desc_t desc = {};
    desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
    ze_result_t returnValue;
    std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::Copy, returnValue));
    ASSERT_NE(nullptr, commandList);

    EXPECT_EQ(device, commandList->device);
    EXPECT_EQ(1u, commandList->cmdListType);
    EXPECT_NE(nullptr, commandList->cmdQImmediate);

    // The flag is forced directly on the list rather than being derived from a
    // TBX command stream receiver.
    commandList->isTbxMode = true;

    MockEvent event, event2;
    event.signalScope = 0;
    event.waitScope = 0;
    event2.waitScope = 0;
    ze_event_handle_t events[] = {&event, &event2};

    auto ret = commandList->appendWaitOnEvents(2, events);
    EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
}
HWTEST_F(CommandListCreate, givenFlushTaskFlagEnabledAndAsyncCmdQueueWithCopyOnlyImmediateCommandListCreatedThenSlushTaskSubmissionIsSetToFalse) {

View File

@ -1213,6 +1213,59 @@ HWTEST_F(CommandListAppendLaunchKernel, givenInvalidEventListWhenAppendLaunchCoo
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, returnValue);
}
using WithinXeHPAndXeHPC = IsWithinGfxCore<IGFX_XE_HP_CORE, IGFX_XE_HPC_CORE>;
// With flush-task submission disabled through the debug flag, fills the command
// stream of an immediate compute list until only one MI_BATCH_BUFFER_END fits,
// encodes a kernel dispatch, and checks that a second command buffer was
// allocated and that the list can still be executed on its immediate queue.
HWTEST2_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenAppendingKernelWithImmediateListWithoutFlushTaskThenNewCmdBufferAllocated, WithinXeHPAndXeHPC) {
    DebugManagerStateRestore restorer;
    NEO::DebugManager.flags.EnableFlushTaskSubmission.set(0);
    using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
    createKernel();

    ze_result_t returnValue;
    ze_command_queue_desc_t queueDesc = {};
    std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &queueDesc, false, NEO::EngineGroupType::Compute, returnValue));
    // Stop here if creation failed; the list is dereferenced right below.
    ASSERT_NE(nullptr, commandList);

    auto &commandContainer = commandList->commandContainer;
    const auto stream = commandContainer.getCommandStream();
    const auto streamCpu = stream->getCpuBase();

    // Consume the stream so that only sizeof(MI_BATCH_BUFFER_END) bytes remain.
    auto sizeLeftInStream = sizeof(MI_BATCH_BUFFER_END);
    auto available = stream->getAvailableSpace();
    stream->getSpace(available - sizeLeftInStream);

    const uint32_t threadGroupDimensions[3] = {1, 1, 1};
    NEO::EncodeDispatchKernelArgs dispatchKernelArgs{
        0,
        device->getNEODevice(),
        kernel.get(),
        threadGroupDimensions,
        PreemptionMode::MidBatch,
        0,
        false,
        false,
        false,
        false,
        false,
        false,
        false,
        false};
    NEO::EncodeDispatchKernel<FamilyType>::encode(commandContainer, dispatchKernelArgs);

    auto usedSpaceAfter = commandContainer.getCommandStream()->getUsed();
    ASSERT_GT(usedSpaceAfter, 0u);

    // The stream's CPU base must have changed and a second allocation must exist.
    const auto streamCpu2 = stream->getCpuBase();
    EXPECT_NE(nullptr, streamCpu2);
    EXPECT_NE(streamCpu, streamCpu2);
    EXPECT_EQ(2u, commandContainer.getCmdBufferAllocations().size());

    auto immediateHandle = commandList->toHandle();
    returnValue = commandList->cmdQImmediate->executeCommandLists(1, &immediateHandle, nullptr, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
}
HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLaunchCooperativeKernelIsCalledThenCorrectValueIsReturned, IsAtLeastSkl) {
Mock<::L0::Kernel> kernel;
auto pMockModule = std::unique_ptr<Module>(new Mock<Module>(device, nullptr));

View File

@ -220,6 +220,51 @@ HWTEST2_F(AppendMemoryCopy, givenImmediateCommandListWhenAppendingMemoryCopyWith
commandList->cmdQImmediate = nullptr;
}
// Copy-engine immediate list with isSyncModeQueue left at its default: a memory
// copy is expected to call executeCommandLists once and never call synchronize
// on the mocked queue.
HWTEST2_F(AppendMemoryCopy, givenAsyncImmediateCommandListWhenAppendingMemoryCopyWithCopyEngineThenSuccessIsReturned, IsAtLeastSkl) {
    using ImmediateListType = WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>;

    // Declared before the list so the mock outlives it.
    Mock<CommandQueue> mockQueue;
    void *const source = reinterpret_cast<void *>(0x1234);
    void *const destination = reinterpret_cast<void *>(0x2345);

    auto immediateList = std::make_unique<ImmediateListType>();
    ASSERT_NE(nullptr, immediateList);
    const ze_result_t initStatus = immediateList->initialize(device, NEO::EngineGroupType::Copy, 0u);
    ASSERT_EQ(ZE_RESULT_SUCCESS, initStatus);

    // Wire the list to the mocked queue and mark it as an immediate list.
    immediateList->cmdListType = CommandList::CommandListType::TYPE_IMMEDIATE;
    immediateList->cmdQImmediate = &mockQueue;
    immediateList->device = device;

    const auto copyStatus = immediateList->appendMemoryCopy(destination, source, 8, nullptr, 0, nullptr);
    ASSERT_EQ(ZE_RESULT_SUCCESS, copyStatus);

    EXPECT_EQ(1u, mockQueue.executeCommandListsCalled);
    EXPECT_EQ(0u, mockQueue.synchronizeCalled);

    // Detach the stack-allocated mock before the list goes out of scope.
    immediateList->cmdQImmediate = nullptr;
}
// Copy-engine immediate list with isSyncModeQueue set: a memory copy is expected
// to call both executeCommandLists and synchronize exactly once on the mocked
// queue.
HWTEST2_F(AppendMemoryCopy, givenSyncModeImmediateCommandListWhenAppendingMemoryCopyWithCopyEngineThenSuccessIsReturned, IsAtLeastSkl) {
    using ImmediateListType = WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>;

    // Declared before the list so the mock outlives it.
    Mock<CommandQueue> mockQueue;
    void *const source = reinterpret_cast<void *>(0x1234);
    void *const destination = reinterpret_cast<void *>(0x2345);

    auto immediateList = std::make_unique<ImmediateListType>();
    ASSERT_NE(nullptr, immediateList);
    const ze_result_t initStatus = immediateList->initialize(device, NEO::EngineGroupType::Copy, 0u);
    ASSERT_EQ(ZE_RESULT_SUCCESS, initStatus);

    // Wire the list to the mocked queue, mark it immediate and synchronous.
    immediateList->isSyncModeQueue = true;
    immediateList->cmdListType = CommandList::CommandListType::TYPE_IMMEDIATE;
    immediateList->cmdQImmediate = &mockQueue;
    immediateList->device = device;

    const auto copyStatus = immediateList->appendMemoryCopy(destination, source, 8, nullptr, 0, nullptr);
    ASSERT_EQ(ZE_RESULT_SUCCESS, copyStatus);

    EXPECT_EQ(1u, mockQueue.executeCommandListsCalled);
    EXPECT_EQ(1u, mockQueue.synchronizeCalled);

    // Detach the stack-allocated mock before the list goes out of scope.
    immediateList->cmdQImmediate = nullptr;
}
HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyCalledThenPipeControlWithDcFlushAdded, IsAtLeastSkl) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

View File

@ -286,6 +286,7 @@ void CommandContainer::closeAndAllocateNextCommandBuffer() {
auto ptr = commandStream->getSpace(0u);
memcpy_s(ptr, bbEndSize, hwHelper.getBatchBufferEndReference(), bbEndSize);
allocateNextCommandBuffer();
currentLinearStreamStartOffset = 0u;
}
void CommandContainer::prepareBindfulSsh() {

View File

@ -80,6 +80,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
uint32_t nextIddInBlock = 0;
bool lastPipelineSelectModeRequired = false;
bool lastSentUseGlobalAtomics = false;
// Byte offset into the current command stream. An immediate copy-only list that
// skips the queue synchronize stores the stream's used size here; the command
// queue adds it to the last command buffer's GPU address when programming the
// batch-buffer start for immediate lists. Reset to 0 in
// closeAndAllocateNextCommandBuffer().
uint64_t currentLinearStreamStartOffset = 0u;
// Returns the stored device pointer.
Device *getDevice() const { return device; }