performance: add implementation of cmdlist immediate flush task

Related-To: NEO-7808

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2023-06-30 18:29:47 +00:00 committed by Compute-Runtime-Automation
parent c18198ebd8
commit 866e3c37ba
8 changed files with 553 additions and 25 deletions

View File

@ -2657,6 +2657,10 @@ void CommandListCoreFamily<gfxCoreFamily>::updateStreamPropertiesForFlushTaskDis
requiredStreamState.frontEndState.setPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(isCooperative, fusedEuDisabled);
requiredStreamState.pipelineSelect.setPropertySystolicMode(kernelAttributes.flags.usesSystolicPipelineSelectMode);
KernelImp &kernelImp = static_cast<KernelImp &>(kernel);
int32_t currentMocsState = static_cast<int32_t>(device->getMOCS(!kernelImp.getKernelRequiresUncachedMocs(), false) >> 1);
requiredStreamState.stateBaseAddress.setPropertyStatelessMocs(currentMocsState);
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@ -111,7 +111,67 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushBcsTask
// Submits the recorded immediate command stream through the CSR immediate flush task path.
//
// Before flushing, the heaps used by this command list are made resident and the matching
// state-base-address properties are stored in requiredStreamState. Which heaps are used
// depends on the heap address model:
//   - GlobalStateless: the CSR global stateless heap serves as surface state heap,
//   - shared immediate heaps: the reserved SSH (and DSH when required) from the container,
//   - otherwise: the private SSH (and DSH when required) owned by the command container.
//
// cmdStreamTask                  - command stream holding the commands to flush
// taskStartOffset                - offset in cmdStreamTask at which this task starts
// hasStallingCmds                - forwarded to the CSR in the dispatch flags
// hasRelaxedOrderingDependencies - forwarded to the CSR in the dispatch flags
//
// Returns the completion stamp produced by the CSR's flushImmediateTask.
template <GFXCORE_FAMILY gfxCoreFamily>
NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediateRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies) {
    // The former unconditional "UNSUPPORTED" return was dropped here: it made everything below unreachable.
    bool sbaDirty = this->csr->getGSBAStateDirty();

    NEO::IndirectHeap *dsh = nullptr;
    NEO::IndirectHeap *ssh = nullptr;

    // The indirect object heap is always used; its SBA properties are refreshed only when SBA state is dirty.
    NEO::IndirectHeap *ioh = this->commandContainer.getIndirectHeap(NEO::IndirectHeap::Type::INDIRECT_OBJECT);
    this->csr->makeResident(*ioh->getGraphicsAllocation());
    if (sbaDirty) {
        this->requiredStreamState.stateBaseAddress.setPropertiesIndirectState(ioh->getHeapGpuBase(), ioh->getHeapSizeInPages());
    }

    if (this->cmdListHeapAddressModel == NEO::HeapAddressModel::GlobalStateless) {
        ssh = this->csr->getGlobalStatelessHeap();
        this->csr->makeResident(*ssh->getGraphicsAllocation());
        if (sbaDirty) {
            this->requiredStreamState.stateBaseAddress.setPropertiesSurfaceState(ssh->getHeapGpuBase(), ssh->getHeapSizeInPages());
        }
    } else if (this->immediateCmdListHeapSharing) {
        // Reserved heaps may have no backing allocation yet; residency and SBA properties are
        // handled only when an allocation exists.
        ssh = this->commandContainer.getSurfaceStateHeapReserve().indirectHeapReservation;
        if (ssh->getGraphicsAllocation()) {
            this->csr->makeResident(*ssh->getGraphicsAllocation());
            this->requiredStreamState.stateBaseAddress.setPropertiesBindingTableSurfaceState(ssh->getHeapGpuBase(), ssh->getHeapSizeInPages(),
                                                                                              ssh->getHeapGpuBase(), ssh->getHeapSizeInPages());
        }
        if (this->dynamicHeapRequired) {
            dsh = this->commandContainer.getDynamicStateHeapReserve().indirectHeapReservation;
            if (dsh->getGraphicsAllocation()) {
                this->csr->makeResident(*dsh->getGraphicsAllocation());
                this->requiredStreamState.stateBaseAddress.setPropertiesDynamicState(dsh->getHeapGpuBase(), dsh->getHeapSizeInPages());
            }
        }
    } else {
        // Private heaps owned by the command container.
        if (this->dynamicHeapRequired) {
            dsh = this->commandContainer.getIndirectHeap(NEO::IndirectHeap::Type::DYNAMIC_STATE);
            this->csr->makeResident(*dsh->getGraphicsAllocation());
            this->requiredStreamState.stateBaseAddress.setPropertiesDynamicState(dsh->getHeapGpuBase(), dsh->getHeapSizeInPages());
        }
        ssh = this->commandContainer.getIndirectHeap(NEO::IndirectHeap::Type::SURFACE_STATE);
        this->csr->makeResident(*ssh->getGraphicsAllocation());
        this->requiredStreamState.stateBaseAddress.setPropertiesBindingTableSurfaceState(ssh->getHeapGpuBase(), ssh->getHeapSizeInPages(),
                                                                                          ssh->getHeapGpuBase(), ssh->getHeapSizeInPages());
    }

    // Every branch above assigns ssh, so the CPU base can be read unconditionally.
    void *sshCpuPointer = ssh->getCpuBase();

    NEO::ImmediateDispatchFlags dispatchFlags{
        &this->requiredStreamState,     // requiredState
        sshCpuPointer,                  // sshCpuBase
        this->isSyncModeQueue,          // blockingAppend
        hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies
        hasStallingCmds                 // hasStallingCmds
    };
    this->csr->setRequiredScratchSizes(this->getCommandListPerThreadScratchSize(), this->getCommandListPerThreadPrivateScratchSize());

    CommandListImp::storeReferenceTsToMappedEvents(true);

    return this->csr->flushImmediateTask(
        cmdStreamTask,
        taskStartOffset,
        dispatchFlags,
        *(this->device->getNEODevice()));
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@ -114,6 +114,9 @@ void ModuleMutableCommandListFixture::setUpImpl() {
kernel = std::make_unique<ModuleImmutableDataFixture::MockKernel>(module.get());
createKernel(kernel.get());
this->dshRequired = device->getDeviceInfo().imageSupport;
this->expectedSbaCmds = commandList->doubleSbaWa ? 2 : 1;
}
void ModuleMutableCommandListFixture::setUp(uint32_t revision) {
@ -135,6 +138,10 @@ void ModuleMutableCommandListFixture::tearDown() {
ModuleImmutableDataFixture::tearDown();
}
// Returns the device MOCS value for the requested L3 setting, shifted right by one bit.
// Tests in this change shift the result back left by one before comparing with the
// STATE_BASE_ADDRESS stateless MOCS field.
uint32_t ModuleMutableCommandListFixture::getMocs(bool l3On) {
    const auto deviceMocs = device->getMOCS(l3On, false);
    return deviceMocs >> 1;
}
void FrontEndCommandListFixtureInit::setUp(int32_t dispatchCmdBufferPrimary) {
DebugManager.flags.DispatchCmdlistCmdBufferPrimary.set(dispatchCmdBufferPrimary);
DebugManager.flags.EnableFrontEndTracking.set(1);
@ -169,13 +176,6 @@ void CommandListStateBaseAddressFixture::setUp() {
DebugManager.flags.ForceDefaultHeapSize.set(64);
ModuleMutableCommandListFixture::setUp();
this->dshRequired = device->getDeviceInfo().imageSupport;
this->expectedSbaCmds = commandList->doubleSbaWa ? 2 : 1;
}
// Returns the device MOCS value for the requested L3 setting, shifted right by one bit.
uint32_t CommandListStateBaseAddressFixture::getMocs(bool l3On) {
    const auto deviceMocs = device->getMOCS(l3On, false);
    return deviceMocs >> 1;
}
void CommandListPrivateHeapsFixture::setUp() {
@ -211,6 +211,7 @@ void CommandListGlobalHeapsFixtureInit::setUp() {
void CommandListGlobalHeapsFixtureInit::setUpParams(int32_t globalHeapMode) {
DebugManager.flags.SelectCmdListHeapAddressModel.set(globalHeapMode);
DebugManager.flags.UseImmediateFlushTask.set(0);
CommandListStateBaseAddressFixture::setUp();
DebugManager.flags.SelectCmdListHeapAddressModel.set(static_cast<int32_t>(NEO::HeapAddressModel::PrivateHeaps));
@ -369,9 +370,30 @@ void PrimaryBatchBufferPreamblelessCmdListFixture::tearDown() {
// Sets UseImmediateFlushTask to 1 and ForceL1Caching to 0, then runs the base fixture setup.
// The flags are written first so that they are already in place when the base setup executes.
void ImmediateFlushTaskCmdListFixture::setUp() {
    constexpr int32_t useImmediateFlushTaskValue = 1;
    constexpr int32_t forceL1CachingValue = 0;

    DebugManager.flags.UseImmediateFlushTask.set(useImmediateFlushTaskValue);
    DebugManager.flags.ForceL1Caching.set(forceL1CachingValue);

    ModuleMutableCommandListFixture::setUp();
}
// Selects the GlobalStateless heap address model and then runs the immediate flush task fixture setup.
void ImmediateFlushTaskGlobalStatelessCmdListFixture::setUp() {
    const auto heapAddressModel = static_cast<int32_t>(NEO::HeapAddressModel::GlobalStateless);
    DebugManager.flags.SelectCmdListHeapAddressModel.set(heapAddressModel);

    ImmediateFlushTaskCmdListFixture::setUp();
}
// Enables immediate command list heap sharing on the PrivateHeaps address model,
// then runs the immediate flush task fixture setup.
void ImmediateFlushTaskCsrSharedHeapCmdListFixture::setUp() {
    constexpr int32_t heapSharingValue = 1;
    const auto heapAddressModel = static_cast<int32_t>(NEO::HeapAddressModel::PrivateHeaps);

    DebugManager.flags.EnableImmediateCmdListHeapSharing.set(heapSharingValue);
    DebugManager.flags.SelectCmdListHeapAddressModel.set(heapAddressModel);

    ImmediateFlushTaskCmdListFixture::setUp();
}
// Sets EnableImmediateCmdListHeapSharing to 0 on the PrivateHeaps address model,
// then runs the immediate flush task fixture setup.
void ImmediateFlushTaskPrivateHeapCmdListFixture::setUp() {
    constexpr int32_t heapSharingValue = 0;
    const auto heapAddressModel = static_cast<int32_t>(NEO::HeapAddressModel::PrivateHeaps);

    DebugManager.flags.EnableImmediateCmdListHeapSharing.set(heapSharingValue);
    DebugManager.flags.SelectCmdListHeapAddressModel.set(heapAddressModel);

    ImmediateFlushTaskCmdListFixture::setUp();
}
} // namespace ult
} // namespace L0

View File

@ -75,14 +75,19 @@ struct ModuleMutableCommandListFixture : public ModuleImmutableDataFixture {
void tearDown();
void setUpImpl();
uint32_t getMocs(bool l3On);
std::unique_ptr<MockImmutableData> mockKernelImmData;
std::unique_ptr<L0::ult::CommandList> commandList;
std::unique_ptr<L0::ult::CommandList> commandListImmediate;
std::unique_ptr<ModuleImmutableDataFixture::MockKernel> kernel;
std::unique_ptr<VariableBackup<HardwareInfo>> backupHwInfo;
L0::ult::CommandQueue *commandQueue;
size_t expectedSbaCmds = 0;
NEO::EngineGroupType engineGroupType;
bool dshRequired = false;
DebugManagerStateRestore restorer;
};
@ -135,10 +140,6 @@ struct CmdListLargeGrfFixture : public CmdListStateComputeModeStateFixture {
// Fixture for state-base-address command list tests, layered on ModuleMutableCommandListFixture.
// NOTE(review): getMocs, expectedSbaCmds and dshRequired also appear on
// ModuleMutableCommandListFixture in this change; if both sets of declarations are live,
// the ones below hide the base members — confirm whether they are leftovers.
struct CommandListStateBaseAddressFixture : public ModuleMutableCommandListFixture {
void setUp();
// Returns device MOCS for the given L3 setting, shifted right by one bit.
uint32_t getMocs(bool l3On);
// Number of STATE_BASE_ADDRESS commands tests expect per programming.
size_t expectedSbaCmds = 0;
// Set by setUp from the device's image support.
bool dshRequired = false;
};
struct CommandListPrivateHeapsFixture : public CommandListStateBaseAddressFixture {
@ -308,5 +309,17 @@ struct ImmediateFlushTaskCmdListFixture : public ModuleMutableCommandListFixture
void setUp();
};
// Immediate flush task fixture whose setUp selects the GlobalStateless heap address model.
struct ImmediateFlushTaskGlobalStatelessCmdListFixture : public ImmediateFlushTaskCmdListFixture {
void setUp();
};
// Immediate flush task fixture whose setUp enables immediate command list heap sharing
// with the PrivateHeaps address model.
struct ImmediateFlushTaskCsrSharedHeapCmdListFixture : public ImmediateFlushTaskCmdListFixture {
void setUp();
};
// Immediate flush task fixture whose setUp sets the heap sharing flag to 0
// with the PrivateHeaps address model.
struct ImmediateFlushTaskPrivateHeapCmdListFixture : public ImmediateFlushTaskCmdListFixture {
void setUp();
};
} // namespace ult
} // namespace L0

View File

@ -993,6 +993,8 @@ TEST_F(CommandListCreate, whenCreatingImmCmdListWithSyncModeAndAppendBarrierThen
}
HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingThenPassStallingCmdsInfo, IsAtLeastXeHpcCore) {
bool useImmediateFlushTask = getHelper<L0GfxCoreHelper>().platformSupportsImmediateComputeFlushTask();
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
ze_result_t returnValue;
@ -1028,9 +1030,13 @@ HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingTh
auto ultCsr = static_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(whiteBoxCmdList->csr);
ultCsr->recordFlusheBatchBuffer = true;
auto verifyFlags = [&ultCsr](ze_result_t result, bool dispatchFlag, bool bbFlag) {
auto verifyFlags = [&ultCsr, useImmediateFlushTask](ze_result_t result, bool dispatchFlag, bool bbFlag) {
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(ultCsr->recordedDispatchFlags.hasStallingCmds, dispatchFlag);
if (useImmediateFlushTask) {
EXPECT_EQ(ultCsr->recordedImmediateDispatchFlags.hasStallingCmds, dispatchFlag);
} else {
EXPECT_EQ(ultCsr->recordedDispatchFlags.hasStallingCmds, dispatchFlag);
}
EXPECT_EQ(ultCsr->latestFlushedBatchBuffer.hasStallingCmds, bbFlag);
};
// non-pipelined state
@ -1087,6 +1093,8 @@ HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingTh
}
HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingDisabledRelaxedOrderingThenPassStallingCmdsInfo, IsAtLeastXeHpcCore) {
bool useImmediateFlushTask = getHelper<L0GfxCoreHelper>().platformSupportsImmediateComputeFlushTask();
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
ze_result_t returnValue;
@ -1125,14 +1133,22 @@ HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingDi
EXPECT_FALSE(NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*ultCsr, 1));
auto verifyFlags = [&ultCsr](ze_result_t result) {
auto verifyFlags = [&ultCsr, useImmediateFlushTask](ze_result_t result) {
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_TRUE(ultCsr->recordedDispatchFlags.hasStallingCmds);
if (useImmediateFlushTask) {
EXPECT_TRUE(ultCsr->recordedImmediateDispatchFlags.hasStallingCmds);
} else {
EXPECT_TRUE(ultCsr->recordedDispatchFlags.hasStallingCmds);
}
EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasStallingCmds);
};
auto resetFlags = [&ultCsr]() {
ultCsr->recordedDispatchFlags.hasStallingCmds = false;
auto resetFlags = [&ultCsr, useImmediateFlushTask]() {
if (useImmediateFlushTask) {
ultCsr->recordedImmediateDispatchFlags.hasStallingCmds = false;
} else {
ultCsr->recordedDispatchFlags.hasStallingCmds = false;
}
ultCsr->latestFlushedBatchBuffer.hasStallingCmds = false;
};
@ -1329,6 +1345,8 @@ HWTEST_F(CommandListCreate, givenDebugFlagSetWhenCallingSynchronizeThenDontUnreg
}
HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingThenPassRelaxedOrderingDependenciesInfo, IsAtLeastXeHpcCore) {
bool useImmediateFlushTask = getHelper<L0GfxCoreHelper>().platformSupportsImmediateComputeFlushTask();
DebugManagerStateRestore restore;
DebugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
@ -1372,9 +1390,13 @@ HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingTh
ultCsr->registerClient();
ultCsr->registerClient();
auto verifyFlags = [&ultCsr](ze_result_t result, bool dispatchFlag, bool bbFlag) {
auto verifyFlags = [&ultCsr, useImmediateFlushTask](ze_result_t result, bool dispatchFlag, bool bbFlag) {
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(ultCsr->recordedDispatchFlags.hasRelaxedOrderingDependencies, dispatchFlag);
if (useImmediateFlushTask) {
EXPECT_EQ(ultCsr->recordedImmediateDispatchFlags.hasRelaxedOrderingDependencies, dispatchFlag);
} else {
EXPECT_EQ(ultCsr->recordedDispatchFlags.hasRelaxedOrderingDependencies, dispatchFlag);
}
EXPECT_EQ(ultCsr->latestFlushedBatchBuffer.hasRelaxedOrderingDependencies, bbFlag);
};
@ -1450,6 +1472,8 @@ HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingTh
}
HWTEST2_F(CommandListCreate, givenInOrderExecutionWhenDispatchingRelaxedOrderingWithoutInputEventsThenCountPreviousEventAsWaitlist, IsAtLeastXeHpcCore) {
bool useImmediateFlushTask = getHelper<L0GfxCoreHelper>().platformSupportsImmediateComputeFlushTask();
DebugManagerStateRestore restore;
DebugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
@ -1491,7 +1515,11 @@ HWTEST2_F(CommandListCreate, givenInOrderExecutionWhenDispatchingRelaxedOrdering
commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, event, 0, nullptr, launchParams, false);
commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_TRUE(ultCsr->recordedDispatchFlags.hasRelaxedOrderingDependencies);
if (useImmediateFlushTask) {
EXPECT_TRUE(ultCsr->recordedImmediateDispatchFlags.hasRelaxedOrderingDependencies);
} else {
EXPECT_TRUE(ultCsr->recordedDispatchFlags.hasRelaxedOrderingDependencies);
}
EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasRelaxedOrderingDependencies);
}

View File

@ -64,6 +64,9 @@ HWTEST2_F(MultiTileCopyEngineCommandListTest, GivenMultiTileDeviceWhenCreatingCo
using CommandListExecuteImmediate = Test<DeviceFixture>;
HWTEST2_F(CommandListExecuteImmediate, whenExecutingCommandListImmediateWithFlushTaskThenRequiredStreamStateIsCorrectlyReported, IsAtLeastSkl) {
DebugManagerStateRestore restorer;
DebugManager.flags.UseImmediateFlushTask.set(0);
auto &gfxCoreHelper = device->getGfxCoreHelper();
auto &productHelper = device->getProductHelper();

View File

@ -6,6 +6,7 @@
*/
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/scratch_space_controller_base.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/gmm_helper/gmm_lib.h"
#include "shared/source/helpers/definitions/command_encoder_args.h"
@ -1888,16 +1889,403 @@ HWTEST2_F(RayTracingCmdListTest,
ultCsr->isMadeResident(rtAllocation, residentCount);
}
using ImmediateFlushTaskCmdListTests = Test<ImmediateFlushTaskCmdListFixture>;
using ImmediateFlushTaskGlobalStatelessCmdListTest = Test<ImmediateFlushTaskGlobalStatelessCmdListFixture>;
// Global stateless heap model with immediate flush task:
// the first kernel append must program STATE_BASE_ADDRESS in the CSR stream (expectedSbaCmds
// times) with the global stateless heap as surface state base, and make the used heaps resident;
// a second identical append must not program STATE_BASE_ADDRESS again.
//
// The stale header of the former "unsupported error" test and its
// ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY expectation were removed: the header produced a nested,
// malformed HWTEST2_F invocation and the expectation contradicted the success check below.
HWTEST2_F(ImmediateFlushTaskGlobalStatelessCmdListTest,
          givenImmediateFlushOnGlobalStatelessWhenAppendingKernelThenExpectStateBaseAddressCommandDispatchedOnce,
          IsAtLeastXeHpCore) {
    using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;

    auto &csrImmediate = neoDevice->getUltCommandStreamReceiver<FamilyType>();
    // Presumably required for isMadeResident() below to see the allocations — verify against the ULT CSR.
    csrImmediate.storeMakeResidentAllocations = true;
    auto &csrStream = csrImmediate.commandStream;

    ze_group_count_t groupCount{1, 1, 1};
    CmdListKernelLaunchParams launchParams = {};

    // First append: capture the CSR stream range written by this submission.
    size_t csrUsedBefore = csrStream.getUsed();
    auto result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    size_t csrUsedAfter = csrStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto globalSurfaceHeap = commandListImmediate->csr->getGlobalStatelessHeap();
    auto ioHeap = commandListImmediate->getCmdContainer().getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
    auto ioBaseAddress = neoDevice->getGmmHelper()->decanonize(ioHeap->getHeapGpuBase());
    auto ssBaseAddress = globalSurfaceHeap->getHeapGpuBase();

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList,
        ptrOffset(csrStream.getCpuBase(), csrUsedBefore),
        csrUsedAfter - csrUsedBefore));
    auto sbaCmds = findAll<STATE_BASE_ADDRESS *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(expectedSbaCmds, sbaCmds.size());

    auto sbaCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(*sbaCmds[0]);

    EXPECT_TRUE(sbaCmd->getSurfaceStateBaseAddressModifyEnable());
    EXPECT_EQ(ssBaseAddress, sbaCmd->getSurfaceStateBaseAddress());
    EXPECT_EQ(ioBaseAddress, sbaCmd->getGeneralStateBaseAddress());

    EXPECT_TRUE(csrImmediate.isMadeResident(ioHeap->getGraphicsAllocation()));
    EXPECT_TRUE(csrImmediate.isMadeResident(globalSurfaceHeap->getGraphicsAllocation()));

    // Second append with unchanged state: no STATE_BASE_ADDRESS expected.
    csrUsedBefore = csrStream.getUsed();
    result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    csrUsedAfter = csrStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    cmdList.clear();
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList,
        ptrOffset(csrStream.getCpuBase(), csrUsedBefore),
        csrUsedAfter - csrUsedBefore));
    sbaCmds = findAll<STATE_BASE_ADDRESS *>(cmdList.begin(), cmdList.end());
    EXPECT_EQ(0u, sbaCmds.size());
}
// Global stateless heap model with immediate flush task:
// the first append programs STATE_BASE_ADDRESS with the MOCS from getMocs(true); after the kernel's
// uncached-MOCS counter is bumped, the second append must program STATE_BASE_ADDRESS again
// with the MOCS from getMocs(false).
HWTEST2_F(ImmediateFlushTaskGlobalStatelessCmdListTest,
          givenImmediateFlushOnGlobalStatelessWhenAppendingSecondKernelWithChangedMocsThenExpectStateBaseAddressCommandDispatchedTwiceWithChangedMocs,
          IsAtLeastXeHpCore) {
    using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;

    auto &ultCsr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
    auto &csrCommandStream = ultCsr.commandStream;

    CmdListKernelLaunchParams launchParams = {};
    ze_group_count_t groupCount{1, 1, 1};

    // First launch.
    size_t streamStart = csrCommandStream.getUsed();
    auto result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    size_t streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto globalSurfaceHeap = commandListImmediate->csr->getGlobalStatelessHeap();
    auto ssBaseAddress = globalSurfaceHeap->getHeapGpuBase();

    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    auto sbaCmds = findAll<STATE_BASE_ADDRESS *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(expectedSbaCmds, sbaCmds.size());

    auto sbaCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(*sbaCmds[0]);

    EXPECT_TRUE(sbaCmd->getSurfaceStateBaseAddressModifyEnable());
    EXPECT_EQ(ssBaseAddress, sbaCmd->getSurfaceStateBaseAddress());

    // getMocs() returns the value shifted right by one; shift back before comparing.
    uint32_t cachedStatelessMocs = getMocs(true);
    EXPECT_EQ((cachedStatelessMocs << 1), sbaCmd->getStatelessDataPortAccessMemoryObjectControlState());

    // Presumably makes the kernel report that it requires uncached MOCS — confirm against KernelImp.
    kernel->kernelRequiresUncachedMocsCount++;

    // Second launch.
    streamStart = csrCommandStream.getUsed();
    result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    parsedCommands.clear();
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    sbaCmds = findAll<STATE_BASE_ADDRESS *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(expectedSbaCmds, sbaCmds.size());

    sbaCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(*sbaCmds[0]);

    uint32_t uncachedStatelessMocs = getMocs(false);
    EXPECT_EQ((uncachedStatelessMocs << 1), sbaCmd->getStatelessDataPortAccessMemoryObjectControlState());
}
using ImmediateFlushTaskCsrSharedHeapCmdListTest = Test<ImmediateFlushTaskCsrSharedHeapCmdListFixture>;
// Shared (reserved) heaps with immediate flush task:
// the first kernel append programs STATE_BASE_ADDRESS with the reserved SSH (and DSH when
// dshRequired) and makes the used heaps resident; a second identical append programs none.
HWTEST2_F(ImmediateFlushTaskCsrSharedHeapCmdListTest,
          givenImmediateFlushOnCsrSharedHeapsWhenAppendingKernelThenExpectStateBaseAddressCommandDispatchedOnce,
          IsAtLeastXeHpCore) {
    using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;

    auto &ultCsr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
    // Presumably required for isMadeResident() below to see the allocations — verify against the ULT CSR.
    ultCsr.storeMakeResidentAllocations = true;
    auto &csrCommandStream = ultCsr.commandStream;

    CmdListKernelLaunchParams launchParams = {};
    ze_group_count_t groupCount{1, 1, 1};

    // First launch.
    size_t streamStart = csrCommandStream.getUsed();
    auto result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    size_t streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto ioHeap = commandListImmediate->getCmdContainer().getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
    auto ioBaseAddress = neoDevice->getGmmHelper()->decanonize(ioHeap->getHeapGpuBase());

    auto ssHeap = commandListImmediate->getCmdContainer().getSurfaceStateHeapReserve().indirectHeapReservation;
    auto ssBaseAddress = ssHeap->getHeapGpuBase();

    // Stays zero when no dynamic state heap is needed.
    uint64_t dsBaseAddress = 0;
    if (dshRequired) {
        auto reservedDsHeap = commandListImmediate->getCmdContainer().getDynamicStateHeapReserve().indirectHeapReservation;
        dsBaseAddress = reservedDsHeap->getHeapGpuBase();
    }

    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    auto sbaCmds = findAll<STATE_BASE_ADDRESS *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(expectedSbaCmds, sbaCmds.size());

    auto sbaCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(*sbaCmds[0]);

    EXPECT_TRUE(sbaCmd->getSurfaceStateBaseAddressModifyEnable());
    EXPECT_EQ(ssBaseAddress, sbaCmd->getSurfaceStateBaseAddress());

    EXPECT_EQ(dshRequired, sbaCmd->getDynamicStateBaseAddressModifyEnable());
    EXPECT_EQ(dsBaseAddress, sbaCmd->getDynamicStateBaseAddress());

    EXPECT_EQ(ioBaseAddress, sbaCmd->getGeneralStateBaseAddress());

    EXPECT_TRUE(ultCsr.isMadeResident(ioHeap->getGraphicsAllocation()));
    EXPECT_TRUE(ultCsr.isMadeResident(ssHeap->getGraphicsAllocation()));
    if (dshRequired) {
        auto reservedDsHeap = commandListImmediate->getCmdContainer().getDynamicStateHeapReserve().indirectHeapReservation;
        EXPECT_TRUE(ultCsr.isMadeResident(reservedDsHeap->getGraphicsAllocation()));
    }

    // Second launch with unchanged state: no STATE_BASE_ADDRESS expected.
    streamStart = csrCommandStream.getUsed();
    result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    parsedCommands.clear();
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    sbaCmds = findAll<STATE_BASE_ADDRESS *>(parsedCommands.begin(), parsedCommands.end());
    EXPECT_EQ(0u, sbaCmds.size());
}
// Shared (reserved) heaps with immediate flush task:
// the first append programs STATE_BASE_ADDRESS with the MOCS from getMocs(true); after the kernel's
// uncached-MOCS counter is bumped, the second append must program STATE_BASE_ADDRESS again
// with the MOCS from getMocs(false).
HWTEST2_F(ImmediateFlushTaskCsrSharedHeapCmdListTest,
          givenImmediateFlushOnCsrSharedHeapsWhenAppendingSecondKernelWithChangedMocsThenExpectStateBaseAddressCommandDispatchedTwiceWithChangedMocs,
          IsAtLeastXeHpCore) {
    using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;

    auto &ultCsr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
    auto &csrCommandStream = ultCsr.commandStream;

    CmdListKernelLaunchParams launchParams = {};
    ze_group_count_t groupCount{1, 1, 1};

    // First launch.
    size_t streamStart = csrCommandStream.getUsed();
    auto result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    size_t streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto ssHeap = commandListImmediate->getCmdContainer().getSurfaceStateHeapReserve().indirectHeapReservation;
    auto ssBaseAddress = ssHeap->getHeapGpuBase();

    // Stays zero when no dynamic state heap is needed.
    uint64_t dsBaseAddress = 0;
    if (dshRequired) {
        auto reservedDsHeap = commandListImmediate->getCmdContainer().getDynamicStateHeapReserve().indirectHeapReservation;
        dsBaseAddress = reservedDsHeap->getHeapGpuBase();
    }

    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    auto sbaCmds = findAll<STATE_BASE_ADDRESS *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(expectedSbaCmds, sbaCmds.size());

    auto sbaCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(*sbaCmds[0]);

    EXPECT_TRUE(sbaCmd->getSurfaceStateBaseAddressModifyEnable());
    EXPECT_EQ(ssBaseAddress, sbaCmd->getSurfaceStateBaseAddress());

    EXPECT_EQ(dshRequired, sbaCmd->getDynamicStateBaseAddressModifyEnable());
    EXPECT_EQ(dsBaseAddress, sbaCmd->getDynamicStateBaseAddress());

    // getMocs() returns the value shifted right by one; shift back before comparing.
    uint32_t cachedStatelessMocs = getMocs(true);
    EXPECT_EQ((cachedStatelessMocs << 1), sbaCmd->getStatelessDataPortAccessMemoryObjectControlState());

    // Presumably makes the kernel report that it requires uncached MOCS — confirm against KernelImp.
    kernel->kernelRequiresUncachedMocsCount++;

    // Second launch.
    streamStart = csrCommandStream.getUsed();
    result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    parsedCommands.clear();
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    sbaCmds = findAll<STATE_BASE_ADDRESS *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(expectedSbaCmds, sbaCmds.size());

    sbaCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(*sbaCmds[0]);

    uint32_t uncachedStatelessMocs = getMocs(false);
    EXPECT_EQ((uncachedStatelessMocs << 1), sbaCmd->getStatelessDataPortAccessMemoryObjectControlState());
}
// Shared (reserved) heaps with immediate flush task:
// the first launch needs no scratch, so CFE_STATE carries a zero scratch buffer and no scratch
// allocation exists; after per-thread scratch is requested, the second launch must program a
// CFE_STATE pointing at the scratch surface state, allocate scratch and make it resident.
HWTEST2_F(ImmediateFlushTaskCsrSharedHeapCmdListTest,
          givenImmediateFlushOnCsrSharedHeapsWhenAppendingSecondKernelWithScratchThenExpectScratchStateAndAllocation,
          IsAtLeastXeHpCore) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using CFE_STATE = typename FamilyType::CFE_STATE;

    auto &ultCsr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
    // Presumably required for isMadeResident() below to see the allocation — verify against the ULT CSR.
    ultCsr.storeMakeResidentAllocations = true;
    auto &csrCommandStream = ultCsr.commandStream;

    CmdListKernelLaunchParams launchParams = {};
    ze_group_count_t groupCount{1, 1, 1};

    // First launch: no scratch requested.
    size_t streamStart = csrCommandStream.getUsed();
    auto result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    size_t streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    auto frontEndCommands = findAll<CFE_STATE *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(1u, frontEndCommands.size());

    auto frontEndCmd = reinterpret_cast<CFE_STATE *>(*frontEndCommands[0]);
    EXPECT_EQ(0u, frontEndCmd->getScratchSpaceBuffer());

    EXPECT_EQ(nullptr, ultCsr.getScratchSpaceController()->getScratchSpaceAllocation());

    // Request per-thread scratch for the next launch.
    mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x100;

    // Second launch: scratch required.
    streamStart = csrCommandStream.getUsed();
    result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    parsedCommands.clear();
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    frontEndCommands = findAll<CFE_STATE *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(1u, frontEndCommands.size());

    frontEndCmd = reinterpret_cast<CFE_STATE *>(*frontEndCommands[0]);
    // The scratch surface state is expected two RENDER_SURFACE_STATE slots into the surface state heap.
    constexpr size_t expectedScratchOffset = 2 * sizeof(RENDER_SURFACE_STATE);
    EXPECT_EQ(expectedScratchOffset, frontEndCmd->getScratchSpaceBuffer());

    auto scratchAllocation = ultCsr.getScratchSpaceController()->getScratchSpaceAllocation();
    ASSERT_NE(nullptr, scratchAllocation);

    EXPECT_TRUE(ultCsr.isMadeResident(scratchAllocation));

    // Inspect the surface state written at the expected offset of the reserved SSH.
    auto ssHeap = commandListImmediate->getCmdContainer().getSurfaceStateHeapReserve().indirectHeapReservation;
    void *scratchSurfaceStateMemory = ptrOffset(ssHeap->getCpuBase(), expectedScratchOffset);

    auto scratchSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(scratchSurfaceStateMemory);
    EXPECT_EQ(RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_SCRATCH, scratchSurfaceState->getSurfaceType());
    EXPECT_EQ(scratchAllocation->getGpuAddress(), scratchSurfaceState->getSurfaceBaseAddress());
}
// Shared (reserved) heaps with immediate flush task:
// appending only a barrier must leave the reserved SSH without a graphics allocation, and the
// programmed STATE_BASE_ADDRESS must not enable or set a surface state base address.
HWTEST2_F(ImmediateFlushTaskCsrSharedHeapCmdListTest,
          givenImmediateFlushOnCsrSharedHeapsWhenAppendingBarrierThenNoSurfaceHeapAllocated,
          IsAtLeastXeHpCore) {
    using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;

    auto &ultCsr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
    auto &csrCommandStream = ultCsr.commandStream;

    size_t streamStart = csrCommandStream.getUsed();
    auto result = commandListImmediate->appendBarrier(nullptr, 0, nullptr);
    size_t streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto reservedSsHeap = commandListImmediate->getCmdContainer().getSurfaceStateHeapReserve().indirectHeapReservation;
    EXPECT_EQ(nullptr, reservedSsHeap->getGraphicsAllocation());

    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    auto sbaCmds = findAll<STATE_BASE_ADDRESS *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(expectedSbaCmds, sbaCmds.size());

    auto sbaCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(*sbaCmds[0]);

    EXPECT_FALSE(sbaCmd->getSurfaceStateBaseAddressModifyEnable());
    EXPECT_EQ(0u, sbaCmd->getSurfaceStateBaseAddress());
}
using ImmediateFlushTaskPrivateHeapCmdListTest = Test<ImmediateFlushTaskPrivateHeapCmdListFixture>;
// Private heaps with immediate flush task:
// the first kernel append programs STATE_BASE_ADDRESS with the container's SSH (and DSH when
// dshRequired) and makes the used heaps resident; a second identical append programs none.
HWTEST2_F(ImmediateFlushTaskPrivateHeapCmdListTest,
          givenImmediateFlushOnPrivateHeapsWhenAppendingKernelThenExpectStateBaseAddressCommandDispatchedOnce,
          IsAtLeastXeHpCore) {
    using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;

    auto &ultCsr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
    // Presumably required for isMadeResident() below to see the allocations — verify against the ULT CSR.
    ultCsr.storeMakeResidentAllocations = true;
    auto &csrCommandStream = ultCsr.commandStream;

    CmdListKernelLaunchParams launchParams = {};
    ze_group_count_t groupCount{1, 1, 1};

    // First launch.
    size_t streamStart = csrCommandStream.getUsed();
    auto result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    size_t streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto ioHeap = commandListImmediate->getCmdContainer().getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
    auto ioBaseAddress = neoDevice->getGmmHelper()->decanonize(ioHeap->getHeapGpuBase());

    auto ssHeap = commandListImmediate->getCmdContainer().getIndirectHeap(NEO::HeapType::SURFACE_STATE);
    auto ssBaseAddress = ssHeap->getHeapGpuBase();

    // Stays zero when no dynamic state heap is needed.
    uint64_t dsBaseAddress = 0;
    if (dshRequired) {
        auto privateDsHeap = commandListImmediate->getCmdContainer().getIndirectHeap(NEO::HeapType::DYNAMIC_STATE);
        dsBaseAddress = privateDsHeap->getHeapGpuBase();
    }

    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    auto sbaCmds = findAll<STATE_BASE_ADDRESS *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_EQ(expectedSbaCmds, sbaCmds.size());

    auto sbaCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(*sbaCmds[0]);

    EXPECT_TRUE(sbaCmd->getSurfaceStateBaseAddressModifyEnable());
    EXPECT_EQ(ssBaseAddress, sbaCmd->getSurfaceStateBaseAddress());

    EXPECT_EQ(dshRequired, sbaCmd->getDynamicStateBaseAddressModifyEnable());
    EXPECT_EQ(dsBaseAddress, sbaCmd->getDynamicStateBaseAddress());

    EXPECT_EQ(ioBaseAddress, sbaCmd->getGeneralStateBaseAddress());

    EXPECT_TRUE(ultCsr.isMadeResident(ioHeap->getGraphicsAllocation()));
    EXPECT_TRUE(ultCsr.isMadeResident(ssHeap->getGraphicsAllocation()));
    if (dshRequired) {
        auto privateDsHeap = commandListImmediate->getCmdContainer().getIndirectHeap(NEO::HeapType::DYNAMIC_STATE);
        EXPECT_TRUE(ultCsr.isMadeResident(privateDsHeap->getGraphicsAllocation()));
    }

    // Second launch with unchanged state: no STATE_BASE_ADDRESS expected.
    streamStart = csrCommandStream.getUsed();
    result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    streamEnd = csrCommandStream.getUsed();
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    parsedCommands.clear();
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        parsedCommands,
        ptrOffset(csrCommandStream.getCpuBase(), streamStart),
        streamEnd - streamStart));
    sbaCmds = findAll<STATE_BASE_ADDRESS *>(parsedCommands.begin(), parsedCommands.end());
    EXPECT_EQ(0u, sbaCmds.size());
}
} // namespace ult

View File

@ -189,6 +189,15 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
return BaseClass::flushTask(commandStream, commandStreamStart, dsh, ioh, ssh, taskLevel, dispatchFlags, device);
}
// Test override: stores a copy of the dispatch flags and a last-flushed-stream pointer for later
// inspection by tests, then forwards the call unchanged to the base implementation.
CompletionStamp flushImmediateTask(LinearStream &immediateCommandStream,
                                   size_t immediateCommandStreamStart,
                                   ImmediateDispatchFlags &dispatchFlags,
                                   Device &device) override {
    // NOTE(review): this stores the address of `commandStream`, not of the
    // `immediateCommandStream` argument — confirm this is intended.
    this->lastFlushedCommandStream = &commandStream;
    this->recordedImmediateDispatchFlags = dispatchFlags;

    return BaseClass::flushImmediateTask(immediateCommandStream,
                                         immediateCommandStreamStart,
                                         dispatchFlags,
                                         device);
}
// Returns the base class preferred tag pool size increased by one.
size_t getPreferredTagPoolSize() const override {
    const size_t basePoolSize = BaseClass::getPreferredTagPoolSize();
    return basePoolSize + 1;
}
@ -431,6 +440,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
mutable uint32_t checkGpuHangDetectedCalled = 0;
int ensureCommandBufferAllocationCalled = 0;
DispatchFlags recordedDispatchFlags;
ImmediateDispatchFlags recordedImmediateDispatchFlags = {};
BlitPropertiesContainer receivedBlitProperties = {};
uint32_t createAllocationForHostSurfaceCalled = 0;
WaitStatus returnWaitForCompletionWithTimeout = WaitStatus::Ready;