feature: improve patching of scratch inline pointer

Related-To: NEO-10381

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2024-03-28 13:13:07 +00:00
committed by Compute-Runtime-Automation
parent b8c5b15b65
commit a468827c12
11 changed files with 242 additions and 14 deletions

View File

@@ -199,6 +199,15 @@ struct CommandList : _ze_command_list_handle_t {
commandListPerThreadScratchSize[slotId] = size;
}
uint32_t getCommandListPatchedPerThreadScratchSize(uint32_t slotId) const {
return commandListPatchedPerThreadScratchSize[slotId];
}
// Stores the per-thread scratch size that has been patched for the given slot.
// Only slot ids 0 and 1 are accepted; any other id trips UNRECOVERABLE_IF.
void setCommandListPatchedPerThreadScratchSize(uint32_t slotId, uint32_t size) {
    UNRECOVERABLE_IF(slotId >= 2u);
    auto &patchedSlotSize = commandListPatchedPerThreadScratchSize[slotId];
    patchedSlotSize = size;
}
uint32_t getCommandListSLMEnable() const {
return commandListSLMEnabled;
}
@@ -404,6 +413,9 @@ struct CommandList : _ze_command_list_handle_t {
size_t cmdListCurrentStartOffset = 0;
size_t maxFillPaternSizeForCopyEngine = 0;
uint32_t commandListPerThreadScratchSize[2]{};
uint32_t commandListPatchedPerThreadScratchSize[2]{};
ze_command_list_flags_t flags = 0u;
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;
NEO::EngineGroupType engineGroupType = NEO::EngineGroupType::maxEngineGroups;
@@ -411,7 +423,6 @@ struct CommandList : _ze_command_list_handle_t {
std::optional<uint32_t> ordinal = std::nullopt;
CommandListType cmdListType = CommandListType::typeRegular;
uint32_t commandListPerThreadScratchSize[2]{};
uint32_t partitionCount = 1;
uint32_t defaultMocsIndex = 0;

View File

@@ -360,7 +360,7 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
cmdQ->makeResidentAndMigrate(performMigration, this->commandContainer.getResidencyContainer());
static_cast<CommandQueueHw<gfxCoreFamily> *>(this->cmdQImmediate)->patchCommands(*this, 0u);
static_cast<CommandQueueHw<gfxCoreFamily> *>(this->cmdQImmediate)->patchCommands(*this, 0u, 0, 0);
if (performMigration) {
this->migrateSharedAllocations();

View File

@@ -57,7 +57,9 @@ struct CommandQueueHw : public CommandQueueImp {
uint32_t perThreadScratchSpaceSlot1Size);
bool getPreemptionCmdProgramming() override;
void patchCommands(CommandList &commandList, uint64_t scratchAddress);
void patchCommands(CommandList &commandList, uint64_t scratchAddress,
uint32_t perThreadScratchSpaceSlot0Size,
uint32_t perThreadScratchSpaceSlot1Size);
protected:
struct CommandListExecutionContext {

View File

@@ -1705,7 +1705,9 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, Comm
if (this->heaplessModeEnabled && this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) {
scratchAddress += ctx.globalStatelessAllocation->getGpuAddress();
}
patchCommands(commandList, scratchAddress);
patchCommands(commandList, scratchAddress,
ctx.scratchSpaceController->getPerThreadScratchSpaceSizeSlot0(),
ctx.scratchSpaceController->getPerThreadScratchSizeSlot1());
}
} // namespace L0

View File

@@ -133,7 +133,9 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::HeapContainer &heapC
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress) {
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress,
uint32_t perThreadScratchSpaceSlot0Size,
uint32_t perThreadScratchSpaceSlot1Size) {
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

View File

@@ -163,11 +163,20 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::HeapContainer &sshHe
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress) {
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress,
uint32_t perThreadScratchSpaceSlot0Size,
uint32_t perThreadScratchSpaceSlot1Size) {
using CFE_STATE = typename GfxFamily::CFE_STATE;
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
bool patchNewInlineScratchAddress = false;
if (this->heaplessModeEnabled &&
(commandList.getCommandListPatchedPerThreadScratchSize(0) < perThreadScratchSpaceSlot0Size ||
commandList.getCommandListPatchedPerThreadScratchSize(1) < perThreadScratchSpaceSlot1Size)) {
patchNewInlineScratchAddress = true;
}
auto &commandsToPatch = commandList.getCommandsToPatch();
for (auto &commandToPatch : commandsToPatch) {
switch (commandToPatch.type) {
@@ -229,6 +238,9 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
break;
}
case CommandToPatch::ComputeWalkerInlineDataScratch: {
if (!patchNewInlineScratchAddress) {
continue;
}
uint64_t fullScratchAddress = scratchAddress + commandToPatch.baseAddress;
void *scratchAddressPatch = ptrOffset(commandToPatch.pDestination, commandToPatch.offset);
std::memcpy(scratchAddressPatch, &fullScratchAddress, commandToPatch.patchSize);
@@ -238,6 +250,11 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
UNRECOVERABLE_IF(true);
}
}
if (patchNewInlineScratchAddress) {
commandList.setCommandListPatchedPerThreadScratchSize(0, perThreadScratchSpaceSlot0Size);
commandList.setCommandListPatchedPerThreadScratchSize(1, perThreadScratchSpaceSlot1Size);
}
}
} // namespace L0

View File

@@ -598,7 +598,7 @@ void CommandListScratchPatchFixtureInit::setUpParams(int32_t globalStatelessMode
commandQueue->heaplessStateInitEnabled = !!heaplessStateInitEnabled;
mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x40;
mockKernelImmData->kernelDescriptor->payloadMappings.implicitArgs.scratchPointerAddress.pointerSize = 0x8;
mockKernelImmData->kernelDescriptor->payloadMappings.implicitArgs.scratchPointerAddress.pointerSize = scratchInlinePointerSize;
mockKernelImmData->kernelDescriptor->payloadMappings.implicitArgs.scratchPointerAddress.offset = scratchInlineOffset;
}

View File

@@ -371,8 +371,15 @@ struct CommandListScratchPatchFixtureInit : public ModuleMutableCommandListFixtu
template <typename FamilyType>
void testScratchInline(bool useImmediate);
template <typename FamilyType>
void testScratchGrowingPatching();
template <typename FamilyType>
void testScratchSameNotPatching();
int32_t fixtureGlobalStatelessMode = 0;
uint32_t scratchInlineOffset = 8;
uint32_t scratchInlinePointerSize = sizeof(uint64_t);
};
template <int32_t globalStatelessMode, int32_t heaplessStateInitEnabled>

View File

@@ -1493,5 +1493,152 @@ void CommandListScratchPatchFixtureInit::testScratchInline(bool useImmediate) {
EXPECT_TRUE(scratchInResidency);
}
template <typename FamilyType>
void CommandListScratchPatchFixtureInit::testScratchGrowingPatching() {
auto csr = device->getNEODevice()->getDefaultEngine().commandStreamReceiver;
auto scratchController = csr->getScratchSpaceController();
NEO::EncodeDispatchKernelArgs dispatchKernelArgs = {};
dispatchKernelArgs.isHeaplessModeEnabled = true;
size_t inlineOffset = NEO::EncodeDispatchKernel<FamilyType>::getInlineDataOffset(dispatchKernelArgs);
uint64_t surfaceHeapGpuBase = getSurfStateGpuBase(false);
auto cmdListStream = commandList->commandContainer.getCommandStream();
const ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
auto result = ZE_RESULT_SUCCESS;
size_t usedBefore = cmdListStream->getUsed();
result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
size_t usedAfter = cmdListStream->getUsed();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList,
ptrOffset(cmdListStream->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto walkerIterator = NEO::UnitTestHelper<FamilyType>::findWalkerCmd(cmdList.begin(), cmdList.end(), true);
ASSERT_NE(cmdList.end(), walkerIterator);
void *walkerPtrWithScratch = *walkerIterator;
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto commandListHandle = commandList->toHandle();
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto scratchAddress = scratchController->getScratchPatchAddress();
auto fullScratchAddress = surfaceHeapGpuBase + scratchAddress;
uint64_t scratchInlineValue = 0;
void *scratchInlinePtr = ptrOffset(walkerPtrWithScratch, (inlineOffset + scratchInlineOffset));
std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue));
EXPECT_EQ(fullScratchAddress, scratchInlineValue);
commandList->reset();
mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[1] = 0x40;
usedBefore = cmdListStream->getUsed();
result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
usedAfter = cmdListStream->getUsed();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
cmdList.clear();
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList,
ptrOffset(cmdListStream->getCpuBase(), usedBefore),
usedAfter - usedBefore));
walkerIterator = NEO::UnitTestHelper<FamilyType>::findWalkerCmd(cmdList.begin(), cmdList.end(), true);
ASSERT_NE(cmdList.end(), walkerIterator);
void *walkerPtrWithSlot1Scratch = *walkerIterator;
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
scratchAddress = scratchController->getScratchPatchAddress();
auto fullScratchSlot1Address = surfaceHeapGpuBase + scratchAddress;
scratchInlinePtr = ptrOffset(walkerPtrWithSlot1Scratch, (inlineOffset + scratchInlineOffset));
std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue));
EXPECT_EQ(fullScratchSlot1Address, scratchInlineValue);
memset(scratchInlinePtr, 0, scratchInlinePointerSize);
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue));
EXPECT_EQ(0u, scratchInlineValue);
}
template <typename FamilyType>
void CommandListScratchPatchFixtureInit::testScratchSameNotPatching() {
auto csr = device->getNEODevice()->getDefaultEngine().commandStreamReceiver;
auto scratchController = csr->getScratchSpaceController();
NEO::EncodeDispatchKernelArgs dispatchKernelArgs = {};
dispatchKernelArgs.isHeaplessModeEnabled = true;
size_t inlineOffset = NEO::EncodeDispatchKernel<FamilyType>::getInlineDataOffset(dispatchKernelArgs);
uint64_t surfaceHeapGpuBase = getSurfStateGpuBase(false);
auto cmdListStream = commandList->commandContainer.getCommandStream();
const ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
auto result = ZE_RESULT_SUCCESS;
size_t usedBefore = cmdListStream->getUsed();
result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
size_t usedAfter = cmdListStream->getUsed();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList,
ptrOffset(cmdListStream->getCpuBase(), usedBefore),
usedAfter - usedBefore));
auto walkerIterator = NEO::UnitTestHelper<FamilyType>::findWalkerCmd(cmdList.begin(), cmdList.end(), true);
ASSERT_NE(cmdList.end(), walkerIterator);
void *walkerPtrWithScratch = *walkerIterator;
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto commandListHandle = commandList->toHandle();
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto scratchAddress = scratchController->getScratchPatchAddress();
auto fullScratchAddress = surfaceHeapGpuBase + scratchAddress;
uint64_t scratchInlineValue = 0;
void *scratchInlinePtr = ptrOffset(walkerPtrWithScratch, (inlineOffset + scratchInlineOffset));
std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue));
EXPECT_EQ(fullScratchAddress, scratchInlineValue);
memset(scratchInlinePtr, 0, scratchInlinePointerSize);
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue));
EXPECT_EQ(0u, scratchInlineValue);
}
} // namespace ult
} // namespace L0

View File

@@ -1564,20 +1564,60 @@ HWTEST2_F(CommandListScratchPatchPrivateHeapsTest,
testScratchInline<FamilyType>(false);
}
// CommandListScratchPatchPrivateHeapsTest fixture: runs the shared testScratchSameNotPatching scenario
// (executing the same regular command list twice) for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchPrivateHeapsTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchTwiceThenCorrectAddressPatchedOnce, IsAtLeastXeHpcCore) {
testScratchSameNotPatching<FamilyType>();
}
// CommandListScratchPatchPrivateHeapsTest fixture: runs the shared testScratchGrowingPatching scenario
// (scratch slot 1 added after the first execution) for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchPrivateHeapsTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithBiggerScratchSlot1ThenNewCorrectAddressPatched, IsAtLeastXeHpcCore) {
testScratchGrowingPatching<FamilyType>();
}
// CommandListScratchPatchGlobalStatelessHeapsTest fixture: runs the shared testScratchInline scenario
// with useImmediate == false (regular command list) for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchGlobalStatelessHeapsTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchThenExpectCorrectAddressPatched, IsAtLeastXeHpcCore) {
testScratchInline<FamilyType>(false);
}
// CommandListScratchPatchGlobalStatelessHeapsTest fixture: runs the shared testScratchSameNotPatching
// scenario (executing the same regular command list twice) for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchGlobalStatelessHeapsTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchTwiceThenCorrectAddressPatchedOnce, IsAtLeastXeHpcCore) {
testScratchSameNotPatching<FamilyType>();
}
// CommandListScratchPatchGlobalStatelessHeapsTest fixture: runs the shared testScratchGrowingPatching
// scenario (scratch slot 1 added after the first execution) for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchGlobalStatelessHeapsTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithBiggerScratchSlot1ThenNewCorrectAddressPatched, IsAtLeastXeHpcCore) {
testScratchGrowingPatching<FamilyType>();
}
// CommandListScratchPatchPrivateHeapsStateInitTest fixture: runs the shared testScratchInline scenario
// with useImmediate == false (regular command list) for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchPrivateHeapsStateInitTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchThenExpectCorrectAddressPatched, IsAtLeastXeHpcCore) {
testScratchInline<FamilyType>(false);
}
// CommandListScratchPatchPrivateHeapsStateInitTest fixture: runs the shared testScratchSameNotPatching
// scenario (executing the same regular command list twice) for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchPrivateHeapsStateInitTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchTwiceThenCorrectAddressPatchedOnce, IsAtLeastXeHpcCore) {
testScratchSameNotPatching<FamilyType>();
}
// CommandListScratchPatchPrivateHeapsStateInitTest fixture: runs the shared testScratchGrowingPatching
// scenario (scratch slot 1 added after the first execution) for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchPrivateHeapsStateInitTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithBiggerScratchSlot1ThenNewCorrectAddressPatched, IsAtLeastXeHpcCore) {
testScratchGrowingPatching<FamilyType>();
}
// CommandListScratchPatchGlobalStatelessHeapsStateInitTest fixture: runs the shared testScratchInline
// scenario with useImmediate == false (regular command list) for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchGlobalStatelessHeapsStateInitTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchThenExpectCorrectAddressPatched, IsAtLeastXeHpcCore) {
testScratchInline<FamilyType>(false);
}
// CommandListScratchPatchGlobalStatelessHeapsStateInitTest fixture: runs the shared
// testScratchSameNotPatching scenario (executing the same regular command list twice)
// for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchGlobalStatelessHeapsStateInitTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchTwiceThenCorrectAddressPatchedOnce, IsAtLeastXeHpcCore) {
testScratchSameNotPatching<FamilyType>();
}
// CommandListScratchPatchGlobalStatelessHeapsStateInitTest fixture: runs the shared
// testScratchGrowingPatching scenario (scratch slot 1 added after the first execution)
// for cores matching IsAtLeastXeHpcCore.
HWTEST2_F(CommandListScratchPatchGlobalStatelessHeapsStateInitTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithBiggerScratchSlot1ThenNewCorrectAddressPatched, IsAtLeastXeHpcCore) {
testScratchGrowingPatching<FamilyType>();
}
} // namespace ult
} // namespace L0

View File

@@ -948,9 +948,9 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr
auto commandQueue = std::make_unique<MockCommandQueueHw<gfxCoreFamily>>(device, csr, &desc);
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0));
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
commandList->commandsToPatch.push_back({});
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
commandList->commandsToPatch.clear();
CFE_STATE destinationCfeStates[4];
@@ -978,7 +978,7 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr
}
uint64_t patchedScratchAddress = 0xABCD00;
commandQueue->patchCommands(*commandList, patchedScratchAddress);
commandQueue->patchCommands(*commandList, patchedScratchAddress, 0, 0);
for (size_t i = 0; i < 4; i++) {
EXPECT_EQ(patchedScratchAddress, destinationCfeStates[i].getScratchSpaceBuffer());
auto &sourceCfeState = *reinterpret_cast<CFE_STATE *>(commandList->commandsToPatch[i].pCommand);
@@ -999,21 +999,21 @@ HWTEST2_F(CommandQueueScratchTests, givenCommandsToPatchToNotSupportedPlatformWh
auto commandQueue = std::make_unique<MockCommandQueueHw<gfxCoreFamily>>(device, csr, &desc);
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0));
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
commandList->commandsToPatch.push_back({});
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
commandList->commandsToPatch.clear();
CommandToPatch commandToPatch;
commandToPatch.type = CommandToPatch::FrontEndState;
commandList->commandsToPatch.push_back(commandToPatch);
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
commandList->commandsToPatch.clear();
commandToPatch.type = CommandToPatch::Invalid;
commandList->commandsToPatch.push_back(commandToPatch);
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
commandList->commandsToPatch.clear();
}