fix: patch compute walker scratch address when scratch controller is changed

- covers the scenario where a command list is moved from a normal-priority queue to a low-priority queue
- save only the scratch patch address; when it changes, enable patching
- kernels re-use the scratch patch address after the compute walker is updated

Related-To: NEO-11972

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2024-07-30 12:03:08 +00:00 committed by Compute-Runtime-Automation
parent 250601fd7b
commit 7ca0210ef2
13 changed files with 166 additions and 51 deletions

View File

@ -30,6 +30,10 @@
struct _ze_command_list_handle_t {};
namespace NEO {
class ScratchSpaceController;
} // namespace NEO
namespace L0 {
struct Device;
struct EventPool;
@ -198,13 +202,20 @@ struct CommandList : _ze_command_list_handle_t {
commandListPerThreadScratchSize[slotId] = size;
}
uint32_t getCommandListPatchedPerThreadScratchSize(uint32_t slotId) const {
return commandListPatchedPerThreadScratchSize[slotId];
uint64_t getCurrentScratchPatchAddress() const {
return currentScratchPatchAddress;
}
void setCommandListPatchedPerThreadScratchSize(uint32_t slotId, uint32_t size) {
UNRECOVERABLE_IF(slotId > 1);
commandListPatchedPerThreadScratchSize[slotId] = size;
void setCurrentScratchPatchAddress(uint64_t scratchPatchAddress) {
currentScratchPatchAddress = scratchPatchAddress;
}
// Returns the scratch space controller pointer stored on this command list.
// The member is initialized to nullptr and set back to nullptr in reset().
NEO::ScratchSpaceController *getCommandListUsedScratchController() const {
return usedScratchController;
}
// Stores the given scratch space controller pointer on this command list.
// The pointer is kept as-is (non-owning raw pointer member).
void setCommandListUsedScratchController(NEO::ScratchSpaceController *scratchController) {
usedScratchController = scratchController;
}
uint32_t getCommandListSLMEnable() const {
@ -406,17 +417,19 @@ struct CommandList : _ze_command_list_handle_t {
int64_t currentIndirectObjectBaseAddress = NEO::StreamProperty64::initValue;
int64_t currentBindingTablePoolBaseAddress = NEO::StreamProperty64::initValue;
uint64_t currentScratchPatchAddress = 0;
ze_context_handle_t hContext = nullptr;
CommandQueue *cmdQImmediate = nullptr;
CommandQueue *cmdQImmediateCopyOffload = nullptr;
Device *device = nullptr;
NEO::ScratchSpaceController *usedScratchController = nullptr;
size_t minimalSizeForBcsSplit = 4 * MemoryConstants::megaByte;
size_t cmdListCurrentStartOffset = 0;
size_t maxFillPaternSizeForCopyEngine = 0;
uint32_t commandListPerThreadScratchSize[2]{};
uint32_t commandListPatchedPerThreadScratchSize[2]{};
ze_command_list_flags_t flags = 0u;
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;

View File

@ -122,8 +122,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
commandListPreemptionMode = device->getDevicePreemptionMode();
commandListPerThreadScratchSize[0] = 0u;
commandListPerThreadScratchSize[1] = 0u;
commandListPatchedPerThreadScratchSize[0] = 0u;
commandListPatchedPerThreadScratchSize[1] = 0u;
currentScratchPatchAddress = 0u;
usedScratchController = nullptr;
requiredStreamState.resetState();
finalStreamState.resetState();
containsAnyKernel = false;

View File

@ -419,7 +419,7 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
cmdQ->makeResidentAndMigrate(performMigration, this->commandContainer.getResidencyContainer());
static_cast<CommandQueueHw<gfxCoreFamily> *>(this->cmdQImmediate)->patchCommands(*this, 0u, 0, 0);
static_cast<CommandQueueHw<gfxCoreFamily> *>(this->cmdQImmediate)->patchCommands(*this, 0u, false);
if (performMigration) {
this->migrateSharedAllocations();

View File

@ -56,9 +56,7 @@ struct CommandQueueHw : public CommandQueueImp {
uint32_t perThreadScratchSpaceSlot1Size);
bool getPreemptionCmdProgramming() override;
void patchCommands(CommandList &commandList, uint64_t scratchAddress,
uint32_t perThreadScratchSpaceSlot0Size,
uint32_t perThreadScratchSpaceSlot1Size);
void patchCommands(CommandList &commandList, uint64_t scratchAddress, bool patchNewScratchAddress);
protected:
struct CommandListExecutionContext {

View File

@ -1740,13 +1740,26 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStateBaseAddressDebugTracking() {
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, CommandListExecutionContext &ctx) {
bool patchNewScratchAddress = false;
uint64_t scratchAddress = ctx.scratchSpaceController->getScratchPatchAddress();
if (this->heaplessModeEnabled && this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) {
scratchAddress += ctx.globalStatelessAllocation->getGpuAddress();
if (this->heaplessModeEnabled) {
if (this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) {
scratchAddress += ctx.globalStatelessAllocation->getGpuAddress();
}
if (commandList.getCurrentScratchPatchAddress() != scratchAddress ||
commandList.getCommandListUsedScratchController() != ctx.scratchSpaceController) {
patchNewScratchAddress = true;
}
}
patchCommands(commandList, scratchAddress, patchNewScratchAddress);
if (patchNewScratchAddress) {
commandList.setCurrentScratchPatchAddress(scratchAddress);
commandList.setCommandListUsedScratchController(ctx.scratchSpaceController);
}
patchCommands(commandList, scratchAddress,
ctx.scratchSpaceController->getPerThreadScratchSpaceSizeSlot0(),
ctx.scratchSpaceController->getPerThreadScratchSizeSlot1());
}
} // namespace L0

View File

@ -134,8 +134,7 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::HeapContainer &heapC
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress,
uint32_t perThreadScratchSpaceSlot0Size,
uint32_t perThreadScratchSpaceSlot1Size) {
bool patchNewScratchAddress) {
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

View File

@ -167,19 +167,11 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::HeapContainer &sshHe
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress,
uint32_t perThreadScratchSpaceSlot0Size,
uint32_t perThreadScratchSpaceSlot1Size) {
bool patchNewScratchAddress) {
using CFE_STATE = typename GfxFamily::CFE_STATE;
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
bool patchNewScratchAddress = false;
if (this->heaplessModeEnabled &&
(commandList.getCommandListPatchedPerThreadScratchSize(0) < perThreadScratchSpaceSlot0Size ||
commandList.getCommandListPatchedPerThreadScratchSize(1) < perThreadScratchSpaceSlot1Size)) {
patchNewScratchAddress = true;
}
auto &commandsToPatch = commandList.getCommandsToPatch();
for (auto &commandToPatch : commandsToPatch) {
switch (commandToPatch.type) {
@ -262,11 +254,6 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
UNRECOVERABLE_IF(true);
}
}
if (patchNewScratchAddress) {
commandList.setCommandListPatchedPerThreadScratchSize(0, perThreadScratchSpaceSlot0Size);
commandList.setCommandListPatchedPerThreadScratchSize(1, perThreadScratchSpaceSlot1Size);
}
}
} // namespace L0

View File

@ -631,5 +631,13 @@ uint64_t CommandListScratchPatchFixtureInit::getSurfStateGpuBase(bool useImmedia
}
}
// Computes the scratch patch address the tests expect to find stored on the command list.
// controllerScratchAddress is the value obtained from the scratch controller.
// When fixtureGlobalStatelessMode == 1 the GPU address of the default engine CSR's
// global stateless heap allocation is added; otherwise the input is returned unchanged.
uint64_t CommandListScratchPatchFixtureInit::getExpectedScratchPatchAddress(uint64_t controllerScratchAddress) {
if (fixtureGlobalStatelessMode == 1) {
controllerScratchAddress += device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getGlobalStatelessHeapAllocation()->getGpuAddress();
}
return controllerScratchAddress;
}
} // namespace ult
} // namespace L0

View File

@ -367,6 +367,7 @@ struct CommandListScratchPatchFixtureInit : public ModuleMutableCommandListFixtu
void tearDown();
uint64_t getSurfStateGpuBase(bool useImmediate);
uint64_t getExpectedScratchPatchAddress(uint64_t controllerScratchAddress);
template <typename FamilyType>
void testScratchInline(bool useImmediate);
@ -380,6 +381,9 @@ struct CommandListScratchPatchFixtureInit : public ModuleMutableCommandListFixtu
template <typename FamilyType>
void testScratchImmediatePatching();
template <typename FamilyType>
void testScratchChangedControllerPatching();
int32_t fixtureGlobalStatelessMode = 0;
uint32_t scratchInlineOffset = 8;
uint32_t scratchInlinePointerSize = sizeof(uint64_t);

View File

@ -1533,12 +1533,14 @@ void CommandListScratchPatchFixtureInit::testScratchGrowingPatching() {
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(0x40u, commandList->getCommandListPatchedPerThreadScratchSize(0));
EXPECT_EQ(0u, commandList->getCommandListPatchedPerThreadScratchSize(1));
auto scratchAddress = scratchController->getScratchPatchAddress();
auto fullScratchAddress = surfaceHeapGpuBase + scratchAddress;
auto expectedScratchPatchAddress = getExpectedScratchPatchAddress(scratchAddress);
EXPECT_EQ(expectedScratchPatchAddress, commandList->getCurrentScratchPatchAddress());
EXPECT_EQ(scratchController, commandList->getCommandListUsedScratchController());
uint64_t scratchInlineValue = 0;
void *scratchInlinePtr = ptrOffset(walkerPtrWithScratch, (inlineOffset + scratchInlineOffset));
@ -1547,8 +1549,8 @@ void CommandListScratchPatchFixtureInit::testScratchGrowingPatching() {
commandList->reset();
EXPECT_EQ(0u, commandList->getCommandListPatchedPerThreadScratchSize(0));
EXPECT_EQ(0u, commandList->getCommandListPatchedPerThreadScratchSize(1));
EXPECT_EQ(0u, commandList->getCurrentScratchPatchAddress());
EXPECT_EQ(nullptr, commandList->getCommandListUsedScratchController());
mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[1] = 0x40;
@ -1570,18 +1572,17 @@ void CommandListScratchPatchFixtureInit::testScratchGrowingPatching() {
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// simulate slot 0 is already patched - to check slot 1 change is detected
commandList->setCommandListPatchedPerThreadScratchSize(0, 0x40);
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(0x40u, commandList->getCommandListPatchedPerThreadScratchSize(0));
EXPECT_EQ(0x40u, commandList->getCommandListPatchedPerThreadScratchSize(1));
scratchAddress = scratchController->getScratchPatchAddress();
auto fullScratchSlot1Address = surfaceHeapGpuBase + scratchAddress;
expectedScratchPatchAddress = getExpectedScratchPatchAddress(scratchAddress);
EXPECT_EQ(expectedScratchPatchAddress, commandList->getCurrentScratchPatchAddress());
EXPECT_EQ(scratchController, commandList->getCommandListUsedScratchController());
scratchInlinePtr = ptrOffset(walkerPtrWithSlot1Scratch, (inlineOffset + scratchInlineOffset));
std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue));
EXPECT_EQ(fullScratchSlot1Address, scratchInlineValue);
@ -1702,5 +1703,91 @@ void CommandListScratchPatchFixtureInit::testScratchImmediatePatching() {
EXPECT_EQ(0u, scratchInlineValue);
}
// Checks scratch address patching when the CSR's scratch space controller is replaced
// between executions of the same command list:
//  1. the first execution writes (surface-state GPU base + initial controller's scratch
//     patch address) into the walker inline data and records address/controller on the list,
//  2. after a new controller is created, the next execution writes the value derived from
//     the new controller and records the new controller on the list,
//  3. one more execution with nothing changed is expected to leave the zeroed inline data as is.
template <typename FamilyType>
void CommandListScratchPatchFixtureInit::testScratchChangedControllerPatching() {
auto csr = device->getNEODevice()->getDefaultEngine().commandStreamReceiver;
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(csr);
auto scratchControllerInitial = csr->getScratchSpaceController();
NEO::EncodeDispatchKernelArgs dispatchKernelArgs = {};
dispatchKernelArgs.isHeaplessModeEnabled = true;
// offset of the inline data within the walker command for heapless mode
size_t inlineOffset = NEO::EncodeDispatchKernel<FamilyType>::getInlineDataOffset(dispatchKernelArgs);
uint64_t surfaceHeapGpuBase = getSurfStateGpuBase(false);
auto cmdListStream = commandList->commandContainer.getCommandStream();
const ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
auto result = ZE_RESULT_SUCCESS;
// remember stream usage around the append so only the newly emitted commands are parsed
size_t usedBefore = cmdListStream->getUsed();
result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
size_t usedAfter = cmdListStream->getUsed();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList,
ptrOffset(cmdListStream->getCpuBase(), usedBefore),
usedAfter - usedBefore));
// locate the walker command emitted by the append; its inline data is inspected below
auto walkerIterator = NEO::UnitTestHelper<FamilyType>::findWalkerTypeCmd(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), walkerIterator);
void *walkerPtrWithScratch = *walkerIterator;
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// first execution: uses the initial scratch controller
auto commandListHandle = commandList->toHandle();
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto scratchAddress = scratchControllerInitial->getScratchPatchAddress();
auto fullScratchAddress = surfaceHeapGpuBase + scratchAddress;
auto expectedScratchPatchAddress = getExpectedScratchPatchAddress(scratchAddress);
EXPECT_EQ(expectedScratchPatchAddress, commandList->getCurrentScratchPatchAddress());
EXPECT_EQ(scratchControllerInitial, commandList->getCommandListUsedScratchController());
uint64_t scratchInlineValue = 0;
void *scratchInlinePtr = ptrOffset(walkerPtrWithScratch, (inlineOffset + scratchInlineOffset));
std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue));
EXPECT_EQ(fullScratchAddress, scratchInlineValue);
// zero the patched field so a write by the next execution is observable
memset(scratchInlinePtr, 0, scratchInlinePointerSize);
// simulate execution on different scratch controller (execution of command list from normal to low priority queue)
ultCsr->createScratchSpaceController();
auto scratchControllerSecond = csr->getScratchSpaceController();
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// second execution: command list state and inline data must reflect the new controller
scratchAddress = scratchControllerSecond->getScratchPatchAddress();
fullScratchAddress = surfaceHeapGpuBase + scratchAddress;
expectedScratchPatchAddress = getExpectedScratchPatchAddress(scratchAddress);
EXPECT_EQ(expectedScratchPatchAddress, commandList->getCurrentScratchPatchAddress());
EXPECT_EQ(scratchControllerSecond, commandList->getCommandListUsedScratchController());
scratchInlinePtr = ptrOffset(walkerPtrWithScratch, (inlineOffset + scratchInlineOffset));
std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue));
EXPECT_EQ(fullScratchAddress, scratchInlineValue);
// zero the field again; with controller and address unchanged the third execution
// is expected not to rewrite it, so it must still read back as zero
memset(scratchInlinePtr, 0, scratchInlinePointerSize);
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue));
EXPECT_EQ(0u, scratchInlineValue);
}
} // namespace ult
} // namespace L0

View File

@ -1586,6 +1586,11 @@ HWTEST2_F(CommandListScratchPatchPrivateHeapsTest,
testScratchGrowingPatching<FamilyType>();
}
// Runs the changed-scratch-controller patching scenario on the private-heaps fixture;
// IsAtLeastXeHpcCore is passed to HWTEST2_F as the platform matcher.
HWTEST2_F(CommandListScratchPatchPrivateHeapsTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithChangedScratchControllerThenUpdatedCorrectAddressPatched, IsAtLeastXeHpcCore) {
testScratchChangedControllerPatching<FamilyType>();
}
HWTEST2_F(CommandListScratchPatchGlobalStatelessHeapsTest,
givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchThenExpectCorrectAddressPatched, IsAtLeastXeHpcCore) {
testScratchInline<FamilyType>(false);

View File

@ -953,9 +953,9 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr
auto commandQueue = std::make_unique<MockCommandQueueHw<gfxCoreFamily>>(device, csr, &desc);
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, false));
commandList->commandsToPatch.push_back({});
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false));
commandList->commandsToPatch.clear();
CFE_STATE destinationCfeStates[4];
@ -983,7 +983,7 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr
}
uint64_t patchedScratchAddress = 0xABCD00;
commandQueue->patchCommands(*commandList, patchedScratchAddress, 0, 0);
commandQueue->patchCommands(*commandList, patchedScratchAddress, false);
for (size_t i = 0; i < 4; i++) {
EXPECT_EQ(patchedScratchAddress, destinationCfeStates[i].getScratchSpaceBuffer());
auto &sourceCfeState = *reinterpret_cast<CFE_STATE *>(commandList->commandsToPatch[i].pCommand);
@ -1004,21 +1004,21 @@ HWTEST2_F(CommandQueueScratchTests, givenCommandsToPatchToNotSupportedPlatformWh
auto commandQueue = std::make_unique<MockCommandQueueHw<gfxCoreFamily>>(device, csr, &desc);
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0, false));
commandList->commandsToPatch.push_back({});
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false));
commandList->commandsToPatch.clear();
CommandToPatch commandToPatch;
commandToPatch.type = CommandToPatch::FrontEndState;
commandList->commandsToPatch.push_back(commandToPatch);
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false));
commandList->commandsToPatch.clear();
commandToPatch.type = CommandToPatch::Invalid;
commandList->commandsToPatch.push_back(commandToPatch);
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, 0, 0));
EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0, false));
commandList->commandsToPatch.clear();
}

View File

@ -55,6 +55,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::blitterDirectSubmission;
using BaseClass::checkPlatformSupportsGpuIdleImplicitFlush;
using BaseClass::checkPlatformSupportsNewResourceImplicitFlush;
using BaseClass::createScratchSpaceController;
using BaseClass::csrSizeRequestFlags;
using BaseClass::dcFlushSupport;
using BaseClass::directSubmission;