performance: limit copying compute walker from host view at kernel mutation

- at kernel mutation, copy only the post sync and inline data
- at mandatory group count/size mutation, copy only the dispatch data

Related-To: NEO-13916

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-07-17 12:14:33 +00:00
committed by Compute-Runtime-Automation
parent 88ed1ce169
commit 95dedbbcd8
8 changed files with 86 additions and 33 deletions

View File

@@ -690,7 +690,7 @@ ze_result_t MutableCommandListImp::updateMutableCommandKernelsExp(uint32_t numKe
}
}
// copy post sync and payload from old walker host view into new walker host view
// copy post sync and possible indirect/scratch pointers from old walker host view into new walker host view
auto newKernelComputeWalker = newMutableKernel->getMutableComputeWalker();
newKernelComputeWalker->copyWalkerDataToHostBuffer(oldKernelComputeWalker);
@@ -700,8 +700,8 @@ ze_result_t MutableCommandListImp::updateMutableCommandKernelsExp(uint32_t numKe
this->updateScratchAddress(scratchAddressPatchIndex, *oldKernelComputeWalker, *newKernelComputeWalker);
}
// save new host view into command buffer
newKernelComputeWalker->saveCpuBufferIntoGpuBuffer(false);
// save new host view inline data/post sync into command buffer
newKernelComputeWalker->saveCpuBufferIntoGpuBuffer(false, true);
// update remaining variables (signal/wait event variables) with the new compute walker so they reference the new post sync addresses
for (auto &mutableVariableDescriptor : selectedAppend.variables) {

View File

@@ -83,7 +83,7 @@ struct MutableComputeWalker {
virtual void copyWalkerDataToHostBuffer(MutableComputeWalker *sourceWalker) = 0;
virtual void updateWalkerScratchPatchAddress(GpuAddress scratchPatchAddress) = 0;
virtual void saveCpuBufferIntoGpuBuffer(bool useDispatchPart) = 0;
virtual void saveCpuBufferIntoGpuBuffer(bool useDispatchPart, bool useInlinePostSyncPart) = 0;
protected:
void *walker;

View File

@@ -50,7 +50,7 @@ struct MutableComputeWalkerHw : public MutableComputeWalker, NEO::NonCopyableAnd
void copyWalkerDataToHostBuffer(MutableComputeWalker *sourceWalker) override;
void updateWalkerScratchPatchAddress(GpuAddress scratchPatchAddress) override;
void saveCpuBufferIntoGpuBuffer(bool useDispatchPart) override;
void saveCpuBufferIntoGpuBuffer(bool useDispatchPart, bool useInlinePostSyncPart) override;
protected:
template <typename WalkerType>

View File

@@ -10,6 +10,7 @@
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/ptr_math.h"
#include "level_zero/core/source/mutable_cmdlist/mutable_command_walker_hw.h"
@@ -71,4 +72,24 @@ void MutableComputeWalkerHw<GfxFamily>::updateSlmSize(const NEO::Device &device,
this->setSlmSize(slmSize);
}
// Copies selected regions of the host-view walker (cpuBuffer) into the walker
// referenced by this->walker.
//   useDispatchPart       - copy the bytes located before the PostSync field
//   useInlinePostSyncPart - copy the bytes from the PostSync field up to the end of the walker
// With both flags set the whole walker is copied in one call; with neither set nothing is copied.
template <typename GfxFamily>
void MutableComputeWalkerHw<GfxFamily>::saveCpuBufferIntoGpuBuffer(bool useDispatchPart, bool useInlinePostSyncPart) {
    using WalkerType = typename GfxFamily::DefaultWalkerType;

    // The PostSync field offset is the boundary between the two copyable regions.
    constexpr size_t postSyncOffset = offsetof(WalkerType, TheStructure.Common.PostSync);
    constexpr size_t fullWalkerSize = sizeof(WalkerType);
    constexpr size_t tailSize = fullWalkerSize - postSyncOffset;

    if (useDispatchPart) {
        // Copy starts at the walker beginning; its length depends on whether the tail is requested too.
        const size_t bytesToCopy = useInlinePostSyncPart ? fullWalkerSize : postSyncOffset;
        memcpy_s(this->walker, bytesToCopy, this->cpuBuffer, bytesToCopy);
        return;
    }

    if (useInlinePostSyncPart) {
        // Only the region starting at PostSync is transferred; the leading part stays untouched.
        auto destinationTail = ptrOffset(this->walker, postSyncOffset);
        auto sourceTail = ptrOffset(this->cpuBuffer, postSyncOffset);
        memcpy_s(destinationTail, tailSize, sourceTail, tailSize);
    }
}
} // namespace L0::MCL

View File

@@ -277,20 +277,6 @@ template <typename GfxFamily>
void MutableComputeWalkerHw<GfxFamily>::updateWalkerScratchPatchAddress(GpuAddress scratchPatchAddress) {
}
// Copies the host-view walker (cpuBuffer) into the walker referenced by this->walker.
//   useDispatchPart == true  - copy only the bytes located before the PostSync field
//   useDispatchPart == false - assign the whole walker structure
template <typename GfxFamily>
void MutableComputeWalkerHw<GfxFamily>::saveCpuBufferIntoGpuBuffer(bool useDispatchPart) {
    using WalkerType = typename GfxFamily::DefaultWalkerType;

    if (useDispatchPart) {
        // Partial copy: stop right before PostSync so that region of the destination is preserved.
        constexpr size_t bytesBeforePostSync = offsetof(WalkerType, TheStructure.Common.PostSync);
        memcpy_s(this->walker, bytesBeforePostSync, this->cpuBuffer, bytesBeforePostSync);
        return;
    }

    // Full copy through typed struct assignment.
    auto *sourceWalker = reinterpret_cast<WalkerType *>(this->cpuBuffer);
    auto *destinationWalker = reinterpret_cast<WalkerType *>(this->walker);
    *destinationWalker = *sourceWalker;
}
template <typename GfxFamily>
void MutableComputeWalkerHw<GfxFamily>::updateSpecificFields(const NEO::Device &device,
MutableWalkerSpecificFieldsArguments &args) {
@@ -341,7 +327,7 @@ void MutableComputeWalkerHw<GfxFamily>::updateSpecificFields(const NEO::Device &
walkerCmd->getRawData(computeDispatchAllWalkerEnableIndex) = cpuBufferWalker->getRawData(computeDispatchAllWalkerEnableIndex);
} else {
this->saveCpuBufferIntoGpuBuffer(true);
this->saveCpuBufferIntoGpuBuffer(true, false);
}
}
}

View File

@@ -582,7 +582,8 @@ void VariableDispatch::commitChanges(const NEO::Device &device) {
.updateSlm = this->commitSlmSize};
mutableCommandWalker->updateSpecificFields(device, args);
mutableCommandWalker->saveCpuBufferIntoGpuBuffer(true);
// save only dispatch part of walker (since this part was processed in group count/size mutation)
mutableCommandWalker->saveCpuBufferIntoGpuBuffer(true, false);
cleanCommitVariableDispatch();
}

View File

@@ -9,6 +9,7 @@
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
#include "shared/test/common/test_macros/hw_test.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
@@ -91,6 +92,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE,
givenTwoKernelsWithSignalEventWhenFirstAppendedAndSecondMutatedThenPostSyncIsPreserved) {
using WalkerType = typename FamilyType::PorWalkerType;
auto kernelIsaAddress = kernel->getIsaAllocation()->getGpuAddress();
auto kernel2IsaAddress = kernelIsaAddress + 0x20000;
static_cast<NEO::MockGraphicsAllocation *>(kernel2->getIsaAllocation())->gpuAddress = kernel2IsaAddress;
auto event = createTestEvent(false, false, false, false);
auto eventAddress = event->getGpuAddress(this->device);
@@ -128,6 +134,24 @@ HWCMDTEST_F(IGFX_XE_HP_CORE,
walkerPostSyncAddress = NEO::UnitTestHelper<FamilyType>::getWalkerActivePostSyncAddress(walkerCmd);
EXPECT_EQ(eventAddress, walkerPostSyncAddress);
// kernel ISA will be updated after mandatory group count mutation
EXPECT_EQ(kernelIsaAddress, walkerCmd->getInterfaceDescriptor().getKernelStartPointer());
ze_group_count_t mutatedGroupCount = {8, 2, 1};
ze_mutable_group_count_exp_desc_t groupCountDesc = {ZE_STRUCTURE_TYPE_MUTABLE_GROUP_COUNT_EXP_DESC};
groupCountDesc.commandId = commandId;
groupCountDesc.pGroupCount = &mutatedGroupCount;
mutableCommandsDesc.pNext = &groupCountDesc;
result = mutableCommandList->updateMutableCommandsExp(&mutableCommandsDesc);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = mutableCommandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// kernel 2 isa set
EXPECT_EQ(kernel2IsaAddress, walkerCmd->getInterfaceDescriptor().getKernelStartPointer());
}
HWCMDTEST_F(IGFX_XE_HP_CORE,

View File

@@ -50,27 +50,48 @@ HWCMDTEST_F(IGFX_XE_HP_CORE,
MutableHwCommandTest,
givenMutableComputeWalkerMatchesComputeWalkerWhenObjectIsCreatedAndCopiedFromCpuMemoryThenExactCopyIsExpected) {
using WalkerType = typename FamilyType::PorWalkerType;
constexpr size_t dispatchPartSize = offsetof(WalkerType, TheStructure.Common.PostSync);
constexpr size_t walkerSize = sizeof(WalkerType);
constexpr size_t inlinePostSyncSize = walkerSize - dispatchPartSize;
void *inlinePostSyncCmdBuffer = ptrOffset(this->cmdBufferGpuPtr, dispatchPartSize);
auto walkerTemplate = FamilyType::template getInitGpuWalker<WalkerType>();
createDefaultMutableWalker<FamilyType, WalkerType>(&walkerTemplate, true, false);
void *inlinePostSyncCpuBuffer = ptrOffset(this->cmdBufferCpuPtr, dispatchPartSize);
auto cpuBufferWalkerCmd = reinterpret_cast<WalkerType *>(this->cmdBufferCpuPtr);
// dummy post sync address in inline/post sync part of walker
cpuBufferWalkerCmd->getPostSync().setDestinationAddress(0x2000);
// dummy kernel start address in dispatch part of walker
cpuBufferWalkerCmd->getInterfaceDescriptor().setKernelStartPointer(0xFF000);
mutableWalker->saveCpuBufferIntoGpuBuffer(false);
// true, true will do the full copy of compute walker
mutableWalker->saveCpuBufferIntoGpuBuffer(true, true);
EXPECT_EQ(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, this->walkerCmdSize));
createDefaultMutableWalker<FamilyType, WalkerType>(&walkerTemplate, true, false);
auto cpuBuffer = reinterpret_cast<WalkerType *>(this->cmdBufferCpuPtr);
// dummy address to mutate in next step
cpuBuffer->getPostSync().setDestinationAddress(0x2000);
uint64_t postSyncAddress = 0x1000;
mutableWalker->setPostSyncAddress(postSyncAddress, 0);
memset(this->cmdBufferGpuPtr, 0, this->walkerCmdSize);
// true parameter will not save post syncs to gpu buffer
mutableWalker->saveCpuBufferIntoGpuBuffer(true);
// true, false parameter will not save post syncs to gpu buffer, but kernel ISA start address
mutableWalker->saveCpuBufferIntoGpuBuffer(true, false);
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, this->walkerCmdSize));
EXPECT_EQ(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, dispatchPartSize));
EXPECT_NE(0, memcmp(inlinePostSyncCpuBuffer, inlinePostSyncCmdBuffer, inlinePostSyncSize));
memset(this->cmdBufferGpuPtr, 0, this->walkerCmdSize);
// false, true parameter will save post syncs to gpu buffer, but not kernel ISA start address
mutableWalker->saveCpuBufferIntoGpuBuffer(false, true);
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, this->walkerCmdSize));
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, dispatchPartSize));
EXPECT_EQ(0, memcmp(inlinePostSyncCpuBuffer, inlinePostSyncCmdBuffer, inlinePostSyncSize));
memset(this->cmdBufferGpuPtr, 0, this->walkerCmdSize);
// false, false will not copy any data
mutableWalker->saveCpuBufferIntoGpuBuffer(false, false);
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, this->walkerCmdSize));
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, dispatchPartSize));
EXPECT_NE(0, memcmp(inlinePostSyncCpuBuffer, inlinePostSyncCmdBuffer, inlinePostSyncSize));
}
HWCMDTEST_F(IGFX_XE_HP_CORE,