mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-29 17:13:29 +08:00
performance: limit copying compute walker from host view at kernel mutation
- at kernel mutation, copy post sync and inline data
- at mandatory group count/size mutation, copy dispatch data

Related-To: NEO-13916
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
88ed1ce169
commit
95dedbbcd8
@@ -690,7 +690,7 @@ ze_result_t MutableCommandListImp::updateMutableCommandKernelsExp(uint32_t numKe
|
||||
}
|
||||
}
|
||||
|
||||
// copy post sync and payload from old walker host view into new walker host view
|
||||
// copy post sync and possible indirect/scratch pointers from old walker host view into new walker host view
|
||||
auto newKernelComputeWalker = newMutableKernel->getMutableComputeWalker();
|
||||
newKernelComputeWalker->copyWalkerDataToHostBuffer(oldKernelComputeWalker);
|
||||
|
||||
@@ -700,8 +700,8 @@ ze_result_t MutableCommandListImp::updateMutableCommandKernelsExp(uint32_t numKe
|
||||
this->updateScratchAddress(scratchAddressPatchIndex, *oldKernelComputeWalker, *newKernelComputeWalker);
|
||||
}
|
||||
|
||||
// save new host view into command buffer
|
||||
newKernelComputeWalker->saveCpuBufferIntoGpuBuffer(false);
|
||||
// save new host view inline data/post sync into command buffer
|
||||
newKernelComputeWalker->saveCpuBufferIntoGpuBuffer(false, true);
|
||||
|
||||
// update remaining variables (signal/wait event variables) with the new compute walker so they reference the new post sync addresses
|
||||
for (auto &mutableVariableDescriptor : selectedAppend.variables) {
|
||||
|
||||
@@ -83,7 +83,7 @@ struct MutableComputeWalker {
|
||||
|
||||
virtual void copyWalkerDataToHostBuffer(MutableComputeWalker *sourceWalker) = 0;
|
||||
virtual void updateWalkerScratchPatchAddress(GpuAddress scratchPatchAddress) = 0;
|
||||
virtual void saveCpuBufferIntoGpuBuffer(bool useDispatchPart) = 0;
|
||||
virtual void saveCpuBufferIntoGpuBuffer(bool useDispatchPart, bool useInlinePostSyncPart) = 0;
|
||||
|
||||
protected:
|
||||
void *walker;
|
||||
|
||||
@@ -50,7 +50,7 @@ struct MutableComputeWalkerHw : public MutableComputeWalker, NEO::NonCopyableAnd
|
||||
|
||||
void copyWalkerDataToHostBuffer(MutableComputeWalker *sourceWalker) override;
|
||||
void updateWalkerScratchPatchAddress(GpuAddress scratchPatchAddress) override;
|
||||
void saveCpuBufferIntoGpuBuffer(bool useDispatchPart) override;
|
||||
void saveCpuBufferIntoGpuBuffer(bool useDispatchPart, bool useInlinePostSyncPart) override;
|
||||
|
||||
protected:
|
||||
template <typename WalkerType>
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "shared/source/command_stream/linear_stream.h"
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/device/device.h"
|
||||
#include "shared/source/helpers/ptr_math.h"
|
||||
|
||||
#include "level_zero/core/source/mutable_cmdlist/mutable_command_walker_hw.h"
|
||||
|
||||
@@ -71,4 +72,24 @@ void MutableComputeWalkerHw<GfxFamily>::updateSlmSize(const NEO::Device &device,
|
||||
this->setSlmSize(slmSize);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void MutableComputeWalkerHw<GfxFamily>::saveCpuBufferIntoGpuBuffer(bool useDispatchPart, bool useInlinePostSyncPart) {
|
||||
using WalkerType = typename GfxFamily::DefaultWalkerType;
|
||||
constexpr size_t dispatchPartSize = offsetof(WalkerType, TheStructure.Common.PostSync);
|
||||
constexpr size_t walkerSize = sizeof(WalkerType);
|
||||
constexpr size_t inlinePostSyncSize = walkerSize - dispatchPartSize;
|
||||
|
||||
if (useDispatchPart && useInlinePostSyncPart) {
|
||||
memcpy_s(this->walker, walkerSize, this->cpuBuffer, walkerSize);
|
||||
} else {
|
||||
if (useDispatchPart) {
|
||||
memcpy_s(this->walker, dispatchPartSize, this->cpuBuffer, dispatchPartSize);
|
||||
} else if (useInlinePostSyncPart) {
|
||||
auto cmdBufferInlinePostSync = ptrOffset(this->walker, dispatchPartSize);
|
||||
auto hostViewInlinePostSync = ptrOffset(this->cpuBuffer, dispatchPartSize);
|
||||
memcpy_s(cmdBufferInlinePostSync, inlinePostSyncSize, hostViewInlinePostSync, inlinePostSyncSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace L0::MCL
|
||||
|
||||
@@ -277,20 +277,6 @@ template <typename GfxFamily>
|
||||
void MutableComputeWalkerHw<GfxFamily>::updateWalkerScratchPatchAddress(GpuAddress scratchPatchAddress) {
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void MutableComputeWalkerHw<GfxFamily>::saveCpuBufferIntoGpuBuffer(bool useDispatchPart) {
|
||||
using WalkerType = typename GfxFamily::DefaultWalkerType;
|
||||
auto walkerCmdHostBuffer = reinterpret_cast<WalkerType *>(this->cpuBuffer);
|
||||
auto walkerCmd = reinterpret_cast<WalkerType *>(this->walker);
|
||||
|
||||
if (useDispatchPart) {
|
||||
constexpr size_t dispatchPartSize = offsetof(WalkerType, TheStructure.Common.PostSync);
|
||||
memcpy_s(this->walker, dispatchPartSize, this->cpuBuffer, dispatchPartSize);
|
||||
} else {
|
||||
*walkerCmd = *walkerCmdHostBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void MutableComputeWalkerHw<GfxFamily>::updateSpecificFields(const NEO::Device &device,
|
||||
MutableWalkerSpecificFieldsArguments &args) {
|
||||
@@ -341,7 +327,7 @@ void MutableComputeWalkerHw<GfxFamily>::updateSpecificFields(const NEO::Device &
|
||||
|
||||
walkerCmd->getRawData(computeDispatchAllWalkerEnableIndex) = cpuBufferWalker->getRawData(computeDispatchAllWalkerEnableIndex);
|
||||
} else {
|
||||
this->saveCpuBufferIntoGpuBuffer(true);
|
||||
this->saveCpuBufferIntoGpuBuffer(true, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -582,7 +582,8 @@ void VariableDispatch::commitChanges(const NEO::Device &device) {
|
||||
.updateSlm = this->commitSlmSize};
|
||||
mutableCommandWalker->updateSpecificFields(device, args);
|
||||
|
||||
mutableCommandWalker->saveCpuBufferIntoGpuBuffer(true);
|
||||
// save only dispatch part of walker (since this part was processed in group count/size mutation)
|
||||
mutableCommandWalker->saveCpuBufferIntoGpuBuffer(true, false);
|
||||
|
||||
cleanCommitVariableDispatch();
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include "shared/source/helpers/kernel_helpers.h"
|
||||
#include "shared/source/indirect_heap/indirect_heap.h"
|
||||
#include "shared/test/common/helpers/unit_test_helper.h"
|
||||
#include "shared/test/common/mocks/mock_graphics_allocation.h"
|
||||
#include "shared/test/common/test_macros/hw_test.h"
|
||||
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
|
||||
@@ -91,6 +92,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE,
|
||||
givenTwoKernelsWithSignalEventWhenFirstAppendedAndSecondMutatedThenPostSyncIsPreserved) {
|
||||
using WalkerType = typename FamilyType::PorWalkerType;
|
||||
|
||||
auto kernelIsaAddress = kernel->getIsaAllocation()->getGpuAddress();
|
||||
|
||||
auto kernel2IsaAddress = kernelIsaAddress + 0x20000;
|
||||
static_cast<NEO::MockGraphicsAllocation *>(kernel2->getIsaAllocation())->gpuAddress = kernel2IsaAddress;
|
||||
|
||||
auto event = createTestEvent(false, false, false, false);
|
||||
auto eventAddress = event->getGpuAddress(this->device);
|
||||
|
||||
@@ -128,6 +134,24 @@ HWCMDTEST_F(IGFX_XE_HP_CORE,
|
||||
|
||||
walkerPostSyncAddress = NEO::UnitTestHelper<FamilyType>::getWalkerActivePostSyncAddress(walkerCmd);
|
||||
EXPECT_EQ(eventAddress, walkerPostSyncAddress);
|
||||
|
||||
// kernel ISA will be updated after mandatory group count mutation
|
||||
EXPECT_EQ(kernelIsaAddress, walkerCmd->getInterfaceDescriptor().getKernelStartPointer());
|
||||
|
||||
ze_group_count_t mutatedGroupCount = {8, 2, 1};
|
||||
ze_mutable_group_count_exp_desc_t groupCountDesc = {ZE_STRUCTURE_TYPE_MUTABLE_GROUP_COUNT_EXP_DESC};
|
||||
groupCountDesc.commandId = commandId;
|
||||
groupCountDesc.pGroupCount = &mutatedGroupCount;
|
||||
mutableCommandsDesc.pNext = &groupCountDesc;
|
||||
|
||||
result = mutableCommandList->updateMutableCommandsExp(&mutableCommandsDesc);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
|
||||
result = mutableCommandList->close();
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
|
||||
// kernel 2 isa set
|
||||
EXPECT_EQ(kernel2IsaAddress, walkerCmd->getInterfaceDescriptor().getKernelStartPointer());
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE,
|
||||
|
||||
@@ -50,27 +50,48 @@ HWCMDTEST_F(IGFX_XE_HP_CORE,
|
||||
MutableHwCommandTest,
|
||||
givenMutableComputeWalkerMatchesComputeWalkerWhenObjectIsCreatedAndCopiedFromCpuMemoryThenExactCopyIsExpected) {
|
||||
using WalkerType = typename FamilyType::PorWalkerType;
|
||||
constexpr size_t dispatchPartSize = offsetof(WalkerType, TheStructure.Common.PostSync);
|
||||
constexpr size_t walkerSize = sizeof(WalkerType);
|
||||
constexpr size_t inlinePostSyncSize = walkerSize - dispatchPartSize;
|
||||
|
||||
void *inlinePostSyncCmdBuffer = ptrOffset(this->cmdBufferGpuPtr, dispatchPartSize);
|
||||
|
||||
auto walkerTemplate = FamilyType::template getInitGpuWalker<WalkerType>();
|
||||
createDefaultMutableWalker<FamilyType, WalkerType>(&walkerTemplate, true, false);
|
||||
void *inlinePostSyncCpuBuffer = ptrOffset(this->cmdBufferCpuPtr, dispatchPartSize);
|
||||
auto cpuBufferWalkerCmd = reinterpret_cast<WalkerType *>(this->cmdBufferCpuPtr);
|
||||
// dummy post sync address in inline/post sync part of walker
|
||||
cpuBufferWalkerCmd->getPostSync().setDestinationAddress(0x2000);
|
||||
// dummy kernel start address in dispatch part of walker
|
||||
cpuBufferWalkerCmd->getInterfaceDescriptor().setKernelStartPointer(0xFF000);
|
||||
|
||||
mutableWalker->saveCpuBufferIntoGpuBuffer(false);
|
||||
|
||||
// true, true will do the full copy of compute walker
|
||||
mutableWalker->saveCpuBufferIntoGpuBuffer(true, true);
|
||||
EXPECT_EQ(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, this->walkerCmdSize));
|
||||
|
||||
createDefaultMutableWalker<FamilyType, WalkerType>(&walkerTemplate, true, false);
|
||||
auto cpuBuffer = reinterpret_cast<WalkerType *>(this->cmdBufferCpuPtr);
|
||||
// dummy address to mutate in next step
|
||||
cpuBuffer->getPostSync().setDestinationAddress(0x2000);
|
||||
|
||||
uint64_t postSyncAddress = 0x1000;
|
||||
mutableWalker->setPostSyncAddress(postSyncAddress, 0);
|
||||
|
||||
memset(this->cmdBufferGpuPtr, 0, this->walkerCmdSize);
|
||||
|
||||
// true parameter will not save post syncs to gpu buffer
|
||||
mutableWalker->saveCpuBufferIntoGpuBuffer(true);
|
||||
// true, false parameter will not save post syncs to gpu buffer, but kernel ISA start address
|
||||
mutableWalker->saveCpuBufferIntoGpuBuffer(true, false);
|
||||
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, this->walkerCmdSize));
|
||||
EXPECT_EQ(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, dispatchPartSize));
|
||||
EXPECT_NE(0, memcmp(inlinePostSyncCpuBuffer, inlinePostSyncCmdBuffer, inlinePostSyncSize));
|
||||
|
||||
memset(this->cmdBufferGpuPtr, 0, this->walkerCmdSize);
|
||||
|
||||
// false, true parameter will save post syncs to gpu buffer, but not kernel ISA start address
|
||||
mutableWalker->saveCpuBufferIntoGpuBuffer(false, true);
|
||||
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, this->walkerCmdSize));
|
||||
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, dispatchPartSize));
|
||||
EXPECT_EQ(0, memcmp(inlinePostSyncCpuBuffer, inlinePostSyncCmdBuffer, inlinePostSyncSize));
|
||||
|
||||
memset(this->cmdBufferGpuPtr, 0, this->walkerCmdSize);
|
||||
|
||||
// false, false will not copy any data
|
||||
mutableWalker->saveCpuBufferIntoGpuBuffer(false, false);
|
||||
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, this->walkerCmdSize));
|
||||
EXPECT_NE(0, memcmp(this->cmdBufferCpuPtr, this->cmdBufferGpuPtr, dispatchPartSize));
|
||||
EXPECT_NE(0, memcmp(inlinePostSyncCpuBuffer, inlinePostSyncCmdBuffer, inlinePostSyncSize));
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE,
|
||||
|
||||
Reference in New Issue
Block a user