refactor: Improve scratch programming in heapless mode

Related-To: NEO-7621
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk 2024-01-30 01:00:53 +00:00 committed by Compute-Runtime-Automation
parent a104d9199d
commit 6d3a53fe7f
8 changed files with 68 additions and 54 deletions

View File

@ -107,41 +107,12 @@ inline void HardwareInterface<GfxFamily>::programWalker(
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);
auto requiredScratchSlot0Size = queueCsr.getRequiredScratchSlot0Size();
auto requiredScratchSlot1Size = queueCsr.getRequiredScratchSlot1Size();
uint64_t scratchAddress = 0u;
EncodeDispatchKernel<GfxFamily>::template setScratchAddress<heaplessModeEnabled>(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, &ssh, queueCsr);
auto interfaceDescriptor = &walkerCmd.getInterfaceDescriptor();
uint64_t scratchAddress = 0;
if constexpr (heaplessModeEnabled) {
auto scratchAllocation = queueCsr.getScratchAllocation();
auto scratchSpaceController = queueCsr.getScratchSpaceController();
if (scratchAllocation) {
scratchAddress = ssh.getGpuBase() + scratchSpaceController->getScratchPatchAddress();
} else {
auto requiredScratchSlot0Size = queueCsr.getRequiredScratchSlot0Size();
auto requiredScratchSlot1Size = queueCsr.getRequiredScratchSlot1Size();
bool stateBaseAddressDirty = false;
bool checkVfeStateDirty = false;
if (requiredScratchSlot0Size || requiredScratchSlot1Size) {
scratchSpaceController->setRequiredScratchSpace(ssh.getCpuBase(),
0u,
requiredScratchSlot0Size,
requiredScratchSlot1Size,
queueCsr.peekTaskCount(), queueCsr.getOsContext(),
stateBaseAddressDirty,
checkVfeStateDirty);
if (scratchSpaceController->getScratchSpaceSlot0Allocation()) {
queueCsr.makeResident(*scratchSpaceController->getScratchSpaceSlot0Allocation());
}
if (scratchSpaceController->getScratchSpaceSlot1Allocation()) {
queueCsr.makeResident(*scratchSpaceController->getScratchSpaceSlot1Allocation());
}
scratchAddress = ssh.getGpuBase() + scratchSpaceController->getScratchPatchAddress();
}
}
}
HardwareCommandsHelper<GfxFamily>::template sendIndirectState<WalkerType, InterfaceDescriptorType>(
commandStream,

View File

@ -184,6 +184,9 @@ struct EncodeDispatchKernel {
using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
return BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE;
}
// Scratch-address programming hook selected at compile time by isHeapless.
// scratchAddress is an in/out reference: callers initialize it (to 0 at the visible call sites)
// and read it back after the call. The remaining arguments carry the required scratch sizes for
// slot 0 and slot 1, the heap passed as ssh and the command stream receiver.
// NOTE(review): how each family/mode fills scratchAddress depends on the definition that gets
// instantiated - confirm against the per-family implementation.
template <bool isHeapless>
static void setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr);
};
template <typename GfxFamily>

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
* Copyright (C) 2020-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -768,6 +768,11 @@ size_t EncodeDispatchKernel<Family>::getDefaultDshAlignment() {
return EncodeStates<Family>::alignIndirectStatePointer;
}
// Generic definition of the scratch-address hook. The body is empty, so none of the arguments
// are read and scratchAddress keeps whatever value the caller passed in.
// NOTE(review): families that really program scratch in heapless mode presumably provide their
// own definition elsewhere - confirm before relying on this no-op for isHeapless == true.
template <typename Family>
template <bool isHeapless>
void EncodeDispatchKernel<Family>::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr) {
}
template <typename Family>
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) {
for (int i = 0; i < 3; ++i) {

View File

@ -22,6 +22,8 @@ template void NEO::EncodeDispatchKernel<Family>::encode<Family::DefaultWalkerTyp
template void NEO::EncodeDispatchKernel<Family>::encodeThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
template void NEO::EncodeDispatchKernel<Family>::adjustWalkOrder<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
template void NEO::EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
// Explicitly instantiate setScratchAddress for both values of its bool template parameter;
// callers pick the variant through a compile-time heaplessModeEnabled flag.
template void NEO::EncodeDispatchKernel<Family>::setScratchAddress<false>(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr);
template void NEO::EncodeDispatchKernel<Family>::setScratchAddress<true>(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr);
template struct NEO::EncodeStates<Family>;
template struct NEO::EncodeMath<Family>;

View File

@ -321,27 +321,15 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
}
if constexpr (heaplessModeEnabled) {
auto requiredScratchSlot0Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
auto requiredScratchSlot1Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[1];
uint64_t scratchAddress = 0;
if (requiredScratchSlot0Size > 0 || requiredScratchSlot1Size > 0) {
auto csr = args.device->getDefaultEngine().commandStreamReceiver;
auto scratchController = csr->getScratchSpaceController();
bool gsbaState = false;
bool frontEndState = false;
auto ssh = container.getIndirectHeap(HeapType::surfaceState);
scratchController->setRequiredScratchSpace(ssh->getCpuBase(), 0, requiredScratchSlot0Size, requiredScratchSlot1Size,
csr->peekTaskCount(), csr->getOsContext(), gsbaState, frontEndState);
auto csr = args.device->getDefaultEngine().commandStreamReceiver;
auto ssh = container.getIndirectHeap(HeapType::surfaceState);
if (scratchController->getScratchSpaceSlot0Allocation()) {
csr->makeResident(*scratchController->getScratchSpaceSlot0Allocation());
}
if (scratchController->getScratchSpaceSlot1Allocation()) {
csr->makeResident(*scratchController->getScratchSpaceSlot1Allocation());
}
uint64_t scratchAddress = 0u;
scratchAddress = ssh->getGpuBase() + scratchController->getScratchPatchAddress();
}
EncodeDispatchKernel<Family>::template setScratchAddress<heaplessModeEnabled>(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, ssh, *csr);
auto inlineDataPointer = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer());
auto indirectDataPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.indirectDataPointerAddress;

View File

@ -1539,3 +1539,19 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesImplicitScalingSecondaryBufferTe
givenDispatchImplicitScalingWithBbStartOverControlSectionWhenDispatchingAsSecondaryBufferContainerThenExpectSecondaryBatchBuffer) {
testBodyFindPrimaryBatchBuffer<FamilyType>();
}
using EncodeKernelScratchProgrammingTest = Test<ScratchProgrammingFixture>;

// With heapless mode disabled, setScratchAddress must leave the caller's scratch address
// untouched even when a non-zero slot 0 scratch size is requested.
HWTEST2_F(EncodeKernelScratchProgrammingTest, givenHeaplessModeDisabledWhenSetScratchAddressIsCalledThenDoNothing, IsAtLeastXeHpCore) {
    constexpr bool heaplessDisabled = false;
    constexpr uint32_t slot0ScratchSize = 64;
    constexpr uint32_t slot1ScratchSize = 0;
    constexpr uint64_t untouchedAddress = 0;

    uint64_t programmedAddress = untouchedAddress;
    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();

    EncodeDispatchKernel<FamilyType>::template setScratchAddress<heaplessDisabled>(programmedAddress, slot0ScratchSize, slot1ScratchSize, ssh, csr);

    EXPECT_EQ(untouchedAddress, programmedAddress);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2023 Intel Corporation
* Copyright (C) 2022-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -8,8 +8,10 @@
#include "shared/test/unit_test/fixtures/command_container_fixture.h"
#include "shared/source/indirect_heap/heap_size.h"
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/os_interface/product_helper.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
namespace NEO {
@ -72,6 +74,25 @@ EncodeDispatchKernelArgs CommandEncodeStatesFixture::createDefaultDispatchKernel
return args;
}
// Brings up the device fixture, then builds an IndirectHeap on top of a 512-byte buffer aligned
// to 0x1000 and attaches a MockGraphicsAllocation describing that same buffer.
void ScratchProgrammingFixture::setUp() {
    NEO::DeviceFixture::setUp();

    constexpr size_t heapSize = 512;
    constexpr size_t heapAlignment = 0x1000;

    // The heap object is created before the allocation check so that ssh is non-null
    // whenever tearDown() runs.
    ssh = new IndirectHeap{nullptr};
    sshBuffer = alignedMalloc(heapSize, heapAlignment);
    ASSERT_NE(nullptr, sshBuffer);

    ssh->replaceBuffer(sshBuffer, heapSize);
    ssh->replaceGraphicsAllocation(new MockGraphicsAllocation(sshBuffer, heapSize));
}
// Releases what setUp() created: the heap's graphics allocation, the heap itself and the backing
// buffer, in that order, and finally tears down the device fixture.
void ScratchProgrammingFixture::tearDown() {
    auto *heapAllocation = ssh->getGraphicsAllocation();
    delete heapAllocation;
    delete ssh;

    alignedFree(sshBuffer);

    NEO::DeviceFixture::tearDown();
}
} // namespace NEO
void WalkerThreadFixture::setUp() {

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
* Copyright (C) 2020-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -65,6 +65,14 @@ class CommandEncodeStatesFixture : public DeviceFixture {
NEO::L1CachePolicy l1CachePolicyData;
};
// Device fixture extended with an indirect heap for the scratch programming tests.
struct ScratchProgrammingFixture : NEO::DeviceFixture {
    void setUp();
    void tearDown();

    // Heap object created in setUp() and deleted in tearDown().
    IndirectHeap *ssh{nullptr};
    // Aligned CPU buffer backing the heap; allocated in setUp(), freed in tearDown().
    void *sshBuffer{nullptr};
};
} // namespace NEO
struct WalkerThreadFixture {