feature: update processing kernel residency

- save the position in the kernel internal residency container for allocations that can change
- reuse the same position when a new allocation arrives
- add an index container for the additional allocation of an image argument
- save the position of the additional allocation of an image argument
- reuse that position when a new image argument is set

Related-To: NEO-11719

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2024-09-09 10:17:13 +00:00 committed by Compute-Runtime-Automation
parent d103f9f0f6
commit 0628d97ee1
9 changed files with 192 additions and 15 deletions

View File

@ -820,7 +820,16 @@ ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void
argumentsResidencyContainer[argIndex] = image->getAllocation();
if (image->getImplicitArgsAllocation()) {
this->argumentsResidencyContainer.push_back(image->getImplicitArgsAllocation());
if (implicitArgsResidencyContainerIndices[argIndex] == std::numeric_limits<size_t>::max()) {
implicitArgsResidencyContainerIndices[argIndex] = argumentsResidencyContainer.size();
argumentsResidencyContainer.push_back(image->getImplicitArgsAllocation());
} else {
argumentsResidencyContainer[implicitArgsResidencyContainerIndices[argIndex]] = image->getImplicitArgsAllocation();
}
} else {
if (implicitArgsResidencyContainerIndices[argIndex] != std::numeric_limits<size_t>::max()) {
argumentsResidencyContainer[implicitArgsResidencyContainerIndices[argIndex]] = nullptr;
}
}
auto imageInfo = image->getImageInfo();
@ -1085,6 +1094,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
}
argumentsResidencyContainer.resize(this->kernelArgHandlers.size(), nullptr);
implicitArgsResidencyContainerIndices.resize(this->kernelArgHandlers.size(), std::numeric_limits<size_t>::max());
auto &kernelAttributes = kernelDescriptor.kernelAttributes;
if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
@ -1179,14 +1189,24 @@ bool KernelImp::usesRegionGroupBarrier() const {
}
/**
 * Tracks the sync buffer allocation in the internal residency container and
 * patches its address into the kernel's cross-thread data.
 *
 * The allocation occupies a single slot of internalResidencyContainer: the
 * first call appends it and records the slot in syncBufferIndex, every later
 * call overwrites that same slot. The allocation must not be appended
 * unconditionally as well, otherwise the container grows on every call and
 * holds the same allocation more than once.
 *
 * @param gfxAllocation allocation backing the sync buffer; must not be null,
 *                      it is dereferenced to obtain the address to patch
 * @param bufferOffset  offset applied to the allocation's GPU address before patching
 */
void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
    if (syncBufferIndex == std::numeric_limits<size_t>::max()) {
        // No slot reserved yet: the new element lands at the current size.
        syncBufferIndex = this->internalResidencyContainer.size();
        this->internalResidencyContainer.push_back(gfxAllocation);
    } else {
        // Slot already reserved: replace the previous allocation in place.
        this->internalResidencyContainer[syncBufferIndex] = gfxAllocation;
    }

    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                      this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.syncBufferAddress,
                      static_cast<uintptr_t>(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)));
}
void KernelImp::patchRegionGroupBarrier(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
this->internalResidencyContainer.push_back(gfxAllocation);
if (regionGroupBarrierIndex == std::numeric_limits<size_t>::max()) {
regionGroupBarrierIndex = this->internalResidencyContainer.size();
this->internalResidencyContainer.push_back(gfxAllocation);
} else {
this->internalResidencyContainer[regionGroupBarrierIndex] = gfxAllocation;
}
NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.regionGroupBarrierBuffer,

View File

@ -215,10 +215,13 @@ struct KernelImp : Kernel {
std::vector<KernelArgInfo> kernelArgInfos;
std::vector<KernelImp::KernelArgHandler> kernelArgHandlers;
std::vector<NEO::GraphicsAllocation *> argumentsResidencyContainer;
std::vector<size_t> implicitArgsResidencyContainerIndices;
std::vector<NEO::GraphicsAllocation *> internalResidencyContainer;
std::mutex *devicePrintfKernelMutex = nullptr;
NEO::GraphicsAllocation *printfBuffer = nullptr;
size_t syncBufferIndex = std::numeric_limits<size_t>::max();
size_t regionGroupBarrierIndex = std::numeric_limits<size_t>::max();
uint32_t groupSize[3] = {0u, 0u, 0u};
uint32_t numThreadsPerThreadGroup = 1u;

View File

@ -45,6 +45,7 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp {
using ::L0::KernelImp::dynamicStateHeapData;
using ::L0::KernelImp::dynamicStateHeapDataSize;
using ::L0::KernelImp::groupSize;
using ::L0::KernelImp::implicitArgsResidencyContainerIndices;
using ::L0::KernelImp::internalResidencyContainer;
using ::L0::KernelImp::isBindlessOffsetSet;
using ::L0::KernelImp::kernelHasIndirectAccess;
@ -61,12 +62,14 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp {
using ::L0::KernelImp::perThreadDataSizeForWholeThreadGroup;
using ::L0::KernelImp::pImplicitArgs;
using ::L0::KernelImp::printfBuffer;
using ::L0::KernelImp::regionGroupBarrierIndex;
using ::L0::KernelImp::requiredWorkgroupOrder;
using ::L0::KernelImp::setAssertBuffer;
using ::L0::KernelImp::slmArgsTotalSize;
using ::L0::KernelImp::suggestGroupSizeCache;
using ::L0::KernelImp::surfaceStateHeapData;
using ::L0::KernelImp::surfaceStateHeapDataSize;
using ::L0::KernelImp::syncBufferIndex;
using ::L0::KernelImp::unifiedMemoryControls;
using ::L0::KernelImp::usingSurfaceStateHeap;

View File

@ -17,6 +17,7 @@
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/mocks/mock_sync_buffer_handler.h"
#include "shared/test/common/test_macros/hw_test.h"
#include "level_zero/core/source/event/event.h"
@ -435,6 +436,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
Mock<::L0::KernelImp> kernel;
auto pMockModule = std::unique_ptr<Module>(new Mock<Module>(device, nullptr));
kernel.module = pMockModule.get();
EXPECT_EQ(std::numeric_limits<size_t>::max(), kernel.syncBufferIndex);
kernel.setGroupSize(4, 1, 1);
ze_group_count_t groupCount{8, 1, 1};
@ -458,6 +460,15 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
auto result = pCommandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, cooperativeParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto mockSyncBufferHandler = reinterpret_cast<MockSyncBufferHandler *>(device->getNEODevice()->syncBufferHandler.get());
auto syncBufferAllocation = mockSyncBufferHandler->graphicsAllocation;
EXPECT_NE(std::numeric_limits<size_t>::max(), kernel.syncBufferIndex);
auto syncBufferAllocationIt = std::find(kernel.internalResidencyContainer.begin(), kernel.internalResidencyContainer.end(), syncBufferAllocation);
ASSERT_NE(kernel.internalResidencyContainer.end(), syncBufferAllocationIt);
auto expectedIndex = static_cast<size_t>(std::distance(kernel.internalResidencyContainer.begin(), syncBufferAllocationIt));
EXPECT_EQ(expectedIndex, kernel.syncBufferIndex);
pCommandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
pCommandList->initialize(device, engineGroupType, 0u);
CmdListKernelLaunchParams launchParams = {};
@ -465,6 +476,14 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
result = pCommandList->appendLaunchKernelWithParams(&kernel, groupCount, nullptr, launchParams);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// sync buffer index once set should not change
EXPECT_EQ(expectedIndex, kernel.syncBufferIndex);
syncBufferAllocationIt = std::find(kernel.internalResidencyContainer.begin(), kernel.internalResidencyContainer.end(), syncBufferAllocation);
ASSERT_NE(kernel.internalResidencyContainer.end(), syncBufferAllocationIt);
// verify syncBufferAllocation is added only once
auto notFoundIt = std::find(syncBufferAllocationIt + 1, kernel.internalResidencyContainer.end(), syncBufferAllocation);
EXPECT_EQ(kernel.internalResidencyContainer.end(), notFoundIt);
{
VariableBackup<std::array<bool, 4>> usesSyncBuffer{&kernelAttributes.flags.packed};
usesSyncBuffer = {};
@ -498,6 +517,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenA
Mock<::L0::KernelImp> kernel;
auto pMockModule = std::unique_ptr<Module>(new Mock<Module>(device, nullptr));
kernel.module = pMockModule.get();
EXPECT_EQ(std::numeric_limits<size_t>::max(), kernel.regionGroupBarrierIndex);
kernel.crossThreadData = std::make_unique<uint8_t[]>(64);
kernel.crossThreadDataSize = 64;
@ -528,9 +548,23 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenA
return element.first->getGpuAddressToPatch() == patchPtr;
});
ASSERT_NE(ultCsr->makeResidentAllocations.end(), allocIter);
auto regionGroupBarrierAllocation = allocIter->first;
auto regionGroupBarrierAllocIt = std::find(kernel.internalResidencyContainer.begin(), kernel.internalResidencyContainer.end(), regionGroupBarrierAllocation);
ASSERT_NE(kernel.internalResidencyContainer.end(), regionGroupBarrierAllocIt);
auto expectedIndex = static_cast<size_t>(std::distance(kernel.internalResidencyContainer.begin(), regionGroupBarrierAllocIt));
EXPECT_EQ(expectedIndex, kernel.regionGroupBarrierIndex);
EXPECT_EQ(ZE_RESULT_SUCCESS, cmdList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false));
// region group barrier index once set should not change
EXPECT_EQ(expectedIndex, kernel.regionGroupBarrierIndex);
regionGroupBarrierAllocIt = std::find(kernel.internalResidencyContainer.begin(), kernel.internalResidencyContainer.end(), regionGroupBarrierAllocation);
ASSERT_NE(kernel.internalResidencyContainer.end(), regionGroupBarrierAllocIt);
// verify regionGroupBarrierAllocation is added only once
auto notFoundIt = std::find(regionGroupBarrierAllocIt + 1, kernel.internalResidencyContainer.end(), regionGroupBarrierAllocation);
EXPECT_EQ(kernel.internalResidencyContainer.end(), notFoundIt);
auto patchPtr2 = *reinterpret_cast<uint64_t *>(ptrOffset(kernel.crossThreadData.get(), regionGroupBarrier.stateless));
size_t requestedNumberOfWorkgroups = groupCount.groupCountX * groupCount.groupCountY * groupCount.groupCountZ;

View File

@ -675,6 +675,115 @@ TEST_F(SetKernelArg, givenDisableSystemPointerKernelArgumentIsEnabledWhenBufferA
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, res);
}
// Verifies that setting an image argument keeps at most one implicit-args
// allocation per argument in argumentsResidencyContainer: the slot is created
// on first use, overwritten by the next bindless image and cleared to nullptr
// when an image without an implicit-args allocation is set.
HWTEST2_F(SetKernelArg, givenBindlessImageAndKernelFromNativeWhenSetArgImageCalledThenResidencyContainerHasSingleImplicitArgAllocation, ImageSupport) {
    using ImageType = WhiteBox<::L0::ImageCoreFamily<gfxCoreFamily>>;

    auto neoDevice = device->getNEODevice();
    const bool canRunImageTest = neoDevice->getRootDeviceEnvironment().getReleaseHelper() &&
                                 neoDevice->getDeviceInfo().imageSupport;
    if (!canRunImageTest) {
        GTEST_SKIP();
    }

    constexpr uint32_t imageArgIndex = 3;
    constexpr size_t unsetIndex = std::numeric_limits<size_t>::max();
    createKernel();

    // Populate the metadata payload of the image argument with distinct values.
    auto &imageArg = const_cast<NEO::ArgDescImage &>(kernel->kernelImmData->getDescriptor().payloadMappings.explicitArgs[imageArgIndex].as<NEO::ArgDescImage>());
    auto &metadata = imageArg.metadataPayload;
    metadata.imgWidth = 0x1c;
    metadata.imgHeight = 0x18;
    metadata.imgDepth = 0x14;
    metadata.arraySize = 0x10;
    metadata.numSamples = 0xc;
    metadata.channelDataType = 0x8;
    metadata.channelOrder = 0x4;
    metadata.numMipLevels = 0x0;
    metadata.flatWidth = 0x30;
    metadata.flatHeight = 0x2c;
    metadata.flatPitch = 0x28;
    metadata.flatBaseOffset = 0x20;

    // Both descriptors used below differ only in format layout and pNext chain.
    const auto make3dImageDesc = [](ze_image_format_layout_t layout, const void *pNext) {
        ze_image_desc_t imageDesc = {};
        imageDesc.pNext = pNext;
        imageDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
        imageDesc.type = ZE_IMAGE_TYPE_3D;
        imageDesc.format.layout = layout;
        imageDesc.format.type = ZE_IMAGE_FORMAT_TYPE_UINT;
        imageDesc.width = 11;
        imageDesc.height = 13;
        imageDesc.depth = 17;
        imageDesc.format.x = ZE_IMAGE_FORMAT_SWIZZLE_A;
        imageDesc.format.y = ZE_IMAGE_FORMAT_SWIZZLE_0;
        imageDesc.format.z = ZE_IMAGE_FORMAT_SWIZZLE_1;
        imageDesc.format.w = ZE_IMAGE_FORMAT_SWIZZLE_X;
        return imageDesc;
    };

    // The plain image is created before the bindless heaps helper is installed.
    ze_image_desc_t basicDesc = make3dImageDesc(ZE_IMAGE_FORMAT_LAYOUT_10_10_10_2, nullptr);
    auto imageBasic = std::make_unique<ImageType>();
    auto ret = imageBasic->initialize(device, &basicDesc);
    ASSERT_EQ(ZE_RESULT_SUCCESS, ret);
    auto imageBasicHandle = imageBasic->toHandle();

    // Ownership of the helper is handed to the root device environment.
    auto &rootDeviceEnvironment = neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()];
    rootDeviceEnvironment->bindlessHeapsHelper.reset(new MockBindlesHeapsHelper(neoDevice,
                                                                                neoDevice->getNumGenericSubDevices() > 1));

    ze_image_bindless_exp_desc_t bindlessExtDesc = {};
    bindlessExtDesc.stype = ZE_STRUCTURE_TYPE_BINDLESS_IMAGE_EXP_DESC;
    bindlessExtDesc.pNext = nullptr;
    bindlessExtDesc.flags = ZE_IMAGE_BINDLESS_EXP_FLAG_BINDLESS;

    ze_image_desc_t bindlessDesc = make3dImageDesc(ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8, &bindlessExtDesc);

    auto imageBindless1 = std::make_unique<ImageType>();
    ret = imageBindless1->initialize(device, &bindlessDesc);
    ASSERT_EQ(ZE_RESULT_SUCCESS, ret);
    auto imgImplicitArgsAlloc1 = imageBindless1->getImplicitArgsAllocation();
    auto imageBindlessHandle1 = imageBindless1->toHandle();

    auto imageBindless2 = std::make_unique<ImageType>();
    ret = imageBindless2->initialize(device, &bindlessDesc);
    ASSERT_EQ(ZE_RESULT_SUCCESS, ret);
    auto imgImplicitArgsAlloc2 = imageBindless2->getImplicitArgsAllocation();
    auto imageBindlessHandle2 = imageBindless2->toHandle();

    auto &residency = kernel->argumentsResidencyContainer;

    // No slot is reserved before the first image is set.
    EXPECT_EQ(unsetIndex, kernel->implicitArgsResidencyContainerIndices[imageArgIndex]);

    // First bindless image: its implicit-args allocation gets a slot and the slot is recorded.
    EXPECT_EQ(ZE_RESULT_SUCCESS, kernel->setArgImage(imageArgIndex, sizeof(imageBindless1.get()), &imageBindlessHandle1));

    auto foundIt = std::find(residency.begin(), residency.end(), imgImplicitArgsAlloc1);
    ASSERT_NE(residency.end(), foundIt);
    auto firstSlot = static_cast<size_t>(std::distance(residency.begin(), foundIt));
    EXPECT_EQ(firstSlot, kernel->implicitArgsResidencyContainerIndices[imageArgIndex]);
    EXPECT_EQ(imgImplicitArgsAlloc1, residency[kernel->implicitArgsResidencyContainerIndices[imageArgIndex]]);

    // Second bindless image: the same slot now holds the new allocation.
    EXPECT_EQ(ZE_RESULT_SUCCESS, kernel->setArgImage(imageArgIndex, sizeof(imageBindless2.get()), &imageBindlessHandle2));

    foundIt = std::find(residency.begin(), residency.end(), imgImplicitArgsAlloc2);
    ASSERT_NE(residency.end(), foundIt);
    auto secondSlot = static_cast<size_t>(std::distance(residency.begin(), foundIt));
    EXPECT_EQ(secondSlot, kernel->implicitArgsResidencyContainerIndices[imageArgIndex]);
    EXPECT_EQ(firstSlot, secondSlot);
    EXPECT_EQ(imgImplicitArgsAlloc2, residency[kernel->implicitArgsResidencyContainerIndices[imageArgIndex]]);

    // Plain image: the recorded slot is cleared.
    EXPECT_EQ(ZE_RESULT_SUCCESS, kernel->setArgImage(imageArgIndex, sizeof(imageBasic.get()), &imageBasicHandle));
    EXPECT_EQ(nullptr, residency[kernel->implicitArgsResidencyContainerIndices[imageArgIndex]]);
}
using KernelImmutableDataTests = Test<ModuleImmutableDataFixture>;
TEST_F(KernelImmutableDataTests, givenKernelInitializedWithNoPrivateMemoryThenPrivateMemoryIsNull) {

View File

@ -17,6 +17,7 @@
#include "shared/test/common/mocks/mock_csr.h"
#include "shared/test/common/mocks/mock_internal_allocation_storage.h"
#include "shared/test/common/mocks/mock_os_context.h"
#include "shared/test/common/mocks/mock_sync_buffer_handler.h"
#include "shared/test/common/mocks/mock_timestamp_container.h"
#include "shared/test/common/test_macros/hw_test.h"
#include "shared/test/common/utilities/base_object_utils.h"
@ -687,10 +688,6 @@ HWTEST_F(EnqueueHandlerTest, givenKernelUsingSyncBufferWhenEnqueuingKernelThenSs
GTEST_SKIP();
}
struct MockSyncBufferHandler : SyncBufferHandler {
using SyncBufferHandler::graphicsAllocation;
};
pDevice->allocateSyncBufferHandler();
size_t offset = 0;

View File

@ -6,8 +6,8 @@
*/
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/program/sync_buffer_handler.h"
#include "shared/source/release_helper/release_helper.h"
#include "shared/test/common/mocks/mock_sync_buffer_handler.h"
#include "shared/test/common/mocks/ult_device_factory.h"
#include "shared/test/common/test_macros/hw_test.h"
@ -22,13 +22,6 @@
using namespace NEO;
// Test-only subclass of SyncBufferHandler whose public section re-declares
// bufferSize, graphicsAllocation and usedBufferSize so tests can access them.
// NOTE(review): this file also includes
// "shared/test/common/mocks/mock_sync_buffer_handler.h", which declares a class
// with the same name and the same using-declarations — presumably this local
// copy is the pre-change definition superseded by that header; confirm that
// only one of the two definitions remains.
class MockSyncBufferHandler : public SyncBufferHandler {
public:
using SyncBufferHandler::bufferSize;
using SyncBufferHandler::graphicsAllocation;
using SyncBufferHandler::usedBufferSize;
};
class SyncBufferEnqueueHandlerTest : public EnqueueHandlerTest {
public:
void SetUp() override {

View File

@ -90,6 +90,7 @@ set(NEO_CORE_tests_mocks
${CMAKE_CURRENT_SOURCE_DIR}/mock_sip.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_submissions_aggregator.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_svm_manager.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_sync_buffer_handler.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_tbx_csr.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_timestamp_container.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_timestamp_packet.h

View File

@ -0,0 +1,17 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/program/sync_buffer_handler.h"
/// Shared test mock: derives from NEO::SyncBufferHandler and re-declares the
/// members below in its public section so unit tests can read and modify them.
class MockSyncBufferHandler : public NEO::SyncBufferHandler {
  public:
    using NEO::SyncBufferHandler::bufferSize;
    using NEO::SyncBufferHandler::graphicsAllocation;
    using NEO::SyncBufferHandler::usedBufferSize;
};