fix: Reuse private allocations during cmdList dispatch

Related-To: NEO-8201

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka
2023-08-10 15:40:21 +00:00
committed by Compute-Runtime-Automation
parent ba4867c3d0
commit 5807d512b3
20 changed files with 256 additions and 48 deletions

View File

@@ -21,6 +21,7 @@
#include <level_zero/zet_api.h>
#include <map>
#include <unordered_map>
#include <vector>
struct _ze_command_list_handle_t {};
@@ -355,7 +356,7 @@ struct CommandList : _ze_command_list_handle_t {
MOCKABLE_VIRTUAL void synchronizeEventList(uint32_t numWaitEvents, ze_event_handle_t *waitEventList);
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
std::unordered_map<uint32_t, NEO::GraphicsAllocation *> ownedPrivateAllocations;
std::vector<NEO::GraphicsAllocation *> patternAllocations;
std::vector<Kernel *> printfKernelContainer;

View File

@@ -309,7 +309,8 @@ struct CommandListCoreFamily : CommandListImp {
return (this->pipeControlMultiKernelEventSync && splitKernel) ||
compactL3FlushEvent(dcFlush);
}
void allocateKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread);
MOCKABLE_VIRTUAL void allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, std::unordered_map<uint32_t, NEO::GraphicsAllocation *> &privateAllocsToReuse);
virtual void allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread);
CmdListEventOperation estimateEventPostSync(Event *event, uint32_t operations);
void dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition);
void dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition);

View File

@@ -20,6 +20,7 @@
#include "shared/source/helpers/definitions/command_encoder_args.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/helpers/logical_state_helper.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/helpers/preamble.h"
@@ -54,6 +55,7 @@
#include "CL/cl.h"
#include <algorithm>
#include <unordered_map>
namespace L0 {
@@ -71,8 +73,8 @@ inline ze_result_t parseErrorCode(NEO::CommandContainer::ErrorCode returnValue)
template <GFXCORE_FAMILY gfxCoreFamily>
CommandListCoreFamily<gfxCoreFamily>::~CommandListCoreFamily() {
clearCommandsToPatch();
for (auto alloc : this->ownedPrivateAllocations) {
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
for (auto &alloc : this->ownedPrivateAllocations) {
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc.second);
}
this->ownedPrivateAllocations.clear();
for (auto &patternAlloc : this->patternAllocations) {
@@ -129,8 +131,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
this->returnPoints.clear();
}
for (auto alloc : this->ownedPrivateAllocations) {
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
for (auto &alloc : this->ownedPrivateAllocations) {
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc.second);
}
this->ownedPrivateAllocations.clear();
cmdListCurrentStartOffset = 0;
@@ -3172,16 +3174,27 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteToMemory(void *desc
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::allocateKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread) {
void CommandListCoreFamily<gfxCoreFamily>::allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread) {
L0::KernelImp *kernelImp = static_cast<KernelImp *>(kernel);
if (sizePerHwThread != 0U && kernelImp->getParentModule().shouldAllocatePrivateMemoryPerDispatch()) {
auto privateMemoryGraphicsAllocation = kernel->allocatePrivateMemoryGraphicsAllocation();
kernel->patchCrossthreadDataWithPrivateAllocation(privateMemoryGraphicsAllocation);
this->commandContainer.addToResidencyContainer(privateMemoryGraphicsAllocation);
this->ownedPrivateAllocations.push_back(privateMemoryGraphicsAllocation);
allocateOrReuseKernelPrivateMemory(kernel, sizePerHwThread, ownedPrivateAllocations);
}
}
// Binds a private-memory allocation to the kernel, taking it from privateAllocsToReuse
// (keyed by sizePerHwThread). When the map holds no allocation for that size, a new one is
// requested from the kernel and stored under the key so a later call with the same size
// gets the same allocation. The map only stores raw pointers; releasing them is left to
// whoever owns the map.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, std::unordered_map<uint32_t, NEO::GraphicsAllocation *> &privateAllocsToReuse) {
    auto *kernelImp = static_cast<L0::KernelImp *>(kernel);
    // operator[] value-initializes a missing entry to nullptr, so a single lookup
    // covers both the hit and the miss case.
    NEO::GraphicsAllocation *&cachedAlloc = privateAllocsToReuse[sizePerHwThread];
    if (cachedAlloc == nullptr) {
        cachedAlloc = kernelImp->allocatePrivateMemoryGraphicsAllocation();
    }
    kernelImp->patchAndMoveToResidencyContainerPrivateSurface(cachedAlloc);
}
template <GFXCORE_FAMILY gfxCoreFamily>
CmdListEventOperation CommandListCoreFamily<gfxCoreFamily>::estimateEventPostSync(Event *event, uint32_t operations) {
CmdListEventOperation ret;

View File

@@ -191,6 +191,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
void handleInOrderDependencyCounter();
bool isSkippingInOrderBarrierAllowed(ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) const;
bool useCounterAllocationForInOrderMode() const override { return true; }
void allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread) override;
MOCKABLE_VIRTUAL void checkAssert();
ComputeFlushMethodType computeFlushMethod = nullptr;

View File

@@ -1321,4 +1321,13 @@ void CommandListCoreFamilyImmediate<gfxCoreFamily>::setupFlushMethod(const NEO::
}
}
// Immediate command-list variant: when the kernel uses private memory (sizePerHwThread != 0)
// and its parent module requests per-dispatch private allocations, the allocation is taken
// from / stored in the map exposed by the command stream receiver rather than a map owned by
// this command list. CSR ownership is obtained first and held for the duration of the call.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamilyImmediate<gfxCoreFamily>::allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread) {
    if (sizePerHwThread == 0U) {
        return;
    }
    auto *kernelImp = static_cast<L0::KernelImp *>(kernel);
    if (!kernelImp->getParentModule().shouldAllocatePrivateMemoryPerDispatch()) {
        return;
    }
    // Keep the ownership object alive until the shared map has been read and updated.
    auto csrOwnership = this->csr->obtainUniqueOwnership();
    auto &csrPrivateAllocs = this->csr->getOwnedPrivateAllocations();
    this->allocateOrReuseKernelPrivateMemory(kernel, sizePerHwThread, csrPrivateAllocs);
}
} // namespace L0

View File

@@ -91,7 +91,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
kernel->patchGlobalOffset();
this->allocateKernelPrivateMemoryIfNeeded(kernel, kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize);
this->allocateOrReuseKernelPrivateMemoryIfNeeded(kernel, kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize);
if (!launchParams.isIndirect) {
kernel->setGroupCount(threadGroupDimensions->groupCountX,

View File

@@ -152,8 +152,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
auto kernelPreemptionMode = obtainKernelPreemptionMode(kernel);
kernel->patchGlobalOffset();
this->allocateKernelPrivateMemoryIfNeeded(kernel, kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize);
this->allocateOrReuseKernelPrivateMemoryIfNeeded(kernel, kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize);
if (launchParams.isIndirect && threadGroupDimensions) {
prepareIndirectParams(threadGroupDimensions);

View File

@@ -909,6 +909,11 @@ void KernelImp::setInlineSamplers() {
}
}
// Hands the given private-memory allocation to patchCrossthreadDataWithPrivateAllocation and
// then appends it to this kernel's residency container.
// NOTE(review): the append is unconditional, so calling this repeatedly on the same kernel
// adds one more entry per call (including duplicates) — confirm callers expect that.
void KernelImp::patchAndMoveToResidencyContainerPrivateSurface(NEO::GraphicsAllocation *alloc) {
    patchCrossthreadDataWithPrivateAllocation(alloc);
    residencyContainer.push_back(alloc);
}
ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
this->kernelImmData = module->getKernelImmutableData(desc->pKernelName);
if (this->kernelImmData == nullptr) {
@@ -1017,8 +1022,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
auto &kernelAttributes = kernelDescriptor.kernelAttributes;
if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation);
this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
this->patchAndMoveToResidencyContainerPrivateSurface(this->privateMemoryGraphicsAllocation);
}
this->createPrintfBuffer();

View File

@@ -90,6 +90,8 @@ struct KernelImp : Kernel {
void setInlineSamplers();
void patchAndMoveToResidencyContainerPrivateSurface(NEO::GraphicsAllocation *alloc);
ze_result_t initialize(const ze_kernel_desc_t *desc);
const uint8_t *getPerThreadData() const override { return perThreadDataForWholeThreadGroup; }

View File

@@ -15,6 +15,8 @@
#include "level_zero/core/test/unit_tests/mocks/mock_device.h"
#include "level_zero/core/test/unit_tests/white_box.h"
#include <unordered_map>
namespace NEO {
class GraphicsAllocation;
}
@@ -30,7 +32,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using BaseClass = ::L0::CommandListCoreFamily<gfxCoreFamily>;
using BaseClass::addFlushRequiredCommand;
using BaseClass::allocateKernelPrivateMemoryIfNeeded;
using BaseClass::allocateOrReuseKernelPrivateMemoryIfNeeded;
using BaseClass::appendBlitFill;
using BaseClass::appendCopyImageBlit;
using BaseClass::appendDispatchOffsetRegister;
@@ -500,12 +502,14 @@ struct MockCommandList : public CommandList {
};
template <GFXCORE_FAMILY gfxCoreFamily>
class MockAppendMemoryCopy : public CommandListCoreFamily<gfxCoreFamily> {
class MockCommandListCoreFamily : public CommandListCoreFamily<gfxCoreFamily> {
public:
using BaseClass = CommandListCoreFamily<gfxCoreFamily>;
using BaseClass::allocateOrReuseKernelPrivateMemoryIfNeeded;
using BaseClass::commandContainer;
using BaseClass::dcFlushSupport;
using BaseClass::device;
using BaseClass::ownedPrivateAllocations;
ADDMETHOD(appendMemoryCopyKernelWithGA, ze_result_t, false, ZE_RESULT_SUCCESS,
(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,
@@ -526,6 +530,19 @@ class MockAppendMemoryCopy : public CommandListCoreFamily<gfxCoreFamily> {
uint64_t srcOffset,
uint64_t size));
ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemory,
false,
(L0::Kernel * kernel,
uint32_t sizePerHwThread,
std::unordered_map<uint32_t, NEO::GraphicsAllocation *> &privateAllocsToReuse),
(kernel, sizePerHwThread, privateAllocsToReuse));
ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemoryIfNeeded,
false,
(L0::Kernel * kernel,
uint32_t sizePerHwThread),
(kernel, sizePerHwThread));
AlignedAllocationData getAlignedAllocationData(L0::Device *device, const void *buffer, uint64_t bufferSize, bool allowHostCopy) override {
return L0::CommandListCoreFamily<gfxCoreFamily>::getAlignedAllocationData(device, buffer, bufferSize, allowHostCopy);
}
@@ -610,6 +627,19 @@ class MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm
checkAssertCalled++;
}
ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemory,
false,
(L0::Kernel * kernel,
uint32_t sizePerHwThread,
std::unordered_map<uint32_t, NEO::GraphicsAllocation *> &privateAllocsToReuse),
(kernel, sizePerHwThread, privateAllocsToReuse));
ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemoryIfNeeded,
false,
(L0::Kernel * kernel,
uint32_t sizePerHwThread),
(kernel, sizePerHwThread));
uint32_t checkAssertCalled = 0;
bool callBaseExecute = false;

View File

@@ -48,6 +48,7 @@ constexpr inline MockModuleTranslationUnit *toMockPtr(L0::ModuleTranslationUnit
template <>
struct WhiteBox<::L0::Module> : public ::L0::ModuleImp {
using BaseClass = ::L0::ModuleImp;
using BaseClass::allocatePrivateMemoryPerDispatch;
using BaseClass::BaseClass;
using BaseClass::builtFromSPIRv;
using BaseClass::copyPatchedSegments;

View File

@@ -10,6 +10,7 @@
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_ostime.h"
#include "shared/test/common/mocks/ult_device_factory.h"
#include "shared/test/common/test_macros/hw_test.h"
@@ -3189,5 +3190,120 @@ HWTEST2_F(CommandListMappedTimestampTest, givenEventIsAddedToMappedEventListWhen
EXPECT_EQ(0u, commandList->peekMappedEventList().size());
}
// Test helper layered on top of a command-list mock (BaseMock). It records the address of the
// map handed to allocateOrReuseKernelPrivateMemory and then forwards the call to BaseMock, so a
// test can verify which container the "IfNeeded" entry point selected.
template <GFXCORE_FAMILY gfxCoreFamily, typename BaseMock>
class MockCommandListCoreFamilyIfPrivateNeeded : public BaseMock {
  public:
    void allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, std::unordered_map<uint32_t, GraphicsAllocation *> &privateAllocsToReuse) override {
        passedContainer = &privateAllocsToReuse;
        BaseMock::allocateOrReuseKernelPrivateMemory(kernel, sizePerHwThread, privateAllocsToReuse);
    }
    // Address of the map seen by the most recent call; stays nullptr until the hook is invoked,
    // so a test that reads it without the hook having run sees a well-defined value.
    std::unordered_map<uint32_t, GraphicsAllocation *> *passedContainer = nullptr;
};
// Regular command list: with per-dispatch private allocation disabled on the module, the
// "IfNeeded" entry point must not reach allocateOrReuseKernelPrivateMemory.
HWTEST2_F(CommandListCreate, givenPrivatePerDispatchDisabledWhenAllocatingPrivateMemoryThenAllocateIsNotCalled, IsAtLeastSkl) {
    using CmdListType = MockCommandListCoreFamilyIfPrivateNeeded<gfxCoreFamily, MockCommandListCoreFamily<gfxCoreFamily>>;
    constexpr uint32_t perThreadPrivateSize = 0x1000;

    auto commandList = std::make_unique<CmdListType>();
    commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true;

    Mock<Module> mockModule(this->device, nullptr);
    mockModule.allocatePrivateMemoryPerDispatch = false;
    Mock<KernelImp> mockKernel;
    mockKernel.module = &mockModule;

    commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, perThreadPrivateSize);

    EXPECT_EQ(commandList->allocateOrReuseKernelPrivateMemoryCalled, 0u);
}
// Regular command list: with per-dispatch private allocation enabled on the module and a
// non-zero per-thread size, the "IfNeeded" entry point calls allocateOrReuseKernelPrivateMemory
// exactly once.
HWTEST2_F(CommandListCreate, givenPrivatePerDispatchEnabledWhenAllocatingPrivateMemoryThenAllocateIsCalled, IsAtLeastSkl) {
    using CmdListType = MockCommandListCoreFamilyIfPrivateNeeded<gfxCoreFamily, MockCommandListCoreFamily<gfxCoreFamily>>;
    constexpr uint32_t perThreadPrivateSize = 0x1000;

    auto commandList = std::make_unique<CmdListType>();
    commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true;

    Mock<Module> mockModule(this->device, nullptr);
    mockModule.allocatePrivateMemoryPerDispatch = true;
    Mock<KernelImp> mockKernel;
    mockKernel.module = &mockModule;

    commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, perThreadPrivateSize);

    EXPECT_EQ(commandList->allocateOrReuseKernelPrivateMemoryCalled, 1u);
}
// Regular command list: the map forwarded to allocateOrReuseKernelPrivateMemory must be the
// command list's own ownedPrivateAllocations member.
HWTEST2_F(CommandListCreate, givenPrivatePerDispatchEnabledWhenAllocatingPrivateMemoryThenCmdListMaprIsPassed, IsAtLeastSkl) {
    using CmdListType = MockCommandListCoreFamilyIfPrivateNeeded<gfxCoreFamily, MockCommandListCoreFamily<gfxCoreFamily>>;
    constexpr uint32_t perThreadPrivateSize = 0x1000;

    auto commandList = std::make_unique<CmdListType>();
    commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true;

    Mock<Module> mockModule(this->device, nullptr);
    mockModule.allocatePrivateMemoryPerDispatch = true;
    Mock<KernelImp> mockKernel;
    mockKernel.module = &mockModule;

    commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, perThreadPrivateSize);

    EXPECT_EQ(commandList->passedContainer, &commandList->ownedPrivateAllocations);
}
// Immediate command list: with per-dispatch private allocation disabled on the module, the
// "IfNeeded" entry point returns before touching the CSR and never calls
// allocateOrReuseKernelPrivateMemory (no CSR is installed in this test).
HWTEST2_F(CommandListCreate, givenImmediateListAndPrivatePerDispatchDisabledWhenAllocatingPrivateMemoryCalledThenAllocateIsNotCalled, IsAtLeastSkl) {
    using CmdListType = MockCommandListCoreFamilyIfPrivateNeeded<gfxCoreFamily, MockCommandListImmediateHw<gfxCoreFamily>>;
    constexpr uint32_t perThreadPrivateSize = 0x1000;

    auto commandList = std::make_unique<CmdListType>();
    commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true;

    Mock<Module> mockModule(this->device, nullptr);
    mockModule.allocatePrivateMemoryPerDispatch = false;
    Mock<KernelImp> mockKernel;
    mockKernel.module = &mockModule;

    commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, perThreadPrivateSize);

    EXPECT_EQ(commandList->allocateOrReuseKernelPrivateMemoryCalled, 0u);
}
// Immediate command list: with per-dispatch private allocation enabled and a CSR installed,
// the "IfNeeded" entry point calls allocateOrReuseKernelPrivateMemory exactly once.
HWTEST2_F(CommandListCreate, givenImmediateListAndPrivatePerDispatchEnabledWhenAllocatingPrivateMemoryThenAllocateIsCalled, IsAtLeastSkl) {
    using CmdListType = MockCommandListCoreFamilyIfPrivateNeeded<gfxCoreFamily, MockCommandListImmediateHw<gfxCoreFamily>>;
    constexpr uint32_t perThreadPrivateSize = 0x1000;

    auto commandList = std::make_unique<CmdListType>();
    commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true;

    // The immediate variant reads its reuse map from the CSR, so one has to be present.
    MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
    commandList->csr = &mockCommandStreamReceiver;

    Mock<Module> mockModule(this->device, nullptr);
    mockModule.allocatePrivateMemoryPerDispatch = true;
    Mock<KernelImp> mockKernel;
    mockKernel.module = &mockModule;

    commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, perThreadPrivateSize);

    EXPECT_EQ(commandList->allocateOrReuseKernelPrivateMemoryCalled, 1u);
}
// Immediate command list: the map forwarded to allocateOrReuseKernelPrivateMemory must be the
// one returned by the CSR's getOwnedPrivateAllocations().
HWTEST2_F(CommandListCreate, givenImmediateListAndPrivatePerDispatchEnabledWhenAllocatingPrivateMemoryThenCsrMapIsPassed, IsAtLeastSkl) {
    using CmdListType = MockCommandListCoreFamilyIfPrivateNeeded<gfxCoreFamily, MockCommandListImmediateHw<gfxCoreFamily>>;
    constexpr uint32_t perThreadPrivateSize = 0x1000;

    auto commandList = std::make_unique<CmdListType>();
    commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true;

    MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
    commandList->csr = &mockCommandStreamReceiver;

    Mock<Module> mockModule(this->device, nullptr);
    mockModule.allocatePrivateMemoryPerDispatch = true;
    Mock<KernelImp> mockKernel;
    mockKernel.module = &mockModule;

    commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, perThreadPrivateSize);

    EXPECT_EQ(commandList->passedContainer, &mockCommandStreamReceiver.getOwnedPrivateAllocations());
}
// When the reuse map already holds an allocation under the requested per-thread size, that
// allocation is handed to the kernel (it shows up in the kernel's residency container) and
// the map is left with its single entry.
HWTEST2_F(CommandListCreate, givenCmdListWhenAllocateOrReuseCalledForSizeThatIsStoredInMapThenItsReused, IsAtLeastSkl) {
    auto commandList = std::make_unique<MockCommandListCoreFamily<gfxCoreFamily>>();
    commandList->allocateOrReuseKernelPrivateMemoryCallBase = true;
    commandList->device = this->device;
    uint32_t sizePerHwThread = 0x1000;
    auto mockMem = std::make_unique<uint8_t[]>(0x1000);
    Mock<Module> mockModule(this->device, nullptr);
    Mock<KernelImp> mockKernel;
    const_cast<uint32_t &>(mockKernel.kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) = 0x1000;
    mockKernel.module = &mockModule;
    MockGraphicsAllocation mockGA(mockMem.get(), 2 * sizePerHwThread * this->neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
    std::unordered_map<uint32_t, GraphicsAllocation *> mapForReuse;
    mapForReuse[sizePerHwThread] = &mockGA;
    commandList->allocateOrReuseKernelPrivateMemory(&mockKernel, sizePerHwThread, mapForReuse);
    // Stop here with a clean failure instead of indexing an empty container.
    ASSERT_FALSE(mockKernel.residencyContainer.empty());
    EXPECT_EQ(mockKernel.residencyContainer[0], &mockGA);
    // A hit must not add entries to the reuse map.
    EXPECT_EQ(mapForReuse.size(), 1u);
}
// When the requested per-thread size has no entry in the reuse map, the allocation stored
// under a different size is not used; the kernel receives a different allocation, which this
// test frees itself because the local map does not release it.
HWTEST2_F(CommandListCreate, givenNewSizeDifferentThanSizesInMapWhenAllocatingPrivateMemoryThenNewAllocationIsCreated, IsAtLeastSkl) {
    auto commandList = std::make_unique<MockCommandListCoreFamily<gfxCoreFamily>>();
    commandList->allocateOrReuseKernelPrivateMemoryCallBase = true;
    commandList->device = this->device;
    uint32_t sizePerHwThread = 0x1000;
    auto mockMem = std::make_unique<uint8_t[]>(0x1000);
    Mock<Module> mockModule(this->device, nullptr);
    Mock<KernelImp> mockKernel;
    const_cast<uint32_t &>(mockKernel.kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) = 0x1000;
    mockKernel.module = &mockModule;
    MockGraphicsAllocation mockGA(mockMem.get(), sizePerHwThread * this->neoDevice->getDeviceInfo().computeUnitsUsedForScratch / 2);
    std::unordered_map<uint32_t, GraphicsAllocation *> mapForReuse;
    mapForReuse[sizePerHwThread] = &mockGA;
    commandList->allocateOrReuseKernelPrivateMemory(&mockKernel, sizePerHwThread / 2, mapForReuse);
    // Stop here with a clean failure instead of indexing (and freeing) an element of an
    // empty container.
    ASSERT_FALSE(mockKernel.residencyContainer.empty());
    EXPECT_NE(mockKernel.residencyContainer[0], &mockGA);
    neoDevice->getMemoryManager()->freeGraphicsMemory(mockKernel.residencyContainer[0]);
}
} // namespace ult
} // namespace L0

View File

@@ -1386,7 +1386,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichTogethe
auto &kernelImmDatas = proxyModuleImpl->getKernelImmDatas();
for (size_t i = 0; i < kernelsNb; i++) {
auto &kernelDesc = const_cast<KernelDescriptor &>(kernelImmDatas[i]->getDescriptor());
kernelDesc.kernelAttributes.perHwThreadPrivateMemorySize = overAllocMinSize;
kernelDesc.kernelAttributes.perHwThreadPrivateMemorySize = overAllocMinSize + static_cast<uint32_t>(i * MemoryConstants::cacheLineSize);
kernelDesc.kernelAttributes.flags.usesPrintf = false;
kernelDesc.kernelMetadata.kernelName = kernelNames[i];
}
@@ -1405,8 +1405,8 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichTogethe
EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), i);
kernels.push_back(this->createKernelWithName(kernelNames[i]));
// This function is called by appendLaunchKernelWithParams
pCommandList->allocateKernelPrivateMemoryIfNeeded(kernels[i].get(),
kernels[i]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize);
pCommandList->allocateOrReuseKernelPrivateMemoryIfNeeded(kernels[i].get(),
kernels[i]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize);
EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), i + 1);
}
}
@@ -1442,8 +1442,8 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExc
EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u);
kernels.push_back(this->createKernelWithName(kernelNames[i]));
// This function is called by appendLaunchKernelWithParams
pCommandList->allocateKernelPrivateMemoryIfNeeded(kernels[i].get(),
kernels[i]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize);
pCommandList->allocateOrReuseKernelPrivateMemoryIfNeeded(kernels[i].get(),
kernels[i]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize);
EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u);
}
}

View File

@@ -22,7 +22,7 @@ namespace ult {
using AppendMemoryCopy = Test<DeviceFixture>;
HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionCalledThenTwoNewAllocationAreAddedToHostMapPtr, IsAtLeastSkl) {
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1234);
void *dstPtr = reinterpret_cast<void *>(0x2345);
@@ -33,7 +33,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionC
}
HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCopyRegion2DCalledThenSrcDstPointersArePageAligned, IsAtLeastSkl) {
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1234);
void *dstPtr = reinterpret_cast<void *>(0x2345);
@@ -46,7 +46,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCo
}
HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCopyRegion3DCalledThenSrcDstPointersArePageAligned, IsAtLeastSkl) {
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1234);
void *dstPtr = reinterpret_cast<void *>(0x2345);
@@ -59,7 +59,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCo
}
HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion2DCalledThenSrcDstNotZeroOffsetsArePassed, IsAtLeastSkl) {
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1233);
void *dstPtr = reinterpret_cast<void *>(0x2345);
@@ -71,7 +71,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemo
}
HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion3DCalledThenSrcDstNotZeroOffsetsArePassed, IsAtLeastSkl) {
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1233);
void *dstPtr = reinterpret_cast<void *>(0x2345);
@@ -83,7 +83,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemo
}
HWTEST2_F(AppendMemoryCopy, givenCommandListAndAlignedHostPointersWhenBlitMemoryCopyRegion3DCalledThenSrcDstZeroOffsetsArePassed, IsAtLeastSkl) {
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
void *srcPtr = alignDown(reinterpret_cast<void *>(0x1233), NEO::EncodeSurfaceState<FamilyType>::getSurfaceBaseAddressAlignment());
void *dstPtr = alignDown(reinterpret_cast<void *>(0x2345), NEO::EncodeSurfaceState<FamilyType>::getSurfaceBaseAddressAlignment());
@@ -98,7 +98,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndDestinationPtrOffsetWhenMemor
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT;
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
constexpr size_t allocSize = 4096;
@@ -132,7 +132,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndSourcePtrOffsetWhenMemoryCopy
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT;
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
constexpr size_t allocSize = 4096;
@@ -166,7 +166,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndDestinationPtrOffsetWhenMemor
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT;
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
constexpr size_t allocSize = 4096;
@@ -201,7 +201,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndSourcePtrOffsetWhenMemoryCopy
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT;
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
constexpr size_t allocSize = 4096;
@@ -236,7 +236,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndDestinationPtrOffsetWhenMemor
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT;
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
constexpr size_t allocSize = 4096;
@@ -270,7 +270,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndSourcePtrOffsetWhenMemoryCopy
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT;
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u);
constexpr size_t allocSize = 4096;
@@ -303,7 +303,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndSourcePtrOffsetWhenMemoryCopy
HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionCalledThenPipeControlWithDcFlushAdded, IsAtLeastSkl) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1234);
void *dstPtr = reinterpret_cast<void *>(0x2345);
@@ -610,7 +610,7 @@ HWTEST2_F(AppendMemoryCopy, givenSyncModeImmediateCommandListWhenAppendingMemory
HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyCalledThenPipeControlWithDcFlushAdded, IsAtLeastSkl) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
MockAppendMemoryCopy<gfxCoreFamily> cmdList;
MockCommandListCoreFamily<gfxCoreFamily> cmdList;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1234);
void *dstPtr = reinterpret_cast<void *>(0x2345);
@@ -646,7 +646,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListWhenTimestampPassedToMemoryCopyT
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using MI_FLUSH_DW = typename GfxFamily::MI_FLUSH_DW;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.initialize(device, NEO::EngineGroupType::Copy, 0u);
void *srcPtr = reinterpret_cast<void *>(0x1234);
void *dstPtr = reinterpret_cast<void *>(0x2345);
@@ -701,7 +701,7 @@ HWTEST2_F(AppendMemoryCopy,
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -759,7 +759,7 @@ HWTEST2_F(AppendMemoryCopy,
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);

View File

@@ -90,7 +90,7 @@ void testSingleTileAppendMemoryCopyThreeKernels(CopyTestInput &input, TestExpect
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -175,7 +175,7 @@ void testSingleTileAppendMemoryCopyThreeKernelsAndL3Flush(CopyTestInput &input,
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -283,7 +283,7 @@ void testSingleTileAppendMemoryCopySingleKernel(CopyTestInput &input, TestExpect
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -355,7 +355,7 @@ void testSingleTileAppendMemoryCopySingleKernelAndL3Flush(CopyTestInput &input,
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -509,7 +509,7 @@ void testMultiTileAppendMemoryCopyThreeKernels(CopyTestInput &input, TestExpecte
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -600,7 +600,7 @@ void testMultiTileAppendMemoryCopyThreeKernelsAndL3Flush(CopyTestInput &input, T
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -726,7 +726,7 @@ void testMultiTileAppendMemoryCopySingleKernel(CopyTestInput &input, TestExpecte
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -802,7 +802,7 @@ void testMultiTileAppendMemoryCopySingleKernelAndL3Flush(CopyTestInput &input, T
using OPERATION = typename POSTSYNC_DATA::OPERATION;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
MockAppendMemoryCopy<gfxCoreFamily> commandList;
MockCommandListCoreFamily<gfxCoreFamily> commandList;
commandList.appendMemoryCopyKernelWithGACallBase = true;
commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u);

View File

@@ -389,6 +389,10 @@ void CommandStreamReceiver::cleanupResources() {
getMemoryManager()->freeGraphicsMemory(globalStatelessHeapAllocation);
globalStatelessHeapAllocation = nullptr;
}
for (auto &alloc : ownedPrivateAllocations) {
getMemoryManager()->freeGraphicsMemory(alloc.second);
}
ownedPrivateAllocations.clear();
}
WaitStatus CommandStreamReceiver::waitForCompletionWithTimeout(const WaitParams &params, TaskCountType taskCountToWait) {
@@ -567,6 +571,9 @@ ResidencyContainer &CommandStreamReceiver::getResidencyAllocations() {
// Gives callers mutable access to this receiver's eviction-allocations container.
ResidencyContainer &CommandStreamReceiver::getEvictionAllocations() {
    auto &evictionList = evictionAllocations;
    return evictionList;
}
std::unordered_map<uint32_t, GraphicsAllocation *> &CommandStreamReceiver::getOwnedPrivateAllocations() {
return this->ownedPrivateAllocations;
}
// Implementation at this level does not use kernelName; it always returns a status
// whose two fields are both false.
AubSubCaptureStatus CommandStreamReceiver::checkAndActivateAubSubCapture(const std::string &kernelName) {
    return {false, false};
}

View File

@@ -118,6 +118,7 @@ class CommandStreamReceiver {
ResidencyContainer &getResidencyAllocations();
ResidencyContainer &getEvictionAllocations();
std::unordered_map<uint32_t, GraphicsAllocation *> &getOwnedPrivateAllocations();
virtual GmmPageTableMngr *createPageTableManager() { return nullptr; }
bool needsPageTableManager() const;
@@ -460,6 +461,8 @@ class CommandStreamReceiver {
ResidencyContainer residencyAllocations;
ResidencyContainer evictionAllocations;
std::unordered_map<uint32_t, GraphicsAllocation *> ownedPrivateAllocations;
MutexType ownershipMutex;
MutexType hostPtrSurfaceCreationMutex;
ExecutionEnvironment &executionEnvironment;

View File

@@ -35,6 +35,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
using CommandStreamReceiver::baseWaitFunction;
using CommandStreamReceiver::checkForNewResources;
using CommandStreamReceiver::checkImplicitFlushForGpuIdle;
using CommandStreamReceiver::cleanupResources;
using CommandStreamReceiver::CommandStreamReceiver;
using CommandStreamReceiver::globalFenceAllocation;
using CommandStreamReceiver::gpuHangCheckPeriod;

View File

@@ -69,3 +69,13 @@
} \
return funcName##Result; \
}
// Generates a mock override for a void-returning virtual method named `funcName`.
// Emits two data members next to the override:
//   funcName##CallBase - bool, initialized from `callBase`
//   funcName##Called   - uint32_t call counter, starts at 0
// Every invocation increments the counter; the call is forwarded to
// BaseClass::funcName only while the CallBase flag is true.
// `funcParams` is the parenthesized parameter list of the override and
// `invokeParams` the parenthesized argument list used when forwarding.
// The expanding class must provide a `BaseClass` name that resolves to the base type.
#define ADDMETHOD_VOIDRETURN(funcName, callBase, funcParams, invokeParams) \
    bool funcName##CallBase = callBase;                                    \
    uint32_t funcName##Called = 0u;                                        \
    void funcName funcParams override {                                    \
        ++funcName##Called;                                                \
        if (!funcName##CallBase) {                                         \
            return;                                                        \
        }                                                                  \
        BaseClass::funcName invokeParams;                                  \
    }

View File

@@ -4485,3 +4485,13 @@ HWTEST2_F(CommandStreamReceiverHwTest,
EXPECT_EQ(nullptr, frontEndCmd);
EXPECT_FALSE(commandStreamReceiver.getMediaVFEStateDirty());
}
// Verifies that cleanupResources() leaves the CSR's owned-private-allocations map empty.
HWTEST_F(CommandStreamReceiverTest, givenCsrWhenCleanUpResourcesThenOwnedPrivateAllocationsAreFreed) {
    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    auto &ownedAllocations = csr.getOwnedPrivateAllocations();

    // Ownership of the mock allocation is handed over to the map (hence release());
    // cleanupResources() is the one expected to free it.
    auto allocation = std::make_unique<MockGraphicsAllocation>();
    ownedAllocations.emplace(0x100, allocation.release());

    csr.cleanupResources();

    EXPECT_EQ(ownedAllocations.size(), 0u);
}