Add option to allocate private mem per dispatch

Signed-off-by: Jaroslaw Chodor <jaroslaw.chodor@intel.com>
Signed-off-by: Krystian Chmielewski <krystian.chmielewski@intel.com>
This commit is contained in:
Jaroslaw Chodor
2021-07-23 19:23:42 +02:00
committed by Compute-Runtime-Automation
parent cf4972d90e
commit 7c6c45f5b5
12 changed files with 137 additions and 20 deletions

View File

@@ -259,6 +259,8 @@ struct CommandList : _ze_command_list_handle_t {
NEO::StreamProperties requiredStreamState{};
NEO::StreamProperties finalStreamState{};
CommandsToPatch commandsToPatch{};
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
};
using CommandListAllocatorFn = CommandList *(*)(uint32_t);

View File

@@ -58,6 +58,10 @@ inline ze_result_t parseErrorCode(NEO::ErrorCode returnValue) {
template <GFXCORE_FAMILY gfxCoreFamily>
CommandListCoreFamily<gfxCoreFamily>::~CommandListCoreFamily() {
    clearCommandsToPatch();

    // Free every private-memory allocation recorded in ownedPrivateAllocations,
    // in insertion order, then drop the stale pointers.
    auto &ownedAllocations = this->ownedPrivateAllocations;
    for (auto it = ownedAllocations.begin(); it != ownedAllocations.end(); ++it) {
        device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(*it);
    }
    ownedAllocations.clear();
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -98,6 +102,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
programThreadArbitrationPolicy(device);
}
for (auto alloc : this->ownedPrivateAllocations) {
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
}
this->ownedPrivateAllocations.clear();
return ZE_RESULT_SUCCESS;
}

View File

@@ -41,6 +41,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
bool isPredicate,
bool isCooperative) {
const auto kernel = Kernel::fromHandle(hKernel);
const auto &kernelDescriptor = kernel->getKernelDescriptor();
UNRECOVERABLE_IF(kernel == nullptr);
appendEventForProfiling(hEvent, true);
const auto functionImmutableData = kernel->getImmutableData();
@@ -56,6 +57,14 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
kernel->patchGlobalOffset();
if (kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize != 0U &&
nullptr == kernel->getPrivateMemoryGraphicsAllocation()) {
auto privateMemoryGraphicsAllocation = kernel->allocatePrivateMemoryGraphicsAllocation();
kernel->patchCrossthreadDataWithPrivateAllocation(privateMemoryGraphicsAllocation);
this->commandContainer.addToResidencyContainer(privateMemoryGraphicsAllocation);
this->ownedPrivateAllocations.push_back(privateMemoryGraphicsAllocation);
}
if (!isIndirect) {
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
pThreadGroupDimensions->groupCountY,

View File

@@ -137,6 +137,11 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
virtual bool usesSyncBuffer() = 0;
virtual void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0;
virtual NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() = 0;
virtual void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) = 0;
virtual NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() = 0;
Kernel() = default;
Kernel(const Kernel &) = delete;
Kernel(Kernel &&) = delete;

View File

@@ -686,6 +686,36 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties)
return ZE_RESULT_SUCCESS;
}
// Allocates a PRIVATE_SURFACE graphics allocation sized for this kernel's per-HW-thread
// private memory requirement and returns it. The caller receives the raw pointer; this
// function does not store it anywhere. Aborts (UNRECOVERABLE_IF) when the computed size
// is zero or when the memory manager returns nullptr.
NEO::GraphicsAllocation *KernelImp::allocatePrivateMemoryGraphicsAllocation() {
    const auto perHwThreadSize = kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize;
    auto neoDevice = module->getDevice()->getNEODevice();
    const auto computeUnits = neoDevice->getDeviceInfo().computeUnitsUsedForScratch;

    auto surfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(perHwThreadSize, computeUnits);
    UNRECOVERABLE_IF(surfaceSize == 0);

    auto allocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(
        {neoDevice->getRootDeviceIndex(), surfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE, neoDevice->getDeviceBitfield()});
    UNRECOVERABLE_IF(allocation == nullptr);

    return allocation;
}
// Patches this kernel's cross-thread data and surface state heap so that the implicit
// private-memory argument refers to `privateAllocation`.
//
// privateAllocation - allocation to patch in; must not be nullptr.
//
// The allocation passed by the caller is the one that gets patched. Previously the body
// ignored the parameter and used the kernel-owned member instead, which is nullptr when
// private memory is allocated per dispatch, so the call dereferenced a null pointer.
//
// The allocation is added to the kernel's residency container only when it is the
// kernel-owned one (this->privateMemoryGraphicsAllocation). Allocations handed in by
// another owner are not recorded here, so the kernel never keeps a pointer whose
// lifetime it does not control.
void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) {
    UNRECOVERABLE_IF(privateAllocation == nullptr);

    auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
    auto neoDevice = module->getDevice()->getNEODevice();

    ArrayRef<uint8_t> crossThreadDataArrayRef = ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize);
    ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);

    patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                             static_cast<uintptr_t>(privateAllocation->getGpuAddressToPatch()),
                             *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
                             *neoDevice, kernelAttributes.flags.useGlobalAtomics);

    if (privateAllocation == this->privateMemoryGraphicsAllocation) {
        this->residencyContainer.push_back(privateAllocation);
    }
}
ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
this->kernelImmData = module->getKernelImmutableData(desc->pKernelName);
if (this->kernelImmData == nullptr) {
@@ -776,25 +806,9 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
auto neoDevice = module->getDevice()->getNEODevice();
if (kernelAttributes.perHwThreadPrivateMemorySize != 0) {
auto privateSurfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(kernelAttributes.perHwThreadPrivateMemorySize,
neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
UNRECOVERABLE_IF(privateSurfaceSize == 0);
this->privateMemoryGraphicsAllocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(
{neoDevice->getRootDeviceIndex(), privateSurfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE, neoDevice->getDeviceBitfield()});
UNRECOVERABLE_IF(this->privateMemoryGraphicsAllocation == nullptr);
ArrayRef<uint8_t> crossThredDataArrayRef = ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize);
ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);
patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
static_cast<uintptr_t>(privateMemoryGraphicsAllocation->getGpuAddressToPatch()),
*privateMemoryGraphicsAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
*neoDevice, kernelAttributes.flags.useGlobalAtomics);
this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation);
}
this->createPrintfBuffer();

View File

@@ -143,6 +143,13 @@ struct KernelImp : Kernel {
return kernelHasIndirectAccess;
}
NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() override;
void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) override;
// Returns the private-memory allocation stored on this kernel (may be nullptr).
NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() override {
    return this->privateMemoryGraphicsAllocation;
}
protected:
KernelImp() = default;

View File

@@ -50,6 +50,8 @@ struct Module : _ze_module_handle_t {
virtual const std::vector<std::unique_ptr<KernelImmutableData>> &getKernelImmutableDataVector() const = 0;
virtual uint32_t getMaxGroupSize() const = 0;
virtual bool isDebugEnabled() const = 0;
virtual bool shouldAllocatePrivateMemoryPerDispatch() const = 0;
virtual void checkIfPrivateMemoryPerDispatchIsNeeded() = 0;
Module() = default;
Module(const Module &) = delete;

View File

@@ -13,6 +13,7 @@
#include "shared/source/device_binary_format/device_binary_formats.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/helpers/string.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
@@ -383,6 +384,8 @@ bool ModuleImp::initialize(const ze_module_desc_t *desc, NEO::Device *neoDevice)
}
this->maxGroupSize = static_cast<uint32_t>(this->translationUnit->device->getNEODevice()->getDeviceInfo().maxWorkGroupSize);
checkIfPrivateMemoryPerDispatchIsNeeded();
if (debugEnabled) {
if (device->getSourceLevelDebugger()) {
for (auto kernelInfo : this->translationUnit->programInfo.kernelInfos) {
@@ -642,6 +645,24 @@ void ModuleImp::verifyDebugCapabilities() {
debugEnabled = debugCapabilities;
}
void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
size_t modulePrivateMemorySize = 0;
for (auto &kernelImmData : this->kernelImmDatas) {
if (0 == kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) {
continue;
}
auto kernelPrivateMemorySize = NEO::KernelHelper::getPrivateSurfaceSize(kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize,
this->device->getNEODevice()->getDeviceInfo().computeUnitsUsedForScratch);
modulePrivateMemorySize += kernelPrivateMemorySize;
}
this->allocatePrivateMemoryPerDispatch = false;
if (modulePrivateMemorySize > 0U) {
auto globalMemorySize = device->getNEODevice()->getRootDevice()->getGlobalMemorySize(static_cast<uint32_t>(device->getNEODevice()->getDeviceBitfield().to_ulong()));
this->allocatePrivateMemoryPerDispatch = modulePrivateMemorySize > globalMemorySize;
}
}
ze_result_t ModuleImp::getProperties(ze_module_properties_t *pModuleProperties) {
pModuleProperties->flags = 0;

View File

@@ -114,6 +114,10 @@ struct ModuleImp : public Module {
bool isDebugEnabled() const override;
bool shouldAllocatePrivateMemoryPerDispatch() const override {
return allocatePrivateMemoryPerDispatch;
}
// Non-owning access to the module's translation unit.
ModuleTranslationUnit *getTranslationUnit() {
    return translationUnit.get();
}
@@ -121,6 +125,7 @@ struct ModuleImp : public Module {
protected:
void copyPatchedSegments(const NEO::Linker::PatchableSegments &isaSegmentsForPatching);
void verifyDebugCapabilities();
void checkIfPrivateMemoryPerDispatchIsNeeded() override;
Device *device = nullptr;
PRODUCT_FAMILY productFamily{};
@@ -132,6 +137,7 @@ struct ModuleImp : public Module {
NEO::Linker::RelocatedSymbolsMap symbols;
bool debugEnabled = false;
bool isFullyLinked = false;
bool allocatePrivateMemoryPerDispatch = true;
ModuleType type;
NEO::Linker::UnresolvedExternals unresolvedExternalsInfo{};
std::set<NEO::GraphicsAllocation *> importedSymbolAllocations{};

View File

@@ -102,6 +102,12 @@ struct ModuleImmutableDataFixture : public DeviceFixture {
const KernelImmutableData *getKernelImmutableData(const char *functionName) const override {
return mockKernelImmData;
}
// Test hook: copies the per-HW-thread private memory size from the mock immutable data
// into the first real kernel descriptor, then runs the production check on that value.
void checkIfPrivateMemoryPerDispatchIsNeeded() override {
    const auto requestedSize = mockKernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize;
    // getDescriptor() returns a const reference; the cast is needed to overwrite the size.
    auto &firstDescriptor = const_cast<KernelDescriptor &>(kernelImmDatas[0]->getDescriptor());
    firstDescriptor.kernelAttributes.perHwThreadPrivateMemorySize = requestedSize;
    ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded();
}
MockImmutableData *mockKernelImmData = nullptr;
};

View File

@@ -571,6 +571,41 @@ HWTEST_F(KernelImmutableDataTests, givenKernelInitializedWithPrivateMemoryThenCo
EXPECT_EQ(sizeContainerWithoutPrivateMemory + 1u, sizeContainerWithPrivateMemory);
}
// Verifies that when a kernel requests the maximum per-HW-thread private memory size,
// the module reports that private memory must be allocated per dispatch and the kernel
// created from it holds no private-memory allocation of its own.
HWTEST_F(KernelImmutableDataTests, givenKernelWithPrivateMemoryBiggerThanGlobalMemoryThenPrivateMemoryIsNotAllocated) {
    // Load the prebuilt kernel binary used as module input.
    std::string binaryPath;
    retrieveBinaryKernelFilenameNoRevision(binaryPath, binaryFilename + "_", ".bin");

    size_t binarySize = 0;
    auto binary = loadDataFromFile(binaryPath.c_str(), binarySize);
    ASSERT_NE(0u, binarySize);
    ASSERT_NE(nullptr, binary);

    ze_module_desc_t moduleDesc = {};
    moduleDesc.format = ZE_MODULE_FORMAT_NATIVE;
    moduleDesc.pInputModule = reinterpret_cast<const uint8_t *>(binary.get());
    moduleDesc.inputSize = binarySize;

    ModuleBuildLog *moduleBuildLog = nullptr;

    // Request the largest representable per-thread size.
    uint32_t perHwThreadPrivateMemorySizeRequested = std::numeric_limits<uint32_t>::max();

    auto mockKernelImmData = std::make_unique<MockImmutableData>(perHwThreadPrivateMemorySizeRequested);
    auto module = std::make_unique<MockModule>(device,
                                               moduleBuildLog,
                                               ModuleType::User,
                                               perHwThreadPrivateMemorySizeRequested,
                                               mockKernelImmData.get());

    EXPECT_TRUE(module->initialize(&moduleDesc, device->getNEODevice()));
    EXPECT_TRUE(module->shouldAllocatePrivateMemoryPerDispatch());

    auto kernel = std::make_unique<ModuleImmutableDataFixture::MockKernel>(module.get());
    createKernel(kernel.get());

    EXPECT_EQ(nullptr, kernel->getPrivateMemoryGraphicsAllocation());
}
class KernelDescriptorRTCallsTrue : public NEO::KernelDescriptor {
bool hasRTCalls() const override {
return true;

View File

@@ -121,6 +121,8 @@ class Device : public ReferenceTrackedObject<Device> {
void initializeRayTracing();
void reduceMaxMemAllocSize();
virtual uint64_t getGlobalMemorySize(uint32_t deviceBitfield) const;
protected:
Device() = delete;
Device(ExecutionEnvironment *executionEnvironment);
@@ -145,7 +147,6 @@ class Device : public ReferenceTrackedObject<Device> {
MOCKABLE_VIRTUAL std::unique_ptr<CommandStreamReceiver> createCommandStreamReceiver() const;
MOCKABLE_VIRTUAL SubDevice *createSubDevice(uint32_t subDeviceIndex);
MOCKABLE_VIRTUAL SubDevice *createEngineInstancedSubDevice(uint32_t subDeviceIndex, aub_stream::EngineType engineType);
virtual uint64_t getGlobalMemorySize(uint32_t deviceBitfield) const;
double getPercentOfGlobalMemoryAvailable() const;
virtual void createBindlessHeapsHelper() {}
bool createSubDevices();