mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-08 22:12:59 +08:00
Add option to allocate private mem per dispatch
Signed-off-by: Jaroslaw Chodor <jaroslaw.chodor@intel.com> Signed-off-by: Krystian Chmielewski <krystian.chmielewski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
cf4972d90e
commit
7c6c45f5b5
@@ -259,6 +259,8 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
NEO::StreamProperties requiredStreamState{};
|
||||
NEO::StreamProperties finalStreamState{};
|
||||
CommandsToPatch commandsToPatch{};
|
||||
|
||||
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
|
||||
};
|
||||
|
||||
// Function-pointer type for a command-list allocator: takes a single uint32_t and
// returns a CommandList pointer.
// NOTE(review): the meaning of the uint32_t argument is not visible in this hunk - confirm against callers.
using CommandListAllocatorFn = CommandList *(*)(uint32_t);
|
||||
|
||||
@@ -58,6 +58,10 @@ inline ze_result_t parseErrorCode(NEO::ErrorCode returnValue) {
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
CommandListCoreFamily<gfxCoreFamily>::~CommandListCoreFamily() {
|
||||
clearCommandsToPatch();
|
||||
for (auto alloc : this->ownedPrivateAllocations) {
|
||||
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
|
||||
}
|
||||
this->ownedPrivateAllocations.clear();
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
@@ -98,6 +102,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
|
||||
programThreadArbitrationPolicy(device);
|
||||
}
|
||||
|
||||
for (auto alloc : this->ownedPrivateAllocations) {
|
||||
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
|
||||
}
|
||||
this->ownedPrivateAllocations.clear();
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -41,6 +41,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
|
||||
bool isPredicate,
|
||||
bool isCooperative) {
|
||||
const auto kernel = Kernel::fromHandle(hKernel);
|
||||
const auto &kernelDescriptor = kernel->getKernelDescriptor();
|
||||
UNRECOVERABLE_IF(kernel == nullptr);
|
||||
appendEventForProfiling(hEvent, true);
|
||||
const auto functionImmutableData = kernel->getImmutableData();
|
||||
@@ -56,6 +57,14 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
|
||||
|
||||
kernel->patchGlobalOffset();
|
||||
|
||||
if (kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize != 0U &&
|
||||
nullptr == kernel->getPrivateMemoryGraphicsAllocation()) {
|
||||
auto privateMemoryGraphicsAllocation = kernel->allocatePrivateMemoryGraphicsAllocation();
|
||||
kernel->patchCrossthreadDataWithPrivateAllocation(privateMemoryGraphicsAllocation);
|
||||
this->commandContainer.addToResidencyContainer(privateMemoryGraphicsAllocation);
|
||||
this->ownedPrivateAllocations.push_back(privateMemoryGraphicsAllocation);
|
||||
}
|
||||
|
||||
if (!isIndirect) {
|
||||
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
|
||||
pThreadGroupDimensions->groupCountY,
|
||||
|
||||
@@ -137,6 +137,11 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
|
||||
virtual bool usesSyncBuffer() = 0;
|
||||
virtual void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0;
|
||||
|
||||
virtual NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() = 0;
|
||||
virtual void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) = 0;
|
||||
|
||||
virtual NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() = 0;
|
||||
|
||||
Kernel() = default;
|
||||
Kernel(const Kernel &) = delete;
|
||||
Kernel(Kernel &&) = delete;
|
||||
|
||||
@@ -686,6 +686,36 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties)
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
// Creates a PRIVATE_SURFACE graphics allocation for this kernel and returns it.
// The size comes from NEO::KernelHelper::getPrivateSurfaceSize, fed with the kernel's
// perHwThreadPrivateMemorySize and the device's computeUnitsUsedForScratch.
// Both a zero surface size and a failed allocation are treated as unrecoverable
// (UNRECOVERABLE_IF). The function does not store the result in the kernel.
NEO::GraphicsAllocation *KernelImp::allocatePrivateMemoryGraphicsAllocation() {
    auto &attributes = kernelImmData->getDescriptor().kernelAttributes;
    auto device = module->getDevice()->getNEODevice();

    auto surfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(attributes.perHwThreadPrivateMemorySize,
                                                                device->getDeviceInfo().computeUnitsUsedForScratch);
    UNRECOVERABLE_IF(surfaceSize == 0);

    auto memoryManager = device->getMemoryManager();
    auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(
        {device->getRootDeviceIndex(), surfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE, device->getDeviceBitfield()});
    UNRECOVERABLE_IF(allocation == nullptr);

    return allocation;
}
|
||||
|
||||
// Patches this kernel's cross-thread data and surface state heap so that the implicit
// private-memory argument refers to privateAllocation.
//
// privateAllocation must be non-null; it is dereferenced unconditionally. It is either
// the kernel-owned allocation (privateMemoryGraphicsAllocation, assigned in
// initialize()) or an allocation created for a single dispatch by the command list.
void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) {
    auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
    auto neoDevice = module->getDevice()->getNEODevice();

    ArrayRef<uint8_t> crossThreadDataArrayRef = ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize);
    ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);

    // Patch with the allocation handed in by the caller. The member
    // privateMemoryGraphicsAllocation is left unset when private memory is allocated
    // per dispatch, so it must not be dereferenced here.
    patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                             static_cast<uintptr_t>(privateAllocation->getGpuAddressToPatch()),
                             *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
                             *neoDevice, kernelAttributes.flags.useGlobalAtomics);

    // Only the kernel-owned allocation is tracked in the kernel's residency container.
    // Per-dispatch allocations are added to the command list's residency container and
    // freed by the command list, so recording them here would leave stale pointers.
    if (privateAllocation == this->privateMemoryGraphicsAllocation) {
        this->residencyContainer.push_back(privateAllocation);
    }
}
|
||||
|
||||
ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
|
||||
this->kernelImmData = module->getKernelImmutableData(desc->pKernelName);
|
||||
if (this->kernelImmData == nullptr) {
|
||||
@@ -776,25 +806,9 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
|
||||
|
||||
auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
|
||||
auto neoDevice = module->getDevice()->getNEODevice();
|
||||
if (kernelAttributes.perHwThreadPrivateMemorySize != 0) {
|
||||
auto privateSurfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(kernelAttributes.perHwThreadPrivateMemorySize,
|
||||
neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
|
||||
|
||||
UNRECOVERABLE_IF(privateSurfaceSize == 0);
|
||||
this->privateMemoryGraphicsAllocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(
|
||||
{neoDevice->getRootDeviceIndex(), privateSurfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE, neoDevice->getDeviceBitfield()});
|
||||
|
||||
UNRECOVERABLE_IF(this->privateMemoryGraphicsAllocation == nullptr);
|
||||
|
||||
ArrayRef<uint8_t> crossThredDataArrayRef = ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize);
|
||||
ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);
|
||||
|
||||
patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
|
||||
static_cast<uintptr_t>(privateMemoryGraphicsAllocation->getGpuAddressToPatch()),
|
||||
*privateMemoryGraphicsAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
|
||||
*neoDevice, kernelAttributes.flags.useGlobalAtomics);
|
||||
|
||||
this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
|
||||
if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
|
||||
this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
|
||||
this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation);
|
||||
}
|
||||
|
||||
this->createPrintfBuffer();
|
||||
|
||||
@@ -143,6 +143,13 @@ struct KernelImp : Kernel {
|
||||
return kernelHasIndirectAccess;
|
||||
}
|
||||
|
||||
NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() override;
|
||||
void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) override;
|
||||
|
||||
// Accessor for the private-memory allocation stored on this kernel. The member is only
// assigned in initialize() when the module does not request per-dispatch allocation,
// so callers must be prepared for nullptr.
NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() override {
    return this->privateMemoryGraphicsAllocation;
}
|
||||
|
||||
protected:
|
||||
KernelImp() = default;
|
||||
|
||||
|
||||
@@ -50,6 +50,8 @@ struct Module : _ze_module_handle_t {
|
||||
virtual const std::vector<std::unique_ptr<KernelImmutableData>> &getKernelImmutableDataVector() const = 0;
|
||||
virtual uint32_t getMaxGroupSize() const = 0;
|
||||
virtual bool isDebugEnabled() const = 0;
|
||||
virtual bool shouldAllocatePrivateMemoryPerDispatch() const = 0;
|
||||
virtual void checkIfPrivateMemoryPerDispatchIsNeeded() = 0;
|
||||
|
||||
Module() = default;
|
||||
Module(const Module &) = delete;
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "shared/source/device_binary_format/device_binary_formats.h"
|
||||
#include "shared/source/helpers/api_specific_config.h"
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/helpers/kernel_helpers.h"
|
||||
#include "shared/source/helpers/string.h"
|
||||
#include "shared/source/memory_manager/memory_manager.h"
|
||||
#include "shared/source/memory_manager/unified_memory_manager.h"
|
||||
@@ -383,6 +384,8 @@ bool ModuleImp::initialize(const ze_module_desc_t *desc, NEO::Device *neoDevice)
|
||||
}
|
||||
this->maxGroupSize = static_cast<uint32_t>(this->translationUnit->device->getNEODevice()->getDeviceInfo().maxWorkGroupSize);
|
||||
|
||||
checkIfPrivateMemoryPerDispatchIsNeeded();
|
||||
|
||||
if (debugEnabled) {
|
||||
if (device->getSourceLevelDebugger()) {
|
||||
for (auto kernelInfo : this->translationUnit->programInfo.kernelInfos) {
|
||||
@@ -642,6 +645,24 @@ void ModuleImp::verifyDebugCapabilities() {
|
||||
debugEnabled = debugCapabilities;
|
||||
}
|
||||
|
||||
void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
|
||||
size_t modulePrivateMemorySize = 0;
|
||||
for (auto &kernelImmData : this->kernelImmDatas) {
|
||||
if (0 == kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) {
|
||||
continue;
|
||||
}
|
||||
auto kernelPrivateMemorySize = NEO::KernelHelper::getPrivateSurfaceSize(kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize,
|
||||
this->device->getNEODevice()->getDeviceInfo().computeUnitsUsedForScratch);
|
||||
modulePrivateMemorySize += kernelPrivateMemorySize;
|
||||
}
|
||||
|
||||
this->allocatePrivateMemoryPerDispatch = false;
|
||||
if (modulePrivateMemorySize > 0U) {
|
||||
auto globalMemorySize = device->getNEODevice()->getRootDevice()->getGlobalMemorySize(static_cast<uint32_t>(device->getNEODevice()->getDeviceBitfield().to_ulong()));
|
||||
this->allocatePrivateMemoryPerDispatch = modulePrivateMemorySize > globalMemorySize;
|
||||
}
|
||||
}
|
||||
|
||||
ze_result_t ModuleImp::getProperties(ze_module_properties_t *pModuleProperties) {
|
||||
|
||||
pModuleProperties->flags = 0;
|
||||
|
||||
@@ -114,6 +114,10 @@ struct ModuleImp : public Module {
|
||||
|
||||
bool isDebugEnabled() const override;
|
||||
|
||||
bool shouldAllocatePrivateMemoryPerDispatch() const override {
|
||||
return allocatePrivateMemoryPerDispatch;
|
||||
}
|
||||
|
||||
// Returns a raw pointer to the translation unit held by this module.
ModuleTranslationUnit *getTranslationUnit() {
    auto *unit = this->translationUnit.get();
    return unit;
}
|
||||
@@ -121,6 +125,7 @@ struct ModuleImp : public Module {
|
||||
protected:
|
||||
void copyPatchedSegments(const NEO::Linker::PatchableSegments &isaSegmentsForPatching);
|
||||
void verifyDebugCapabilities();
|
||||
void checkIfPrivateMemoryPerDispatchIsNeeded() override;
|
||||
|
||||
Device *device = nullptr;
|
||||
PRODUCT_FAMILY productFamily{};
|
||||
@@ -132,6 +137,7 @@ struct ModuleImp : public Module {
|
||||
NEO::Linker::RelocatedSymbolsMap symbols;
|
||||
bool debugEnabled = false;
|
||||
bool isFullyLinked = false;
|
||||
bool allocatePrivateMemoryPerDispatch = true;
|
||||
ModuleType type;
|
||||
NEO::Linker::UnresolvedExternals unresolvedExternalsInfo{};
|
||||
std::set<NEO::GraphicsAllocation *> importedSymbolAllocations{};
|
||||
|
||||
@@ -102,6 +102,12 @@ struct ModuleImmutableDataFixture : public DeviceFixture {
|
||||
// Test override: ignores functionName and always hands back the injected mock data.
const KernelImmutableData *getKernelImmutableData(const char *functionName) const override {
    return this->mockKernelImmData;
}
|
||||
|
||||
// Test override: copies the mock's perHwThreadPrivateMemorySize into the descriptor of
// the first entry of kernelImmDatas, then runs the ModuleImp implementation so it
// evaluates that value. Requires mockKernelImmData to be set and kernelImmDatas to be
// non-empty.
void checkIfPrivateMemoryPerDispatchIsNeeded() override {
    const auto &mockAttributes = mockKernelImmData->getDescriptor().kernelAttributes;
    // getDescriptor() is const, hence the const_cast to overwrite the size.
    auto &firstDescriptor = const_cast<KernelDescriptor &>(kernelImmDatas[0]->getDescriptor());
    firstDescriptor.kernelAttributes.perHwThreadPrivateMemorySize = mockAttributes.perHwThreadPrivateMemorySize;

    ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded();
}
|
||||
|
||||
MockImmutableData *mockKernelImmData = nullptr;
|
||||
};
|
||||
|
||||
|
||||
@@ -571,6 +571,41 @@ HWTEST_F(KernelImmutableDataTests, givenKernelInitializedWithPrivateMemoryThenCo
|
||||
EXPECT_EQ(sizeContainerWithoutPrivateMemory + 1u, sizeContainerWithPrivateMemory);
|
||||
}
|
||||
|
||||
// Verifies that a module whose kernel requests the maximum uint32_t per-HW-thread
// private memory size reports per-dispatch allocation, and that a kernel created from
// it holds no private-memory allocation of its own.
HWTEST_F(KernelImmutableDataTests, givenKernelWithPrivateMemoryBiggerThanGlobalMemoryThenPrivateMemoryIsNotAllocated) {
    // Load the kernel binary that serves as the native module input.
    std::string binaryPath;
    retrieveBinaryKernelFilenameNoRevision(binaryPath, binaryFilename + "_", ".bin");

    size_t binarySize = 0;
    auto binary = loadDataFromFile(binaryPath.c_str(), binarySize);
    ASSERT_NE(0u, binarySize);
    ASSERT_NE(nullptr, binary);

    ze_module_desc_t moduleDesc = {};
    moduleDesc.format = ZE_MODULE_FORMAT_NATIVE;
    moduleDesc.pInputModule = reinterpret_cast<const uint8_t *>(binary.get());
    moduleDesc.inputSize = binarySize;
    ModuleBuildLog *moduleBuildLog = nullptr;

    // Build the mock module around immutable data requesting the largest possible size.
    uint32_t perHwThreadPrivateMemorySizeRequested = std::numeric_limits<uint32_t>::max();
    auto mockKernelImmData = std::make_unique<MockImmutableData>(perHwThreadPrivateMemorySizeRequested);
    auto module = std::make_unique<MockModule>(device,
                                               moduleBuildLog,
                                               ModuleType::User,
                                               perHwThreadPrivateMemorySizeRequested,
                                               mockKernelImmData.get());

    bool result = module->initialize(&moduleDesc, device->getNEODevice());
    EXPECT_TRUE(result);
    EXPECT_TRUE(module->shouldAllocatePrivateMemoryPerDispatch());

    // A kernel created from such a module must not allocate private memory up front.
    auto kernel = std::make_unique<ModuleImmutableDataFixture::MockKernel>(module.get());
    createKernel(kernel.get());
    EXPECT_EQ(nullptr, kernel->getPrivateMemoryGraphicsAllocation());
}
|
||||
|
||||
class KernelDescriptorRTCallsTrue : public NEO::KernelDescriptor {
|
||||
bool hasRTCalls() const override {
|
||||
return true;
|
||||
|
||||
@@ -121,6 +121,8 @@ class Device : public ReferenceTrackedObject<Device> {
|
||||
void initializeRayTracing();
|
||||
void reduceMaxMemAllocSize();
|
||||
|
||||
virtual uint64_t getGlobalMemorySize(uint32_t deviceBitfield) const;
|
||||
|
||||
protected:
|
||||
Device() = delete;
|
||||
Device(ExecutionEnvironment *executionEnvironment);
|
||||
@@ -145,7 +147,6 @@ class Device : public ReferenceTrackedObject<Device> {
|
||||
MOCKABLE_VIRTUAL std::unique_ptr<CommandStreamReceiver> createCommandStreamReceiver() const;
|
||||
MOCKABLE_VIRTUAL SubDevice *createSubDevice(uint32_t subDeviceIndex);
|
||||
MOCKABLE_VIRTUAL SubDevice *createEngineInstancedSubDevice(uint32_t subDeviceIndex, aub_stream::EngineType engineType);
|
||||
virtual uint64_t getGlobalMemorySize(uint32_t deviceBitfield) const;
|
||||
double getPercentOfGlobalMemoryAvailable() const;
|
||||
virtual void createBindlessHeapsHelper() {}
|
||||
bool createSubDevices();
|
||||
|
||||
Reference in New Issue
Block a user