performance: share inter-module ISA allocations

Related-To: NEO-10258

Currently each module has its own GA
for kernel ISAs. This change allows new modules to
reuse an existing allocation.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-04-29 17:12:50 +00:00
committed by Compute-Runtime-Automation
parent 349a86a8dc
commit 10ed479b16
14 changed files with 414 additions and 35 deletions

View File

@@ -502,7 +502,7 @@ void ModuleTranslationUnit::processDebugData() {
ModuleImp::ModuleImp(Device *device, ModuleBuildLog *moduleBuildLog, ModuleType type)
: device(device), translationUnit(std::make_unique<ModuleTranslationUnit>(device)),
moduleBuildLog(moduleBuildLog), kernelsIsaParentRegion(nullptr), type(type) {
moduleBuildLog(moduleBuildLog), type(type) {
auto &gfxCoreHelper = device->getGfxCoreHelper();
auto &hwInfo = device->getHwInfo();
this->isaAllocationPageSize = gfxCoreHelper.useSystemMemoryPlacementForISA(hwInfo) ? MemoryConstants::pageSize : MemoryConstants::pageSize64k;
@@ -516,9 +516,9 @@ ModuleImp::~ModuleImp() {
}
}
this->kernelImmDatas.clear();
if (this->kernelsIsaParentRegion) {
DEBUG_BREAK_IF(this->device->getNEODevice()->getMemoryManager() == nullptr);
this->device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(this->kernelsIsaParentRegion.release());
if (this->sharedIsaAllocation) {
auto neoDevice = this->device->getNEODevice();
neoDevice->getIsaPoolAllocator().freeSharedIsaAllocation(this->sharedIsaAllocation.release());
}
}
@@ -566,7 +566,7 @@ ze_result_t ModuleImp::initialize(const ze_module_desc_t *desc, NEO::Device *neo
linkageSuccessful &= populateHostGlobalSymbolsMap(this->translationUnit->programInfo.globalsDeviceToHostNameMap);
this->updateBuildLog(neoDevice);
if ((this->isFullyLinked && this->type == ModuleType::user) || (this->kernelsIsaParentRegion && this->type == ModuleType::builtin)) {
if ((this->isFullyLinked && this->type == ModuleType::user) || (this->sharedIsaAllocation && this->type == ModuleType::builtin)) {
this->transferIsaSegmentsToAllocation(neoDevice, nullptr);
if (device->getL0Debugger()) {
@@ -585,29 +585,31 @@ void ModuleImp::transferIsaSegmentsToAllocation(NEO::Device *neoDevice, const NE
const auto &productHelper = neoDevice->getProductHelper();
auto &rootDeviceEnvironment = neoDevice->getRootDeviceEnvironment();
if (this->kernelsIsaParentRegion && this->kernelImmDatas.size()) {
if (this->sharedIsaAllocation && this->kernelImmDatas.size()) {
if (this->kernelImmDatas[0]->isIsaCopiedToAllocation()) {
return;
}
const auto isaBufferSize = this->kernelsIsaParentRegion->getUnderlyingBufferSize();
const auto isaBufferSize = this->sharedIsaAllocation->getSize();
DEBUG_BREAK_IF(isaBufferSize == 0);
auto isaBuffer = std::vector<std::byte>(isaBufferSize);
std::memset(isaBuffer.data(), 0x0, isaBufferSize);
auto moduleOffset = sharedIsaAllocation->getOffset();
for (auto &kernelImmData : this->kernelImmDatas) {
DEBUG_BREAK_IF(kernelImmData->isIsaCopiedToAllocation());
kernelImmData->getIsaGraphicsAllocation()->setAubWritable(true, std::numeric_limits<uint32_t>::max());
kernelImmData->getIsaGraphicsAllocation()->setTbxWritable(true, std::numeric_limits<uint32_t>::max());
auto [kernelHeapPtr, kernelHeapSize] = this->getKernelHeapPointerAndSize(kernelImmData, isaSegmentsForPatching);
auto offset = kernelImmData->getIsaOffsetInParentAllocation();
memcpy_s(isaBuffer.data() + offset, isaBufferSize - offset, kernelHeapPtr, kernelHeapSize);
auto isaOffset = kernelImmData->getIsaOffsetInParentAllocation() - moduleOffset;
memcpy_s(isaBuffer.data() + isaOffset, isaBufferSize - isaOffset, kernelHeapPtr, kernelHeapSize);
}
NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *this->kernelsIsaParentRegion),
auto moduleAllocation = this->sharedIsaAllocation->getGraphicsAllocation();
auto lock = this->sharedIsaAllocation->obtainSharedAllocationLock();
NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *moduleAllocation),
*neoDevice,
this->kernelsIsaParentRegion.get(),
0u,
moduleAllocation,
moduleOffset,
isaBuffer.data(),
isaBuffer.size());
for (auto &kernelImmData : kernelImmDatas) {
@@ -804,16 +806,17 @@ ze_result_t ModuleImp::setIsaGraphicsAllocations() {
bool debuggerDisabled = (this->device->getL0Debugger() == nullptr);
if (debuggerDisabled && kernelsIsaTotalSize <= isaAllocationPageSize) {
if (auto allocation = this->allocateKernelsIsaMemory(kernelsIsaTotalSize); allocation == nullptr) {
auto neoDevice = this->device->getNEODevice();
auto &isaAllocator = neoDevice->getIsaPoolAllocator();
auto crossModuleAllocation = isaAllocator.requestGraphicsAllocationForIsa(this->type == ModuleType::builtin, kernelsIsaTotalSize);
if (crossModuleAllocation == nullptr) {
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
} else {
this->kernelsIsaParentRegion.reset(allocation);
}
this->sharedIsaAllocation.reset(crossModuleAllocation);
for (auto i = 0lu; i < kernelsCount; i++) {
auto [isaOffset, isaSize] = kernelsChunks[i];
this->kernelImmDatas[i]->setIsaParentAllocation(this->kernelsIsaParentRegion.get());
this->kernelImmDatas[i]->setIsaSubAllocationOffset(isaOffset);
this->kernelImmDatas[i]->setIsaParentAllocation(this->sharedIsaAllocation->getGraphicsAllocation());
this->kernelImmDatas[i]->setIsaSubAllocationOffset(this->sharedIsaAllocation->getOffset() + isaOffset);
this->kernelImmDatas[i]->setIsaSubAllocationSize(isaSize);
}
} else {
@@ -1707,4 +1710,11 @@ bool moveBuildOption(std::string &dstOptionsSet, std::string &srcOptionSet, NEO:
return true;
}
// Returns the graphics allocation held by this module's shared ISA allocation,
// or nullptr when the module has no shared ISA allocation.
NEO::GraphicsAllocation *ModuleImp::getKernelsIsaParentAllocation() const {
    return sharedIsaAllocation ? sharedIsaAllocation->getGraphicsAllocation() : nullptr;
}
} // namespace L0

View File

@@ -23,6 +23,7 @@
namespace NEO {
struct KernelDescriptor;
class SharedIsaAllocation;
namespace Zebin::Debug {
struct Segments;
@@ -125,7 +126,7 @@ struct ModuleImp : public Module {
const KernelImmutableData *getKernelImmutableData(const char *kernelName) const override;
const std::vector<std::unique_ptr<KernelImmutableData>> &getKernelImmutableDataVector() const override { return kernelImmDatas; }
NEO::GraphicsAllocation *getKernelsIsaParentAllocation() const { return kernelsIsaParentRegion.get(); }
NEO::GraphicsAllocation *getKernelsIsaParentAllocation() const;
uint32_t getMaxGroupSize(const NEO::KernelDescriptor &kernelDescriptor) const override;
@@ -189,7 +190,7 @@ struct ModuleImp : public Module {
std::unique_ptr<ModuleTranslationUnit> translationUnit;
ModuleBuildLog *moduleBuildLog = nullptr;
NEO::GraphicsAllocation *exportedFunctionsSurface = nullptr;
std::unique_ptr<NEO::GraphicsAllocation> kernelsIsaParentRegion;
std::unique_ptr<NEO::SharedIsaAllocation> sharedIsaAllocation;
std::vector<std::shared_ptr<Kernel>> printfKernelContainer;
std::vector<std::unique_ptr<KernelImmutableData>> kernelImmDatas;
NEO::Linker::RelocatedSymbolsMap symbols;

View File

@@ -192,9 +192,14 @@ struct ModuleKernelIsaAllocationsFixture : public ModuleFixture {
}
// Checks that module initialization reports ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
// when the memory manager is told to fail device-pool allocations.
void givenIsaMemoryRegionSharedBetweenKernelsWhenGraphicsAllocationFailsThenProperErrorReturned() {
mockModule->allocateKernelsIsaMemoryCallBase = false;
// Fill current pool so next request will try to allocate
// (the filler request is 2 * MemoryConstants::pageSize2M bytes, non-builtin)
auto alloc = device->getNEODevice()->getIsaPoolAllocator().requestGraphicsAllocationForIsa(false, MemoryConstants::pageSize2M * 2);
// NOTE(review): the cast assumes the device's memory manager is a
// MockMemoryManager in this fixture - confirm; static_cast would be the safer form.
auto memoryManager = reinterpret_cast<MockMemoryManager *>(device->getNEODevice()->getMemoryManager());
memoryManager->failInDevicePoolWithError = true;
auto result = module->initialize(&this->moduleDesc, device->getNEODevice());
EXPECT_EQ(result, ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY);
// Hand the filler allocation back to the ISA pool allocator.
device->getNEODevice()->getIsaPoolAllocator().freeSharedIsaAllocation(alloc);
}
void givenSeparateIsaMemoryRegionPerKernelWhenGraphicsAllocationFailsThenProperErrorReturned() {
@@ -4713,15 +4718,17 @@ TEST_F(ModuleKernelImmDatasTest, givenDeviceOOMWhenMemoryManagerFailsToAllocateM
moduleDesc.inputSize = src.size();
ModuleBuildLog *moduleBuildLog = nullptr;
module.reset(nullptr);
auto module = std::make_unique<Module>(device, moduleBuildLog, ModuleType::user);
ASSERT_NE(nullptr, module.get());
// Fill current pool so next request will try to allocate
auto alloc = device->getNEODevice()->getIsaPoolAllocator().requestGraphicsAllocationForIsa(false, MemoryConstants::pageSize2M * 2);
auto mockMemoryManager = static_cast<NEO::MockMemoryManager *>(neoDevice->getMemoryManager());
mockMemoryManager->isMockHostMemoryManager = true;
mockMemoryManager->forceFailureInPrimaryAllocation = true;
auto module = std::make_unique<Module>(device, moduleBuildLog, ModuleType::user);
ASSERT_NE(nullptr, module.get());
auto result = module->initialize(&moduleDesc, neoDevice);
EXPECT_EQ(result, ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY);
device->getNEODevice()->getIsaPoolAllocator().freeSharedIsaAllocation(alloc);
};
using MultiTileModuleTest = Test<MultiTileModuleFixture>;

View File

@@ -451,5 +451,36 @@ TEST_F(ModuleTests, givenFP64EmulationEnabledWhenCreatingModuleThenEnableFP64Gen
EXPECT_TRUE(CompilerOptions::contains(cip->buildInternalOptions, BuildOptions::enableFP64GenEmu));
};
// Creates several user modules from the same descriptor and verifies that:
//  - within each module, kernel ISA offsets in the parent allocation are
//    contiguous (each kernel starts where the previous one ended), and
//  - every module reports the same parent ISA allocation as the first one.
TEST_F(ModuleTests, whenMultipleModulesCreatedThenModulesShareIsaAllocation) {
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableLocalMemory.set(1);

    // Zero-initialized so the module input never carries indeterminate bytes.
    uint8_t binary[10] = {};
    ze_module_desc_t moduleDesc = {};
    moduleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
    moduleDesc.pInputModule = binary;
    moduleDesc.inputSize = sizeof(binary);
    ModuleBuildLog *moduleBuildLog = nullptr;

    // Parent allocation reported by the first module; compared against the rest.
    NEO::GraphicsAllocation *allocation = nullptr;
    std::vector<std::unique_ptr<L0::ModuleImp>> modules;
    constexpr size_t numModules = 10;
    modules.reserve(numModules);

    for (auto i = 0u; i < numModules; i++) {
        modules.push_back(std::make_unique<L0::ModuleImp>(device, moduleBuildLog, ModuleType::user));
        modules[i]->initialize(&moduleDesc, device->getNEODevice());
        if (i == 0) {
            allocation = modules[i]->getKernelsIsaParentAllocation();
        }

        auto &vec = modules[i]->getKernelImmutableDataVector();
        // Guard the vec[0] access below: an empty kernel list would be out of bounds.
        ASSERT_FALSE(vec.empty());
        auto offsetForImmData = vec[0]->getIsaOffsetInParentAllocation();
        for (auto &immData : vec) {
            EXPECT_EQ(offsetForImmData, immData->getIsaOffsetInParentAllocation());
            offsetForImmData += immData->getIsaSubAllocationSize();
        }

        // Verify that all imm datas share same parent allocation
        if (i != 0) {
            EXPECT_EQ(allocation, modules[i]->getKernelsIsaParentAllocation());
        }
    }
};
} // namespace ult
} // namespace L0