mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
feature: Optimize intra-module kernel ISA allocations
So far, a separate page has been allocated for each kernel's ISA within `KernelImmutableData::initialize()`. The ISA blocks are often much smaller than a 64k page, which leads to poor memory utilization and was even observed to cause a device OOM error if a single module has several keys. Improve the situation by reusing the parent allocation (owned by the module instance) for modules whose kernel ISAs can fit together within a single 64k page. This improves memory utilization at the single-module level. Related-To: NEO-7788 Signed-off-by: Maciej Bielski <maciej.bielski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
1b7e178b25
commit
c348831470
@@ -77,7 +77,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
{
|
||||
auto alloc = args.dispatchInterface->getIsaAllocation();
|
||||
UNRECOVERABLE_IF(nullptr == alloc);
|
||||
auto offset = alloc->getGpuAddressToPatch();
|
||||
auto offset = alloc->getGpuAddressToPatch() + args.dispatchInterface->getIsaOffsetInParentAllocation();
|
||||
idd.setKernelStartPointer(offset);
|
||||
idd.setKernelStartPointerHigh(0u);
|
||||
}
|
||||
|
||||
@@ -87,7 +87,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
{
|
||||
auto alloc = args.dispatchInterface->getIsaAllocation();
|
||||
UNRECOVERABLE_IF(nullptr == alloc);
|
||||
auto offset = alloc->getGpuAddressToPatch();
|
||||
auto offset = alloc->getGpuAddressToPatch() + args.dispatchInterface->getIsaOffsetInParentAllocation();
|
||||
if (!localIdsGenerationByRuntime) {
|
||||
offset += kernelDescriptor.entryPoints.skipPerThreadDataLoad;
|
||||
}
|
||||
|
||||
@@ -53,6 +53,7 @@ class GfxCoreHelper {
|
||||
static std::unique_ptr<GfxCoreHelper> create(const GFXCORE_FAMILY gfxCoreFamily);
|
||||
virtual size_t getMaxBarrierRegisterPerSlice() const = 0;
|
||||
virtual size_t getPaddingForISAAllocation() const = 0;
|
||||
// Alignment of the kernel ISA start pointer; the GfxCoreHelperHw override returns
// the family's INTERFACE_DESCRIPTOR_DATA KERNELSTARTPOINTER_ALIGN_SIZE as size_t.
virtual size_t getKernelIsaPointerAlignment() const = 0;
|
||||
virtual uint32_t getComputeUnitsUsedForScratch(const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t getPitchAlignmentForImage(const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual void adjustDefaultEngineType(HardwareInfo *pHwInfo, const ProductHelper &productHelper) = 0;
|
||||
@@ -215,6 +216,10 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||
|
||||
size_t getPaddingForISAAllocation() const override;
|
||||
|
||||
size_t getKernelIsaPointerAlignment() const override {
|
||||
return static_cast<size_t>(GfxFamily::cmdInitInterfaceDescriptorData.KERNELSTARTPOINTER_ALIGN_SIZE);
|
||||
}
|
||||
|
||||
uint32_t getComputeUnitsUsedForScratch(const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
|
||||
uint32_t getPitchAlignmentForImage(const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
|
||||
@@ -40,6 +40,7 @@ struct DispatchKernelEncoderI {
|
||||
virtual uint32_t getSurfaceStateHeapDataSize() const = 0;
|
||||
|
||||
virtual GraphicsAllocation *getIsaAllocation() const = 0;
|
||||
// Offset of this kernel's ISA inside the allocation returned by getIsaAllocation();
// EncodeDispatchKernel::encode adds it to that allocation's GPU address to patch
// when computing the kernel start pointer.
virtual uint64_t getIsaOffsetInParentAllocation() const = 0;
|
||||
virtual const uint8_t *getDynamicStateHeapData() const = 0;
|
||||
|
||||
virtual uint32_t getRequiredWorkgroupOrder() const = 0;
|
||||
|
||||
@@ -997,6 +997,45 @@ HWTEST2_F(EncodeDispatchKernelTest, givenBindlessKernelWhenDispatchingKernelThen
|
||||
EXPECT_NE(usedAfter, usedBefore);
|
||||
}
|
||||
|
||||
// Encodes a dispatch with a non-zero ISA offset in the parent allocation and checks that
// the interface descriptor's kernel start pointer equals the allocation's GPU address to
// patch plus that offset.
HWTEST2_F(EncodeDispatchKernelTest, givenKernelsSharingISAParentAllocationsWhenProgrammingWalkerThenKernelStartPointerHasProperOffset, IsBeforeXeHpCore) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    // Arbitrary non-zero offset, shifted by KERNELSTARTPOINTER_BIT_SHIFT.
    auto mockEncoder = std::make_unique<MockDispatchKernelEncoder>();
    mockEncoder->getIsaOffsetInParentAllocationResult = 8 << INTERFACE_DESCRIPTOR_DATA::KERNELSTARTPOINTER_BIT_SHIFT;

    uint32_t groupCounts[] = {2, 1, 1};
    bool uncachedMocsRequired = false;
    EncodeDispatchKernelArgs encodeArgs = createDefaultDispatchKernelArgs(pDevice, mockEncoder.get(), groupCounts, uncachedMocsRequired);
    EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), encodeArgs);

    auto interfaceDescriptor = static_cast<INTERFACE_DESCRIPTOR_DATA *>(cmdContainer->getIddBlock());
    const auto expectedKernelStart = mockEncoder->getIsaAllocation()->getGpuAddressToPatch() + mockEncoder->getIsaOffsetInParentAllocation();
    EXPECT_EQ(interfaceDescriptor->getKernelStartPointer(), expectedKernelStart);
}
|
||||
|
||||
// Checks that the GfxCoreHelper getter reports the same alignment as the
// KERNELSTARTPOINTER_ALIGN_SIZE constant of the family's INTERFACE_DESCRIPTOR_DATA.
HWTEST_F(EncodeDispatchKernelTest, givenKernelStartPointerAlignmentInInterfaceDescriptorWhenHelperGetterUsedThenCorrectValueReturned) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    const auto &coreHelper = pDevice->getGfxCoreHelper();
    EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::KERNELSTARTPOINTER_ALIGN_SIZE, coreHelper.getKernelIsaPointerAlignment());
}
|
||||
|
||||
// Encodes a dispatch with a non-zero ISA offset in the parent allocation, locates the
// walker command in the command stream and checks that its interface descriptor's kernel
// start pointer equals the allocation's GPU address to patch plus that offset.
HWTEST2_F(EncodeDispatchKernelTest, givenKernelsSharingISAParentAllocationsWhenProgrammingWalkerThenKernelStartPointerHasProperOffset, IsAtLeastXeHpCore) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    // Arbitrary non-zero offset, shifted by KERNELSTARTPOINTER_BIT_SHIFT.
    auto mockEncoder = std::make_unique<MockDispatchKernelEncoder>();
    mockEncoder->getIsaOffsetInParentAllocationResult = 8 << INTERFACE_DESCRIPTOR_DATA::KERNELSTARTPOINTER_BIT_SHIFT;

    uint32_t groupCounts[] = {2, 1, 1};
    bool uncachedMocsRequired = false;
    EncodeDispatchKernelArgs encodeArgs = createDefaultDispatchKernelArgs(pDevice, mockEncoder.get(), groupCounts, uncachedMocsRequired);
    EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), encodeArgs);

    // Parse everything emitted so far and pick the first walker command.
    GenCmdList parsedCommands;
    auto *commandStream = cmdContainer->getCommandStream();
    CmdParse<FamilyType>::parseCommandBuffer(parsedCommands, ptrOffset(commandStream->getCpuBase(), 0), commandStream->getUsed());
    auto walkerIt = find<WALKER_TYPE *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_NE(walkerIt, parsedCommands.end());

    auto walker = genCmdCast<WALKER_TYPE *>(*walkerIt);
    const auto expectedKernelStart = mockEncoder->getIsaAllocation()->getGpuAddressToPatch() + mockEncoder->getIsaOffsetInParentAllocation();
    EXPECT_EQ(walker->getInterfaceDescriptor().getKernelStartPointer(), expectedKernelStart);
}
|
||||
|
||||
HWTEST_F(EncodeDispatchKernelTest, givenNonBindlessOrStatelessArgWhenDispatchingKernelThenSurfaceStateOffsetInCrossThreadDataIsNotPatched) {
|
||||
using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;
|
||||
using DataPortBindlessSurfaceExtendedMessageDescriptor = typename FamilyType::DataPortBindlessSurfaceExtendedMessageDescriptor;
|
||||
|
||||
@@ -55,5 +55,6 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI {
|
||||
ADDMETHOD_CONST_NOBASE(getDynamicStateHeapData, const uint8_t *, nullptr, ());
|
||||
ADDMETHOD_CONST_NOBASE(requiresGenerationOfLocalIdsByRuntime, bool, true, ());
|
||||
ADDMETHOD_CONST_NOBASE(getSlmPolicy, SlmPolicy, SlmPolicy::SlmPolicyNone, ());
|
||||
// Mocked ISA offset getter; tests override the returned value through
// getIsaOffsetInParentAllocationResult (presumably 0 by default — confirm against the macro).
ADDMETHOD_CONST_NOBASE(getIsaOffsetInParentAllocation, uint64_t, 0lu, ());
|
||||
};
|
||||
} // namespace NEO
|
||||
|
||||
Reference in New Issue
Block a user