performance: Add CCS Optimization

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk 2024-10-23 10:26:49 +00:00 committed by Compute-Runtime-Automation
parent 171f1e27a3
commit e687e11ab1
16 changed files with 71 additions and 2 deletions

View File

@ -364,6 +364,7 @@ DECLARE_DEBUG_VARIABLE(bool, DisableConcurrentBlockExecution, false, "disables c
DECLARE_DEBUG_VARIABLE(bool, UseNoRingFlushesKmdMode, true, "Windows only, passes flag to KMD that informs KMD to not emit any ring buffer flushes.")
DECLARE_DEBUG_VARIABLE(bool, DisableZeroCopyForUseHostPtr, false, "When active all buffer allocations created with CL_MEM_USE_HOST_PTR flag will not share memory with CPU.")
DECLARE_DEBUG_VARIABLE(bool, ForceNonCoherentModeForTimestamps, false, "When active timestamp buffers are allocated in non coherent memory.")
DECLARE_DEBUG_VARIABLE(bool, SetAssumeNotInUse, true, "Set AssumeNotInUse flag in d3d destroy allocation.")
DECLARE_DEBUG_VARIABLE(int32_t, EnableReusingGpuTimestamps, -1, "Reuse GPU timestamp for next device time requests. -1: os-specific, 0: disable, 1: enable")
DECLARE_DEBUG_VARIABLE(int32_t, AllowZeroCopyWithoutCoherency, -1, "Use cacheline flush instead of memory copy for map/unmap mem object")
DECLARE_DEBUG_VARIABLE(int32_t, EnableHostPtrTracking, -1, "Enable host ptr tracking: -1 - default platform setting, 0 - disabled, 1 - enabled")
@ -595,6 +596,7 @@ DECLARE_DEBUG_VARIABLE(bool, PrintBOChunkingLogs, false, "Print some logs on BO
DECLARE_DEBUG_VARIABLE(bool, EnableBOChunkingPrefetch, false, "Enables prefetching of Shared Memory chunks")
DECLARE_DEBUG_VARIABLE(bool, EnableBOChunkingDevMemPrefetch, false, "Enables prefetching of Device Memory chunks")
DECLARE_DEBUG_VARIABLE(bool, EnableBOChunkingPreferredLocationHint, false, "Enables preferred location advise on chunks")
DECLARE_DEBUG_VARIABLE(bool, DestroyAllocationsViaGmm, false, "Use DeAllocate2 wrapper instead of raw GDI destroy allocations")
DECLARE_DEBUG_VARIABLE(int32_t, EnableBOChunking, -1, "Enables use of chunking of BOs in the KMD, mask: -1 = default, 0 = no chunking, 1 = shared allocations only, 2 = multi-tile device allocations only, 3 = shared and multi-tile device allocations .")
DECLARE_DEBUG_VARIABLE(int32_t, NumberOfBOChunks, 2, "Number of chunks to use")
DECLARE_DEBUG_VARIABLE(int32_t, SetBOChunkingSize, -1, "Size of chunk in bytes: -1 = default, otherwise power of two chunk size in bytes")

View File

@ -16,6 +16,7 @@ struct RootDeviceEnvironment;
class GmmHandleAllocator;
class MapGpuVirtualAddressGmm;
class FreeGpuVirtualAddressGmm;
class DeallocateGmm;
class GmmClientContext {
public:
@ -28,6 +29,7 @@ class GmmClientContext {
MOCKABLE_VIRTUAL GMM_RESOURCE_INFO *createResInfoObject(GMM_RESCREATE_PARAMS *pCreateParams);
MOCKABLE_VIRTUAL GMM_RESOURCE_INFO *copyResInfoObject(GMM_RESOURCE_INFO *pSrcRes);
MOCKABLE_VIRTUAL void destroyResInfoObject(GMM_RESOURCE_INFO *pResInfo);
MOCKABLE_VIRTUAL long deallocate2(DeallocateGmm *deallocateGmm);
MOCKABLE_VIRTUAL uint64_t mapGpuVirtualAddress(MapGpuVirtualAddressGmm *pMapGpuVa);
MOCKABLE_VIRTUAL uint64_t freeGpuVirtualAddress(FreeGpuVirtualAddressGmm *pFreeGpuVa);
GMM_CLIENT_CONTEXT *getHandle() const;

View File

@ -14,5 +14,8 @@ uint64_t GmmClientContext::mapGpuVirtualAddress(MapGpuVirtualAddressGmm *pMapGpu
// Stub variant: the request in pFreeGpuVa is not inspected and the call
// unconditionally yields 0.
uint64_t GmmClientContext::freeGpuVirtualAddress(FreeGpuVirtualAddressGmm *pFreeGpuVa) {
    constexpr uint64_t stubResult = 0u;
    return stubResult;
}
// Stub variant: no deallocation is performed; deallocateGmm is ignored and
// 0 is always returned.
// NOTE(review): 0 presumably corresponds to a success status for callers - confirm.
long GmmClientContext::deallocate2(DeallocateGmm *deallocateGmm) {
    constexpr long stubStatus = 0;
    return stubStatus;
}
} // namespace NEO

View File

@ -16,5 +16,8 @@ uint64_t GmmClientContext::mapGpuVirtualAddress(MapGpuVirtualAddressGmm *pMapGpu
// Stub variant: pFreeGpuVa is not used and the result is always 0.
uint64_t GmmClientContext::freeGpuVirtualAddress(FreeGpuVirtualAddressGmm *pFreeGpuVa) {
    constexpr uint64_t stubResult = 0u;
    return stubResult;
}
// Forwards the destroy request straight to the GDI entry point carried by
// deallocateGmm and returns whatever status that call produces.
// deallocateGmm and its gdi member are dereferenced without a null check.
long GmmClientContext::deallocate2(DeallocateGmm *deallocateGmm) {
    auto *gdiInterface = deallocateGmm->gdi;
    auto *destroyParams = deallocateGmm->destroyAllocation2;
    return gdiInterface->destroyAllocation2(destroyParams);
}
} // namespace NEO

View File

@ -31,5 +31,11 @@ uint64_t GmmClientContext::freeGpuVirtualAddress(FreeGpuVirtualAddressGmm *pFree
return 0;
}
}
// Destroys allocations through the GMM client context instead of calling GDI
// directly.
//
// The caller's D3DKMT_DESTROYALLOCATION2 is copied into the KmtObj member of a
// zero-initialized GMM_DESTROYALLOCATION2, which is then handed to
// clientContext->DeAllocate2(). The value returned by DeAllocate2 is passed
// back unchanged.
//
// deallocateGmm and deallocateGmm->destroyAllocation2 are dereferenced without
// a null check.
long GmmClientContext::deallocate2(DeallocateGmm *deallocateGmm) {
    GMM_DESTROYALLOCATION2 gmmDestroyAllocation2{};
    // The destination capacity is taken from the destination object itself so
    // that the bounds check stays meaningful if KmtObj and
    // D3DKMT_DESTROYALLOCATION2 ever differ in size.
    memcpy_s(&gmmDestroyAllocation2.KmtObj, sizeof(gmmDestroyAllocation2.KmtObj), deallocateGmm->destroyAllocation2, sizeof(D3DKMT_DESTROYALLOCATION2));
    return clientContext->DeAllocate2(&gmmDestroyAllocation2);
}
} // namespace NEO

View File

@ -31,4 +31,11 @@ class FreeGpuVirtualAddressGmm {
Gdi *gdi;
};
class DeallocateGmm {
public:
DeallocateGmm(D3DKMT_DESTROYALLOCATION2 *destroyAllocation2, Gdi *gdi) : destroyAllocation2(destroyAllocation2), gdi(gdi) {}
D3DKMT_DESTROYALLOCATION2 *destroyAllocation2;
Gdi *gdi;
};
} // namespace NEO

View File

@ -69,6 +69,7 @@ Gmm::Gmm(GmmHelper *gmmHelper, const void *alignedPtr, size_t alignedSize, size_
applyAuxFlagsForBuffer(gmmRequirements.preferCompressed && !storageInfo.isLockable);
applyMemoryFlags(storageInfo);
applyAppResource(storageInfo);
applyExtraInitFlag();
applyDebugOverrides();
gmmResourceInfo.reset(GmmResourceInfo::create(gmmHelper->getClientContext(), &resourceParams));

View File

@ -80,6 +80,7 @@ class Gmm {
void setupImageResourceParams(ImageInfo &imgInfo, bool preferCompressed);
bool extraMemoryFlagsRequired();
void applyExtraMemoryFlags(const StorageInfo &storageInfo);
void applyExtraInitFlag();
void applyDebugOverrides();
GmmHelper *gmmHelper = nullptr;

View File

@ -13,3 +13,4 @@ using namespace NEO;
// Intentionally empty in this variant: storageInfo is ignored and no extra memory flags are applied.
void Gmm::applyExtraMemoryFlags(const StorageInfo &storageInfo) {}
// Always reports false in this variant.
bool Gmm::extraMemoryFlagsRequired() { return false; }
// Intentionally empty in this variant: storageInfo is ignored and nothing is modified.
void Gmm::applyAppResource(const StorageInfo &storageInfo) {}
// Intentionally empty in this variant: no extra init flag is applied.
void Gmm::applyExtraInitFlag() {}

View File

@ -819,7 +819,9 @@ bool Wddm::destroyAllocations(const D3DKMT_HANDLE *handles, uint32_t allocationC
if ((0U == allocationCount) && (0U == resourceHandle)) {
return true;
}
NTSTATUS status = STATUS_SUCCESS;
D3DKMT_DESTROYALLOCATION2 destroyAllocation = {};
DEBUG_BREAK_IF(!(allocationCount <= 1 || resourceHandle == 0));
@ -827,10 +829,15 @@ bool Wddm::destroyAllocations(const D3DKMT_HANDLE *handles, uint32_t allocationC
destroyAllocation.hResource = resourceHandle;
destroyAllocation.phAllocationList = handles;
destroyAllocation.AllocationCount = allocationCount;
destroyAllocation.Flags.AssumeNotInUse = debugManager.flags.SetAssumeNotInUse.get();
destroyAllocation.Flags.AssumeNotInUse = 1;
DeallocateGmm deallocateGmm{&destroyAllocation, getGdi()};
status = getGdi()->destroyAllocation2(&destroyAllocation);
if (debugManager.flags.DestroyAllocationsViaGmm.get()) {
status = static_cast<NTSTATUS>(this->rootDeviceEnvironment.getGmmClientContext()->deallocate2(&deallocateGmm));
} else {
status = getGdi()->destroyAllocation2(&destroyAllocation);
}
return status == STATUS_SUCCESS;
}

View File

@ -22,6 +22,7 @@ class MockGmmClientContextBase : public GmmClientContext {
GMM_RESOURCE_INFO *createResInfoObject(GMM_RESCREATE_PARAMS *pCreateParams) override;
GMM_RESOURCE_INFO *copyResInfoObject(GMM_RESOURCE_INFO *pSrcRes) override;
void destroyResInfoObject(GMM_RESOURCE_INFO *pResInfo) override;
long deallocate2(DeallocateGmm *deallocateGmm) override;
uint8_t getSurfaceStateCompressionFormat(GMM_RESOURCE_FORMAT format) override;
uint8_t getMediaSurfaceStateCompressionFormat(GMM_RESOURCE_FORMAT format) override;
void setGmmDeviceInfo(GMM_DEVICE_INFO *deviceInfo) override;

View File

@ -12,4 +12,7 @@ uint64_t MockGmmClientContextBase::mapGpuVirtualAddress(MapGpuVirtualAddressGmm
mapGpuVirtualAddressCalled++;
return 0;
}
// Mock stub: the request is ignored and 0 is always returned.
long MockGmmClientContextBase::deallocate2(DeallocateGmm *deallocateGmm) {
    constexpr long stubStatus = 0;
    return stubStatus;
}
} // namespace NEO

View File

@ -14,4 +14,7 @@ uint64_t MockGmmClientContextBase::mapGpuVirtualAddress(MapGpuVirtualAddressGmm
mapGpuVirtualAddressCalled++;
return pMapGpuVa->gdi->mapGpuVirtualAddress(pMapGpuVa->mapGpuVirtualAddressParams);
}
// Mock implementation: hands the destroy parameters to the GDI entry point
// carried by deallocateGmm and returns that call's status.
// deallocateGmm and its gdi member are dereferenced without a null check.
long MockGmmClientContextBase::deallocate2(DeallocateGmm *deallocateGmm) {
    auto *gdiInterface = deallocateGmm->gdi;
    auto *destroyParams = deallocateGmm->destroyAllocation2;
    return gdiInterface->destroyAllocation2(destroyParams);
}
} // namespace NEO

View File

@ -14,4 +14,7 @@ uint64_t MockGmmClientContextBase::mapGpuVirtualAddress(MapGpuVirtualAddressGmm
mapGpuVirtualAddressCalled++;
return pMapGpuVa->gdi->mapGpuVirtualAddress(pMapGpuVa->mapGpuVirtualAddressParams);
}
// Mock implementation: routes the destroy request to the GDI entry point held
// in deallocateGmm and propagates the resulting status.
// deallocateGmm and its gdi member are dereferenced without a null check.
long MockGmmClientContextBase::deallocate2(DeallocateGmm *deallocateGmm) {
    auto *gdiInterface = deallocateGmm->gdi;
    auto *destroyParams = deallocateGmm->destroyAllocation2;
    return gdiInterface->destroyAllocation2(destroyParams);
}
} // namespace NEO

View File

@ -541,6 +541,7 @@ PrintBOChunkingLogs = 0
EnableBOChunkingPrefetch = 0
EnableBOChunkingDevMemPrefetch = 0
EnableBOChunkingPreferredLocationHint = 0
DestroyAllocationsViaGmm = 0
NumberOfBOChunks = 2
SetBOChunkingSize = -1
EnableBOChunking = -1
@ -623,6 +624,7 @@ DeferStateInitSubmissionToFirstRegularUsage = -1
WaitForPagingFenceInController = -1
DirectSubmissionPrintSemaphoreUsage = -1
ForceNonCoherentModeForTimestamps = 0
SetAssumeNotInUse = 1
ExperimentalUSMAllocationReuseVersion = -1
ForceNonWalkerSplitMemoryCopy = -1
DirectSubmissionSwitchSemaphoreMode = -1

View File

@ -394,6 +394,30 @@ TEST_F(Wddm20WithMockGdiDllTests, GivenThreeOsHandlesWhenAskedForDestroyAllocati
EXPECT_EQ(1u, ptrToDestroyAlloc2->Flags.AssumeNotInUse);
}
// With the SetAssumeNotInUse debug flag set to false, destroying a single
// allocation handle must succeed and the recorded D3DKMT_DESTROYALLOCATION2
// must have Flags.AssumeNotInUse == 0.
// NOTE(review): getSizesFcn presumably returns the parameters of the last
// destroy call seen by the mock GDI - confirm against the fixture.
TEST_F(Wddm20WithMockGdiDllTests, GivenSetAssumeNotInUseSetToFalseWhenDestroyAllocationsThenAssumeNotInUseNotSet) {
    DebugManagerStateRestore restorer;
    debugManager.flags.SetAssumeNotInUse.set(false);

    D3DKMT_HANDLE handles[1] = {ALLOCATION_HANDLE};
    bool retVal = wddm->destroyAllocations(handles, 1, 0);
    EXPECT_TRUE(retVal);

    auto destroyWithResourceHandleCalled = 0u;
    D3DKMT_DESTROYALLOCATION2 *ptrToDestroyAlloc2 = nullptr;
    getSizesFcn(destroyWithResourceHandleCalled, ptrToDestroyAlloc2);

    // Stop here with a test failure rather than dereferencing a null pointer
    // if no destroy parameters were recorded.
    ASSERT_NE(nullptr, ptrToDestroyAlloc2);
    EXPECT_EQ(0u, ptrToDestroyAlloc2->Flags.AssumeNotInUse);
}
TEST_F(Wddm20Tests, WhenMappingAndFreeingGpuVaThenReturnIsCorrect) {
OsAgnosticMemoryManager mm(*executionEnvironment);
auto gmmHelper = getGmmHelper();