mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
Fix L3 caching for unaligned memory and allow hostPtrCopy only for source host pointers
Signed-off-by: Spruit, Neil R <neil.r.spruit@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
35795357e9
commit
9c6433e55e
@@ -71,12 +71,12 @@ NEO::GraphicsAllocation *CommandList::getAllocationFromHostPtrMap(const void *bu
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
NEO::GraphicsAllocation *CommandList::getHostPtrAlloc(const void *buffer, uint64_t bufferSize) {
|
||||
NEO::GraphicsAllocation *CommandList::getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed) {
|
||||
NEO::GraphicsAllocation *alloc = getAllocationFromHostPtrMap(buffer, bufferSize);
|
||||
if (alloc) {
|
||||
return alloc;
|
||||
}
|
||||
alloc = device->allocateMemoryFromHostPtr(buffer, bufferSize);
|
||||
alloc = device->allocateMemoryFromHostPtr(buffer, bufferSize, hostCopyAllowed);
|
||||
UNRECOVERABLE_IF(alloc == nullptr);
|
||||
if (this->cmdListType == CommandListType::TYPE_IMMEDIATE && this->isFlushTaskSubmissionEnabled) {
|
||||
this->csr->getInternalAllocationStorage()->storeAllocation(std::unique_ptr<NEO::GraphicsAllocation>(alloc), NEO::AllocationUsage::TEMPORARY_ALLOCATION);
|
||||
|
||||
@@ -250,10 +250,11 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
bool isFlushTaskSubmissionEnabled = false;
|
||||
bool isSyncModeQueue = false;
|
||||
bool commandListSLMEnabled = false;
|
||||
bool requiresUncachedMOCS = false;
|
||||
|
||||
protected:
|
||||
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
|
||||
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize);
|
||||
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed);
|
||||
|
||||
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
|
||||
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
|
||||
|
||||
@@ -245,7 +245,7 @@ struct CommandListCoreFamily : CommandListImp {
|
||||
NEO::PipeControlArgs createBarrierFlags();
|
||||
|
||||
uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region);
|
||||
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize);
|
||||
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize, bool hostCopyAllowed);
|
||||
ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
|
||||
|
||||
bool containsAnyKernel = false;
|
||||
|
||||
@@ -409,7 +409,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendImageCopyFromMemory(ze_i
|
||||
|
||||
uint64_t bufferSize = getInputBufferSize(image->getImageInfo().imgDesc.imageType, bytesPerPixel, pDstRegion);
|
||||
|
||||
auto allocationStruct = getAlignedAllocation(this->device, srcPtr, bufferSize);
|
||||
auto allocationStruct = getAlignedAllocation(this->device, srcPtr, bufferSize, true);
|
||||
|
||||
auto rowPitch = pDstRegion->width * bytesPerPixel;
|
||||
auto slicePitch =
|
||||
@@ -525,7 +525,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendImageCopyToMemory(void *
|
||||
|
||||
uint64_t bufferSize = getInputBufferSize(image->getImageInfo().imgDesc.imageType, bytesPerPixel, pSrcRegion);
|
||||
|
||||
auto allocationStruct = getAlignedAllocation(this->device, dstPtr, bufferSize);
|
||||
auto allocationStruct = getAlignedAllocation(this->device, dstPtr, bufferSize, false);
|
||||
|
||||
auto rowPitch = pSrcRegion->width * bytesPerPixel;
|
||||
auto slicePitch =
|
||||
@@ -1083,8 +1083,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
|
||||
|
||||
DEBUG_BREAK_IF(size != leftSize + middleSizeBytes + rightSize);
|
||||
|
||||
auto dstAllocationStruct = getAlignedAllocation(this->device, dstptr, size);
|
||||
auto srcAllocationStruct = getAlignedAllocation(this->device, srcptr, size);
|
||||
auto dstAllocationStruct = getAlignedAllocation(this->device, dstptr, size, false);
|
||||
auto srcAllocationStruct = getAlignedAllocation(this->device, srcptr, size, true);
|
||||
|
||||
if (size >= 4ull * MemoryConstants::gigaByte) {
|
||||
isStateless = true;
|
||||
@@ -1208,8 +1208,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(void *d
|
||||
srcSize = (srcRegion->width * srcRegion->height) + hostPtrSrcOffset;
|
||||
}
|
||||
|
||||
auto dstAllocationStruct = getAlignedAllocation(this->device, dstPtr, dstSize);
|
||||
auto srcAllocationStruct = getAlignedAllocation(this->device, srcPtr, srcSize);
|
||||
auto dstAllocationStruct = getAlignedAllocation(this->device, dstPtr, dstSize, false);
|
||||
auto srcAllocationStruct = getAlignedAllocation(this->device, srcPtr, srcSize, true);
|
||||
|
||||
dstSize += dstAllocationStruct.offset;
|
||||
srcSize += srcAllocationStruct.offset;
|
||||
@@ -1436,7 +1436,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
|
||||
}
|
||||
}
|
||||
|
||||
auto dstAllocation = this->getAlignedAllocation(this->device, ptr, size);
|
||||
auto dstAllocation = this->getAlignedAllocation(this->device, ptr, size, false);
|
||||
if (size >= 4ull * MemoryConstants::gigaByte) {
|
||||
isStateless = true;
|
||||
}
|
||||
@@ -1718,7 +1718,8 @@ inline uint64_t CommandListCoreFamily<gfxCoreFamily>::getInputBufferSize(NEO::Im
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
inline AlignedAllocationData CommandListCoreFamily<gfxCoreFamily>::getAlignedAllocation(Device *device,
|
||||
const void *buffer,
|
||||
uint64_t bufferSize) {
|
||||
uint64_t bufferSize,
|
||||
bool hostCopyAllowed) {
|
||||
NEO::SvmAllocationData *allocData = nullptr;
|
||||
void *ptr = const_cast<void *>(buffer);
|
||||
bool srcAllocFound = device->getDriverHandle()->findAllocationDataForRange(ptr,
|
||||
@@ -1740,7 +1741,7 @@ inline AlignedAllocationData CommandListCoreFamily<gfxCoreFamily>::getAlignedAll
|
||||
//get offset from base of allocation to arg address
|
||||
offset += reinterpret_cast<size_t>(ptr) - reinterpret_cast<size_t>(alloc->getUnderlyingBuffer());
|
||||
} else {
|
||||
alloc = getHostPtrAlloc(buffer, bufferSize);
|
||||
alloc = getHostPtrAlloc(buffer, bufferSize, hostCopyAllowed);
|
||||
alignedPtr = static_cast<uintptr_t>(alignDown(alloc->getGpuAddress(), NEO::EncodeSurfaceState<GfxFamily>::getSurfaceBaseAddressAlignment()));
|
||||
}
|
||||
|
||||
@@ -2035,7 +2036,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteGlobalTimestamp(
|
||||
CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(hSignalEvent);
|
||||
}
|
||||
|
||||
auto allocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(uint64_t));
|
||||
auto allocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(uint64_t), false);
|
||||
commandContainer.addToResidencyContainer(allocationStruct.alloc);
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
@@ -2055,7 +2056,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
|
||||
const size_t *pOffsets, ze_event_handle_t hSignalEvent,
|
||||
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
|
||||
|
||||
auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents);
|
||||
auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents, false);
|
||||
commandContainer.addToResidencyContainer(dstptrAllocationStruct.alloc);
|
||||
|
||||
std::unique_ptr<EventData[]> timestampsData = std::make_unique<EventData[]>(numEvents);
|
||||
@@ -2098,7 +2099,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
|
||||
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestamps);
|
||||
builtinFunction->setArgumentValue(2u, sizeof(uint32_t), &useOnlyGlobalTimestamps);
|
||||
} else {
|
||||
auto pOffsetAllocationStruct = getAlignedAllocation(this->device, pOffsets, sizeof(size_t) * numEvents);
|
||||
auto pOffsetAllocationStruct = getAlignedAllocation(this->device, pOffsets, sizeof(size_t) * numEvents, false);
|
||||
auto offsetValPtr = static_cast<uintptr_t>(pOffsetAllocationStruct.alloc->getGpuAddress());
|
||||
commandContainer.addToResidencyContainer(pOffsetAllocationStruct.alloc);
|
||||
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestampsWithOffsets);
|
||||
|
||||
@@ -109,6 +109,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
|
||||
|
||||
KernelImp *kernelImp = static_cast<KernelImp *>(kernel);
|
||||
this->containsStatelessUncachedResource |= kernelImp->getKernelRequiresUncachedMocs();
|
||||
this->requiresUncachedMOCS = this->containsStatelessUncachedResource;
|
||||
uint32_t partitionCount = 0;
|
||||
|
||||
NEO::Device *neoDevice = device->getNEODevice();
|
||||
|
||||
@@ -82,6 +82,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
auto anyCommandListWithCooperativeKernels = false;
|
||||
auto anyCommandListWithoutCooperativeKernels = false;
|
||||
|
||||
cachedMOCSAllowed = true;
|
||||
|
||||
for (auto i = 0u; i < numCommandLists; i++) {
|
||||
auto commandList = CommandList::fromHandle(phCommandLists[i]);
|
||||
if (peekIsCopyOnlyCommandQueue() != commandList->isCopyOnly()) {
|
||||
@@ -97,6 +99,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
} else {
|
||||
anyCommandListWithoutCooperativeKernels = true;
|
||||
}
|
||||
// If any command list in this submission requires uncached MOCS, cached MOCS must be disabled for the whole queue.
|
||||
if (commandList->requiresUncachedMOCS && cachedMOCSAllowed == true) {
|
||||
cachedMOCSAllowed = false;
|
||||
}
|
||||
}
|
||||
|
||||
bool isMixingRegularAndCooperativeKernelsAllowed = NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get();
|
||||
|
||||
@@ -58,7 +58,7 @@ void CommandQueueHw<gfxCoreFamily>::programStateBaseAddress(uint64_t gsba, bool
|
||||
nullptr,
|
||||
gsba,
|
||||
true,
|
||||
(device->getMOCS(true, false) >> 1),
|
||||
(device->getMOCS(cachedMOCSAllowed, false) >> 1),
|
||||
neoDevice->getMemoryManager()->getInternalHeapBaseAddress(device->getRootDeviceIndex(), useLocalMemoryForIndirectHeap),
|
||||
neoDevice->getMemoryManager()->getInternalHeapBaseAddress(device->getRootDeviceIndex(), neoDevice->getMemoryManager()->isLocalMemoryUsedForIsa(neoDevice->getRootDeviceIndex())),
|
||||
globalHeapsBase,
|
||||
|
||||
@@ -81,6 +81,7 @@ struct CommandQueueImp : public CommandQueue {
|
||||
ze_command_queue_mode_t getSynchronousMode() const;
|
||||
virtual void dispatchTaskCountWrite(NEO::LinearStream &commandStream, bool flushDataCache) = 0;
|
||||
virtual bool getPreemptionCmdProgramming() = 0;
|
||||
bool cachedMOCSAllowed = true;
|
||||
|
||||
protected:
|
||||
MOCKABLE_VIRTUAL int submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,
|
||||
|
||||
@@ -38,7 +38,7 @@ void CommandQueueHw<gfxCoreFamily>::programStateBaseAddress(uint64_t gsba, bool
|
||||
nullptr,
|
||||
0,
|
||||
true,
|
||||
(device->getMOCS(true, false) >> 1),
|
||||
(device->getMOCS(cachedMOCSAllowed, false) >> 1),
|
||||
neoDevice->getMemoryManager()->getInternalHeapBaseAddress(neoDevice->getRootDeviceIndex(), useLocalMemoryForIndirectHeap),
|
||||
neoDevice->getMemoryManager()->getInternalHeapBaseAddress(neoDevice->getRootDeviceIndex(), neoDevice->getMemoryManager()->isLocalMemoryUsedForIsa(neoDevice->getRootDeviceIndex())),
|
||||
globalHeapsBase,
|
||||
|
||||
@@ -129,7 +129,7 @@ struct Device : _ze_device_handle_t {
|
||||
virtual NEO::GraphicsAllocation *allocateManagedMemoryFromHostPtr(void *buffer,
|
||||
size_t size, struct CommandList *commandList) = 0;
|
||||
|
||||
virtual NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size) = 0;
|
||||
virtual NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size, bool hostCopyAllowed) = 0;
|
||||
virtual void setSysmanHandle(SysmanDevice *pSysmanDevice) = 0;
|
||||
virtual SysmanDevice *getSysmanHandle() = 0;
|
||||
virtual ze_result_t getCsrForOrdinalAndIndex(NEO::CommandStreamReceiver **csr, uint32_t ordinal, uint32_t index) = 0;
|
||||
|
||||
@@ -990,14 +990,14 @@ NEO::GraphicsAllocation *DeviceImp::allocateManagedMemoryFromHostPtr(void *buffe
|
||||
return allocation;
|
||||
}
|
||||
|
||||
NEO::GraphicsAllocation *DeviceImp::allocateMemoryFromHostPtr(const void *buffer, size_t size) {
|
||||
NEO::GraphicsAllocation *DeviceImp::allocateMemoryFromHostPtr(const void *buffer, size_t size, bool hostCopyAllowed) {
|
||||
NEO::AllocationProperties properties = {getRootDeviceIndex(), false, size,
|
||||
NEO::GraphicsAllocation::AllocationType::EXTERNAL_HOST_PTR,
|
||||
false, neoDevice->getDeviceBitfield()};
|
||||
properties.flags.flushL3RequiredForRead = properties.flags.flushL3RequiredForWrite = true;
|
||||
auto allocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(properties,
|
||||
buffer);
|
||||
if (allocation == nullptr) {
|
||||
if (allocation == nullptr && hostCopyAllowed) {
|
||||
allocation = neoDevice->getMemoryManager()->allocateInternalGraphicsMemoryWithHostCopy(neoDevice->getRootDeviceIndex(),
|
||||
neoDevice->getDeviceBitfield(),
|
||||
buffer,
|
||||
|
||||
@@ -91,7 +91,7 @@ struct DeviceImp : public Device {
|
||||
void setDebugSurface(NEO::GraphicsAllocation *debugSurface) { this->debugSurface = debugSurface; };
|
||||
~DeviceImp() override;
|
||||
NEO::GraphicsAllocation *allocateManagedMemoryFromHostPtr(void *buffer, size_t size, struct CommandList *commandList) override;
|
||||
NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size) override;
|
||||
NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size, bool hostCopyAllowed) override;
|
||||
void setSysmanHandle(SysmanDevice *pSysman) override;
|
||||
SysmanDevice *getSysmanHandle() override;
|
||||
ze_result_t getCsrForOrdinalAndIndex(NEO::CommandStreamReceiver **csr, uint32_t ordinal, uint32_t index) override;
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "shared/source/gmm_helper/gmm.h"
|
||||
#include "shared/source/gmm_helper/gmm_helper.h"
|
||||
#include "shared/source/helpers/bindless_heaps_helper.h"
|
||||
#include "shared/source/helpers/cache_policy.h"
|
||||
#include "shared/source/helpers/hw_helper.h"
|
||||
#include "shared/source/helpers/string.h"
|
||||
#include "shared/source/kernel/implicit_args.h"
|
||||
@@ -62,10 +63,20 @@ struct KernelHw : public KernelImp {
|
||||
bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment);
|
||||
|
||||
bool l3Enabled = true;
|
||||
|
||||
// The allocation MUST be cacheline (64-byte) aligned to enable L3 caching; otherwise heap corruption coming from the KMD will occur.
|
||||
// This issue most commonly occurs with host pointer allocations supplied by customers.
|
||||
l3Enabled = isL3Capable(*alloc);
|
||||
|
||||
auto allocData = this->module->getDevice()->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
|
||||
if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
|
||||
l3Enabled = false;
|
||||
}
|
||||
|
||||
if (l3Enabled == false) {
|
||||
this->kernelRequiresUncachedMocsCount++;
|
||||
}
|
||||
|
||||
NEO::Device *neoDevice = module->getDevice()->getNEODevice();
|
||||
|
||||
NEO::EncodeSurfaceStateArgs args;
|
||||
|
||||
Reference in New Issue
Block a user