Fix L3 caching for unaligned memory; allow hostPtrCopy only for source host pointers

Signed-off-by: Spruit, Neil R <neil.r.spruit@intel.com>
This commit is contained in:
Spruit, Neil R
2021-11-09 23:25:07 +00:00
committed by Compute-Runtime-Automation
parent 35795357e9
commit 9c6433e55e
23 changed files with 132 additions and 40 deletions

View File

@@ -71,12 +71,12 @@ NEO::GraphicsAllocation *CommandList::getAllocationFromHostPtrMap(const void *bu
return nullptr;
}
NEO::GraphicsAllocation *CommandList::getHostPtrAlloc(const void *buffer, uint64_t bufferSize) {
NEO::GraphicsAllocation *CommandList::getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed) {
NEO::GraphicsAllocation *alloc = getAllocationFromHostPtrMap(buffer, bufferSize);
if (alloc) {
return alloc;
}
alloc = device->allocateMemoryFromHostPtr(buffer, bufferSize);
alloc = device->allocateMemoryFromHostPtr(buffer, bufferSize, hostCopyAllowed);
UNRECOVERABLE_IF(alloc == nullptr);
if (this->cmdListType == CommandListType::TYPE_IMMEDIATE && this->isFlushTaskSubmissionEnabled) {
this->csr->getInternalAllocationStorage()->storeAllocation(std::unique_ptr<NEO::GraphicsAllocation>(alloc), NEO::AllocationUsage::TEMPORARY_ALLOCATION);

View File

@@ -250,10 +250,11 @@ struct CommandList : _ze_command_list_handle_t {
bool isFlushTaskSubmissionEnabled = false;
bool isSyncModeQueue = false;
bool commandListSLMEnabled = false;
bool requiresUncachedMOCS = false;
protected:
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize);
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed);
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;

View File

@@ -245,7 +245,7 @@ struct CommandListCoreFamily : CommandListImp {
NEO::PipeControlArgs createBarrierFlags();
uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region);
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize);
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize, bool hostCopyAllowed);
ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
bool containsAnyKernel = false;

View File

@@ -409,7 +409,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendImageCopyFromMemory(ze_i
uint64_t bufferSize = getInputBufferSize(image->getImageInfo().imgDesc.imageType, bytesPerPixel, pDstRegion);
auto allocationStruct = getAlignedAllocation(this->device, srcPtr, bufferSize);
auto allocationStruct = getAlignedAllocation(this->device, srcPtr, bufferSize, true);
auto rowPitch = pDstRegion->width * bytesPerPixel;
auto slicePitch =
@@ -525,7 +525,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendImageCopyToMemory(void *
uint64_t bufferSize = getInputBufferSize(image->getImageInfo().imgDesc.imageType, bytesPerPixel, pSrcRegion);
auto allocationStruct = getAlignedAllocation(this->device, dstPtr, bufferSize);
auto allocationStruct = getAlignedAllocation(this->device, dstPtr, bufferSize, false);
auto rowPitch = pSrcRegion->width * bytesPerPixel;
auto slicePitch =
@@ -1083,8 +1083,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
DEBUG_BREAK_IF(size != leftSize + middleSizeBytes + rightSize);
auto dstAllocationStruct = getAlignedAllocation(this->device, dstptr, size);
auto srcAllocationStruct = getAlignedAllocation(this->device, srcptr, size);
auto dstAllocationStruct = getAlignedAllocation(this->device, dstptr, size, false);
auto srcAllocationStruct = getAlignedAllocation(this->device, srcptr, size, true);
if (size >= 4ull * MemoryConstants::gigaByte) {
isStateless = true;
@@ -1208,8 +1208,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(void *d
srcSize = (srcRegion->width * srcRegion->height) + hostPtrSrcOffset;
}
auto dstAllocationStruct = getAlignedAllocation(this->device, dstPtr, dstSize);
auto srcAllocationStruct = getAlignedAllocation(this->device, srcPtr, srcSize);
auto dstAllocationStruct = getAlignedAllocation(this->device, dstPtr, dstSize, false);
auto srcAllocationStruct = getAlignedAllocation(this->device, srcPtr, srcSize, true);
dstSize += dstAllocationStruct.offset;
srcSize += srcAllocationStruct.offset;
@@ -1436,7 +1436,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
}
}
auto dstAllocation = this->getAlignedAllocation(this->device, ptr, size);
auto dstAllocation = this->getAlignedAllocation(this->device, ptr, size, false);
if (size >= 4ull * MemoryConstants::gigaByte) {
isStateless = true;
}
@@ -1718,7 +1718,8 @@ inline uint64_t CommandListCoreFamily<gfxCoreFamily>::getInputBufferSize(NEO::Im
template <GFXCORE_FAMILY gfxCoreFamily>
inline AlignedAllocationData CommandListCoreFamily<gfxCoreFamily>::getAlignedAllocation(Device *device,
const void *buffer,
uint64_t bufferSize) {
uint64_t bufferSize,
bool hostCopyAllowed) {
NEO::SvmAllocationData *allocData = nullptr;
void *ptr = const_cast<void *>(buffer);
bool srcAllocFound = device->getDriverHandle()->findAllocationDataForRange(ptr,
@@ -1740,7 +1741,7 @@ inline AlignedAllocationData CommandListCoreFamily<gfxCoreFamily>::getAlignedAll
//get offset from base of allocation to arg address
offset += reinterpret_cast<size_t>(ptr) - reinterpret_cast<size_t>(alloc->getUnderlyingBuffer());
} else {
alloc = getHostPtrAlloc(buffer, bufferSize);
alloc = getHostPtrAlloc(buffer, bufferSize, hostCopyAllowed);
alignedPtr = static_cast<uintptr_t>(alignDown(alloc->getGpuAddress(), NEO::EncodeSurfaceState<GfxFamily>::getSurfaceBaseAddressAlignment()));
}
@@ -2035,7 +2036,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteGlobalTimestamp(
CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(hSignalEvent);
}
auto allocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(uint64_t));
auto allocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(uint64_t), false);
commandContainer.addToResidencyContainer(allocationStruct.alloc);
return ZE_RESULT_SUCCESS;
@@ -2055,7 +2056,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
const size_t *pOffsets, ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents);
auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents, false);
commandContainer.addToResidencyContainer(dstptrAllocationStruct.alloc);
std::unique_ptr<EventData[]> timestampsData = std::make_unique<EventData[]>(numEvents);
@@ -2098,7 +2099,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestamps);
builtinFunction->setArgumentValue(2u, sizeof(uint32_t), &useOnlyGlobalTimestamps);
} else {
auto pOffsetAllocationStruct = getAlignedAllocation(this->device, pOffsets, sizeof(size_t) * numEvents);
auto pOffsetAllocationStruct = getAlignedAllocation(this->device, pOffsets, sizeof(size_t) * numEvents, false);
auto offsetValPtr = static_cast<uintptr_t>(pOffsetAllocationStruct.alloc->getGpuAddress());
commandContainer.addToResidencyContainer(pOffsetAllocationStruct.alloc);
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestampsWithOffsets);

View File

@@ -109,6 +109,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
KernelImp *kernelImp = static_cast<KernelImp *>(kernel);
this->containsStatelessUncachedResource |= kernelImp->getKernelRequiresUncachedMocs();
this->requiresUncachedMOCS = this->containsStatelessUncachedResource;
uint32_t partitionCount = 0;
NEO::Device *neoDevice = device->getNEODevice();

View File

@@ -82,6 +82,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
auto anyCommandListWithCooperativeKernels = false;
auto anyCommandListWithoutCooperativeKernels = false;
cachedMOCSAllowed = true;
for (auto i = 0u; i < numCommandLists; i++) {
auto commandList = CommandList::fromHandle(phCommandLists[i]);
if (peekIsCopyOnlyCommandQueue() != commandList->isCopyOnly()) {
@@ -97,6 +99,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
} else {
anyCommandListWithoutCooperativeKernels = true;
}
// If the Command List has commands that require uncached MOCS, then any changes to the commands in the queue require the uncached MOCS.
if (commandList->requiresUncachedMOCS && cachedMOCSAllowed == true) {
cachedMOCSAllowed = false;
}
}
bool isMixingRegularAndCooperativeKernelsAllowed = NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get();

View File

@@ -58,7 +58,7 @@ void CommandQueueHw<gfxCoreFamily>::programStateBaseAddress(uint64_t gsba, bool
nullptr,
gsba,
true,
(device->getMOCS(true, false) >> 1),
(device->getMOCS(cachedMOCSAllowed, false) >> 1),
neoDevice->getMemoryManager()->getInternalHeapBaseAddress(device->getRootDeviceIndex(), useLocalMemoryForIndirectHeap),
neoDevice->getMemoryManager()->getInternalHeapBaseAddress(device->getRootDeviceIndex(), neoDevice->getMemoryManager()->isLocalMemoryUsedForIsa(neoDevice->getRootDeviceIndex())),
globalHeapsBase,

View File

@@ -81,6 +81,7 @@ struct CommandQueueImp : public CommandQueue {
ze_command_queue_mode_t getSynchronousMode() const;
virtual void dispatchTaskCountWrite(NEO::LinearStream &commandStream, bool flushDataCache) = 0;
virtual bool getPreemptionCmdProgramming() = 0;
bool cachedMOCSAllowed = true;
protected:
MOCKABLE_VIRTUAL int submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,

View File

@@ -38,7 +38,7 @@ void CommandQueueHw<gfxCoreFamily>::programStateBaseAddress(uint64_t gsba, bool
nullptr,
0,
true,
(device->getMOCS(true, false) >> 1),
(device->getMOCS(cachedMOCSAllowed, false) >> 1),
neoDevice->getMemoryManager()->getInternalHeapBaseAddress(neoDevice->getRootDeviceIndex(), useLocalMemoryForIndirectHeap),
neoDevice->getMemoryManager()->getInternalHeapBaseAddress(neoDevice->getRootDeviceIndex(), neoDevice->getMemoryManager()->isLocalMemoryUsedForIsa(neoDevice->getRootDeviceIndex())),
globalHeapsBase,

View File

@@ -129,7 +129,7 @@ struct Device : _ze_device_handle_t {
virtual NEO::GraphicsAllocation *allocateManagedMemoryFromHostPtr(void *buffer,
size_t size, struct CommandList *commandList) = 0;
virtual NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size) = 0;
virtual NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size, bool hostCopyAllowed) = 0;
virtual void setSysmanHandle(SysmanDevice *pSysmanDevice) = 0;
virtual SysmanDevice *getSysmanHandle() = 0;
virtual ze_result_t getCsrForOrdinalAndIndex(NEO::CommandStreamReceiver **csr, uint32_t ordinal, uint32_t index) = 0;

View File

@@ -990,14 +990,14 @@ NEO::GraphicsAllocation *DeviceImp::allocateManagedMemoryFromHostPtr(void *buffe
return allocation;
}
NEO::GraphicsAllocation *DeviceImp::allocateMemoryFromHostPtr(const void *buffer, size_t size) {
NEO::GraphicsAllocation *DeviceImp::allocateMemoryFromHostPtr(const void *buffer, size_t size, bool hostCopyAllowed) {
NEO::AllocationProperties properties = {getRootDeviceIndex(), false, size,
NEO::GraphicsAllocation::AllocationType::EXTERNAL_HOST_PTR,
false, neoDevice->getDeviceBitfield()};
properties.flags.flushL3RequiredForRead = properties.flags.flushL3RequiredForWrite = true;
auto allocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(properties,
buffer);
if (allocation == nullptr) {
if (allocation == nullptr && hostCopyAllowed) {
allocation = neoDevice->getMemoryManager()->allocateInternalGraphicsMemoryWithHostCopy(neoDevice->getRootDeviceIndex(),
neoDevice->getDeviceBitfield(),
buffer,

View File

@@ -91,7 +91,7 @@ struct DeviceImp : public Device {
void setDebugSurface(NEO::GraphicsAllocation *debugSurface) { this->debugSurface = debugSurface; };
~DeviceImp() override;
NEO::GraphicsAllocation *allocateManagedMemoryFromHostPtr(void *buffer, size_t size, struct CommandList *commandList) override;
NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size) override;
NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size, bool hostCopyAllowed) override;
void setSysmanHandle(SysmanDevice *pSysman) override;
SysmanDevice *getSysmanHandle() override;
ze_result_t getCsrForOrdinalAndIndex(NEO::CommandStreamReceiver **csr, uint32_t ordinal, uint32_t index) override;

View File

@@ -11,6 +11,7 @@
#include "shared/source/gmm_helper/gmm.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/bindless_heaps_helper.h"
#include "shared/source/helpers/cache_policy.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/string.h"
#include "shared/source/kernel/implicit_args.h"
@@ -62,10 +63,20 @@ struct KernelHw : public KernelImp {
bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment);
bool l3Enabled = true;
// Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching; otherwise, Heap corruption coming from the KMD will occur.
// Most commonly this issue will occur with Host Pointer Allocations from customers.
l3Enabled = isL3Capable(*alloc);
auto allocData = this->module->getDevice()->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
l3Enabled = false;
}
if (l3Enabled == false) {
this->kernelRequiresUncachedMocsCount++;
}
NEO::Device *neoDevice = module->getDevice()->getNEODevice();
NEO::EncodeSurfaceStateArgs args;