From 9c6433e55e0dc4b0a457a7aa90213b453a6c7e4b Mon Sep 17 00:00:00 2001 From: "Spruit, Neil R" Date: Tue, 9 Nov 2021 23:25:07 +0000 Subject: [PATCH] Fix to L3 Caching given unaligned memory & hostPtrCopy only for src host ptrs Signed-off-by: Spruit, Neil R --- level_zero/core/source/cmdlist/cmdlist.cpp | 4 +-- level_zero/core/source/cmdlist/cmdlist.h | 3 +- level_zero/core/source/cmdlist/cmdlist_hw.h | 2 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 25 ++++++++------- .../core/source/cmdlist/cmdlist_hw_base.inl | 1 + .../core/source/cmdqueue/cmdqueue_hw.inl | 6 ++++ .../core/source/cmdqueue/cmdqueue_hw_base.inl | 2 +- .../core/source/cmdqueue/cmdqueue_imp.h | 1 + .../cmdqueue_xe_hp_core_and_later.inl | 2 +- level_zero/core/source/device/device.h | 2 +- level_zero/core/source/device/device_imp.cpp | 4 +-- level_zero/core/source/device/device_imp.h | 2 +- level_zero/core/source/kernel/kernel_hw.h | 11 +++++++ .../core/test/unit_tests/mocks/mock_device.h | 2 +- .../sources/cmdlist/test_cmdlist_2.cpp | 4 +-- .../sources/cmdlist/test_cmdlist_3.cpp | 6 ++-- .../sources/cmdlist/test_cmdlist_4.cpp | 6 ++-- .../cmdlist/test_cmdlist_append_memory.cpp | 6 ++-- .../sources/cmdlist/test_cmdlist_blit.cpp | 2 +- .../test_cmdqueue_enqueue_cmdlist.cpp | 27 ++++++++++++++++ .../unit_tests/sources/device/test_device.cpp | 16 ++++++++-- .../unit_tests/sources/kernel/test_kernel.cpp | 32 +++++++++++++++++++ .../unit_tests/sources/memory/test_memory.cpp | 6 ++-- 23 files changed, 132 insertions(+), 40 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.cpp b/level_zero/core/source/cmdlist/cmdlist.cpp index 27bdf8a837..ccf4c5d7df 100644 --- a/level_zero/core/source/cmdlist/cmdlist.cpp +++ b/level_zero/core/source/cmdlist/cmdlist.cpp @@ -71,12 +71,12 @@ NEO::GraphicsAllocation *CommandList::getAllocationFromHostPtrMap(const void *bu return nullptr; } -NEO::GraphicsAllocation *CommandList::getHostPtrAlloc(const void *buffer, uint64_t bufferSize) { +NEO::GraphicsAllocation 
*CommandList::getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed) { NEO::GraphicsAllocation *alloc = getAllocationFromHostPtrMap(buffer, bufferSize); if (alloc) { return alloc; } - alloc = device->allocateMemoryFromHostPtr(buffer, bufferSize); + alloc = device->allocateMemoryFromHostPtr(buffer, bufferSize, hostCopyAllowed); UNRECOVERABLE_IF(alloc == nullptr); if (this->cmdListType == CommandListType::TYPE_IMMEDIATE && this->isFlushTaskSubmissionEnabled) { this->csr->getInternalAllocationStorage()->storeAllocation(std::unique_ptr(alloc), NEO::AllocationUsage::TEMPORARY_ALLOCATION); diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 9a43e7a047..1e416434a6 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -250,10 +250,11 @@ struct CommandList : _ze_command_list_handle_t { bool isFlushTaskSubmissionEnabled = false; bool isSyncModeQueue = false; bool commandListSLMEnabled = false; + bool requiresUncachedMOCS = false; protected: NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize); - NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize); + NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed); std::map hostPtrMap; std::vector ownedPrivateAllocations; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index ac2601a4bd..8849939261 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -245,7 +245,7 @@ struct CommandListCoreFamily : CommandListImp { NEO::PipeControlArgs createBarrierFlags(); uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region); - MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize); + 
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize, bool hostCopyAllowed); ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents); bool containsAnyKernel = false; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 8df4138aae..40c59100cc 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -409,7 +409,7 @@ ze_result_t CommandListCoreFamily::appendImageCopyFromMemory(ze_i uint64_t bufferSize = getInputBufferSize(image->getImageInfo().imgDesc.imageType, bytesPerPixel, pDstRegion); - auto allocationStruct = getAlignedAllocation(this->device, srcPtr, bufferSize); + auto allocationStruct = getAlignedAllocation(this->device, srcPtr, bufferSize, true); auto rowPitch = pDstRegion->width * bytesPerPixel; auto slicePitch = @@ -525,7 +525,7 @@ ze_result_t CommandListCoreFamily::appendImageCopyToMemory(void * uint64_t bufferSize = getInputBufferSize(image->getImageInfo().imgDesc.imageType, bytesPerPixel, pSrcRegion); - auto allocationStruct = getAlignedAllocation(this->device, dstPtr, bufferSize); + auto allocationStruct = getAlignedAllocation(this->device, dstPtr, bufferSize, false); auto rowPitch = pSrcRegion->width * bytesPerPixel; auto slicePitch = @@ -1083,8 +1083,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, DEBUG_BREAK_IF(size != leftSize + middleSizeBytes + rightSize); - auto dstAllocationStruct = getAlignedAllocation(this->device, dstptr, size); - auto srcAllocationStruct = getAlignedAllocation(this->device, srcptr, size); + auto dstAllocationStruct = getAlignedAllocation(this->device, dstptr, size, false); + auto srcAllocationStruct = getAlignedAllocation(this->device, srcptr, size, true); if (size >= 4ull * MemoryConstants::gigaByte) { isStateless = true; @@ -1208,8 +1208,8 @@ ze_result_t 
CommandListCoreFamily::appendMemoryCopyRegion(void *d srcSize = (srcRegion->width * srcRegion->height) + hostPtrSrcOffset; } - auto dstAllocationStruct = getAlignedAllocation(this->device, dstPtr, dstSize); - auto srcAllocationStruct = getAlignedAllocation(this->device, srcPtr, srcSize); + auto dstAllocationStruct = getAlignedAllocation(this->device, dstPtr, dstSize, false); + auto srcAllocationStruct = getAlignedAllocation(this->device, srcPtr, srcSize, true); dstSize += dstAllocationStruct.offset; srcSize += srcAllocationStruct.offset; @@ -1436,7 +1436,7 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, } } - auto dstAllocation = this->getAlignedAllocation(this->device, ptr, size); + auto dstAllocation = this->getAlignedAllocation(this->device, ptr, size, false); if (size >= 4ull * MemoryConstants::gigaByte) { isStateless = true; } @@ -1718,7 +1718,8 @@ inline uint64_t CommandListCoreFamily::getInputBufferSize(NEO::Im template inline AlignedAllocationData CommandListCoreFamily::getAlignedAllocation(Device *device, const void *buffer, - uint64_t bufferSize) { + uint64_t bufferSize, + bool hostCopyAllowed) { NEO::SvmAllocationData *allocData = nullptr; void *ptr = const_cast(buffer); bool srcAllocFound = device->getDriverHandle()->findAllocationDataForRange(ptr, @@ -1740,7 +1741,7 @@ inline AlignedAllocationData CommandListCoreFamily::getAlignedAll //get offset from base of allocation to arg address offset += reinterpret_cast(ptr) - reinterpret_cast(alloc->getUnderlyingBuffer()); } else { - alloc = getHostPtrAlloc(buffer, bufferSize); + alloc = getHostPtrAlloc(buffer, bufferSize, hostCopyAllowed); alignedPtr = static_cast(alignDown(alloc->getGpuAddress(), NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment())); } @@ -2035,7 +2036,7 @@ ze_result_t CommandListCoreFamily::appendWriteGlobalTimestamp( CommandListCoreFamily::appendSignalEventPostWalker(hSignalEvent); } - auto allocationStruct = getAlignedAllocation(this->device, dstptr, 
sizeof(uint64_t)); + auto allocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(uint64_t), false); commandContainer.addToResidencyContainer(allocationStruct.alloc); return ZE_RESULT_SUCCESS; @@ -2055,7 +2056,7 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( const size_t *pOffsets, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) { - auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents); + auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents, false); commandContainer.addToResidencyContainer(dstptrAllocationStruct.alloc); std::unique_ptr timestampsData = std::make_unique(numEvents); @@ -2098,7 +2099,7 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestamps); builtinFunction->setArgumentValue(2u, sizeof(uint32_t), &useOnlyGlobalTimestamps); } else { - auto pOffsetAllocationStruct = getAlignedAllocation(this->device, pOffsets, sizeof(size_t) * numEvents); + auto pOffsetAllocationStruct = getAlignedAllocation(this->device, pOffsets, sizeof(size_t) * numEvents, false); auto offsetValPtr = static_cast(pOffsetAllocationStruct.alloc->getGpuAddress()); commandContainer.addToResidencyContainer(pOffsetAllocationStruct.alloc); builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestampsWithOffsets); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index c5f0ec9e3e..da1abd0a86 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -109,6 +109,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z KernelImp *kernelImp = static_cast(kernel); this->containsStatelessUncachedResource |= 
kernelImp->getKernelRequiresUncachedMocs(); + this->requiresUncachedMOCS = this->containsStatelessUncachedResource; uint32_t partitionCount = 0; NEO::Device *neoDevice = device->getNEODevice(); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index e3d0699630..9175424e5e 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -82,6 +82,8 @@ ze_result_t CommandQueueHw::executeCommandLists( auto anyCommandListWithCooperativeKernels = false; auto anyCommandListWithoutCooperativeKernels = false; + cachedMOCSAllowed = true; + for (auto i = 0u; i < numCommandLists; i++) { auto commandList = CommandList::fromHandle(phCommandLists[i]); if (peekIsCopyOnlyCommandQueue() != commandList->isCopyOnly()) { @@ -97,6 +99,10 @@ ze_result_t CommandQueueHw::executeCommandLists( } else { anyCommandListWithoutCooperativeKernels = true; } + // If the Command List has commands that require uncached MOCS, then any changes to the commands in the queue requires the uncached MOCS + if (commandList->requiresUncachedMOCS && cachedMOCSAllowed == true) { + cachedMOCSAllowed = false; + } } bool isMixingRegularAndCooperativeKernelsAllowed = NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get(); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl index d15904efdf..ca9030caac 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl @@ -58,7 +58,7 @@ void CommandQueueHw::programStateBaseAddress(uint64_t gsba, bool nullptr, gsba, true, - (device->getMOCS(true, false) >> 1), + (device->getMOCS(cachedMOCSAllowed, false) >> 1), neoDevice->getMemoryManager()->getInternalHeapBaseAddress(device->getRootDeviceIndex(), useLocalMemoryForIndirectHeap), neoDevice->getMemoryManager()->getInternalHeapBaseAddress(device->getRootDeviceIndex(), 
neoDevice->getMemoryManager()->isLocalMemoryUsedForIsa(neoDevice->getRootDeviceIndex())), globalHeapsBase, diff --git a/level_zero/core/source/cmdqueue/cmdqueue_imp.h b/level_zero/core/source/cmdqueue/cmdqueue_imp.h index 31834067e1..3d9fe45492 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_imp.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_imp.h @@ -81,6 +81,7 @@ struct CommandQueueImp : public CommandQueue { ze_command_queue_mode_t getSynchronousMode() const; virtual void dispatchTaskCountWrite(NEO::LinearStream &commandStream, bool flushDataCache) = 0; virtual bool getPreemptionCmdProgramming() = 0; + bool cachedMOCSAllowed = true; protected: MOCKABLE_VIRTUAL int submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr, diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index e263c72cab..0fd1b648e5 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -38,7 +38,7 @@ void CommandQueueHw::programStateBaseAddress(uint64_t gsba, bool nullptr, 0, true, - (device->getMOCS(true, false) >> 1), + (device->getMOCS(cachedMOCSAllowed, false) >> 1), neoDevice->getMemoryManager()->getInternalHeapBaseAddress(neoDevice->getRootDeviceIndex(), useLocalMemoryForIndirectHeap), neoDevice->getMemoryManager()->getInternalHeapBaseAddress(neoDevice->getRootDeviceIndex(), neoDevice->getMemoryManager()->isLocalMemoryUsedForIsa(neoDevice->getRootDeviceIndex())), globalHeapsBase, diff --git a/level_zero/core/source/device/device.h b/level_zero/core/source/device/device.h index e97572fee1..d989093099 100644 --- a/level_zero/core/source/device/device.h +++ b/level_zero/core/source/device/device.h @@ -129,7 +129,7 @@ struct Device : _ze_device_handle_t { virtual NEO::GraphicsAllocation *allocateManagedMemoryFromHostPtr(void *buffer, size_t size, struct 
CommandList *commandList) = 0; - virtual NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size) = 0; + virtual NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size, bool hostCopyAllowed) = 0; virtual void setSysmanHandle(SysmanDevice *pSysmanDevice) = 0; virtual SysmanDevice *getSysmanHandle() = 0; virtual ze_result_t getCsrForOrdinalAndIndex(NEO::CommandStreamReceiver **csr, uint32_t ordinal, uint32_t index) = 0; diff --git a/level_zero/core/source/device/device_imp.cpp b/level_zero/core/source/device/device_imp.cpp index dbb79c40c8..f98e0302ca 100644 --- a/level_zero/core/source/device/device_imp.cpp +++ b/level_zero/core/source/device/device_imp.cpp @@ -990,14 +990,14 @@ NEO::GraphicsAllocation *DeviceImp::allocateManagedMemoryFromHostPtr(void *buffe return allocation; } -NEO::GraphicsAllocation *DeviceImp::allocateMemoryFromHostPtr(const void *buffer, size_t size) { +NEO::GraphicsAllocation *DeviceImp::allocateMemoryFromHostPtr(const void *buffer, size_t size, bool hostCopyAllowed) { NEO::AllocationProperties properties = {getRootDeviceIndex(), false, size, NEO::GraphicsAllocation::AllocationType::EXTERNAL_HOST_PTR, false, neoDevice->getDeviceBitfield()}; properties.flags.flushL3RequiredForRead = properties.flags.flushL3RequiredForWrite = true; auto allocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(properties, buffer); - if (allocation == nullptr) { + if (allocation == nullptr && hostCopyAllowed) { allocation = neoDevice->getMemoryManager()->allocateInternalGraphicsMemoryWithHostCopy(neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield(), buffer, diff --git a/level_zero/core/source/device/device_imp.h b/level_zero/core/source/device/device_imp.h index fce8a0ca1d..4b557b92e4 100644 --- a/level_zero/core/source/device/device_imp.h +++ b/level_zero/core/source/device/device_imp.h @@ -91,7 +91,7 @@ struct DeviceImp : public Device { void 
setDebugSurface(NEO::GraphicsAllocation *debugSurface) { this->debugSurface = debugSurface; }; ~DeviceImp() override; NEO::GraphicsAllocation *allocateManagedMemoryFromHostPtr(void *buffer, size_t size, struct CommandList *commandList) override; - NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size) override; + NEO::GraphicsAllocation *allocateMemoryFromHostPtr(const void *buffer, size_t size, bool hostCopyAllowed) override; void setSysmanHandle(SysmanDevice *pSysman) override; SysmanDevice *getSysmanHandle() override; ze_result_t getCsrForOrdinalAndIndex(NEO::CommandStreamReceiver **csr, uint32_t ordinal, uint32_t index) override; diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index f21c786d1b..8fbdfa2be1 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -11,6 +11,7 @@ #include "shared/source/gmm_helper/gmm.h" #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/helpers/bindless_heaps_helper.h" +#include "shared/source/helpers/cache_policy.h" #include "shared/source/helpers/hw_helper.h" #include "shared/source/helpers/string.h" #include "shared/source/kernel/implicit_args.h" @@ -62,10 +63,20 @@ struct KernelHw : public KernelImp { bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment); bool l3Enabled = true; + + // Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching; otherwise heap corruption will occur coming from the KMD. + // Most commonly this issue will occur with Host Pointer Allocations from customers. 
+ l3Enabled = isL3Capable(*alloc); + auto allocData = this->module->getDevice()->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast(alloc->getGpuAddress())); if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) { l3Enabled = false; } + + if (l3Enabled == false) { + this->kernelRequiresUncachedMocsCount++; + } + NEO::Device *neoDevice = module->getDevice()->getNEODevice(); NEO::EncodeSurfaceStateArgs args; diff --git a/level_zero/core/test/unit_tests/mocks/mock_device.h b/level_zero/core/test/unit_tests/mocks/mock_device.h index b445c330f5..f0a0b93882 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_device.h +++ b/level_zero/core/test/unit_tests/mocks/mock_device.h @@ -94,7 +94,7 @@ struct Mock : public Device { (override)); ADDMETHOD_CONST_NOBASE(getDebugSurface, NEO::GraphicsAllocation *, nullptr, ()); ADDMETHOD_NOBASE(allocateManagedMemoryFromHostPtr, NEO::GraphicsAllocation *, nullptr, (void *buffer, size_t size, struct L0::CommandList *commandList)); - ADDMETHOD_NOBASE(allocateMemoryFromHostPtr, NEO::GraphicsAllocation *, nullptr, (const void *buffer, size_t size)); + ADDMETHOD_NOBASE(allocateMemoryFromHostPtr, NEO::GraphicsAllocation *, nullptr, (const void *buffer, size_t size, bool hostCopyAllowed)); ADDMETHOD_NOBASE_VOIDRETURN(setSysmanHandle, (SysmanDevice *)); ADDMETHOD_NOBASE(getSysmanHandle, SysmanDevice *, nullptr, ()); ADDMETHOD_NOBASE(getCsrForOrdinalAndIndex, ze_result_t, ZE_RESULT_SUCCESS, (NEO::CommandStreamReceiver * *csr, uint32_t ordinal, uint32_t index)); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index e28845951b..6d4e24c424 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -25,7 +25,7 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamily>() {} 
MockCommandListHw(bool failOnFirst) : WhiteBox<::L0::CommandListCoreFamily>(), failOnFirstCopy(failOnFirst) {} - AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize) override { + AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize, bool allowHostCopy) override { return {0, 0, nullptr, true}; } ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, @@ -960,7 +960,7 @@ class MockCommandListForRegionSize : public WhiteBox<::L0::CommandListCoreFamily public: MockCommandListForRegionSize() : WhiteBox<::L0::CommandListCoreFamily>() {} - AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize) override { + AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize, bool allowHostCopy) override { return {0, 0, nullptr, true}; } ze_result_t appendMemoryCopyBlitRegion(NEO::GraphicsAllocation *srcAllocation, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp index 8413bc17f2..65496164ec 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp @@ -154,7 +154,7 @@ HWTEST2_F(CommandListCreate, givenHostAllocInMapWhenGetHostPtrAllocCalledThenCor size_t expectedOffset = 0x10; auto newBufferPtr = ptrOffset(cpuPtr, expectedOffset); auto newBufferSize = allocSize - 0x20; - auto newAlloc = commandList->getHostPtrAlloc(newBufferPtr, newBufferSize); + auto newAlloc = commandList->getHostPtrAlloc(newBufferPtr, newBufferSize, false); EXPECT_NE(nullptr, newAlloc); commandList->hostPtrMap.clear(); } @@ -222,7 +222,7 @@ HWTEST2_F(CommandListCreate, void *baseAddress = alignDown(startMemory, MemoryConstants::pageSize); size_t expectedOffset = ptrDiff(startMemory, baseAddress); - AlignedAllocationData outData = 
commandList->getAlignedAllocation(device, startMemory, cmdListHostPtrSize); + AlignedAllocationData outData = commandList->getAlignedAllocation(device, startMemory, cmdListHostPtrSize, false); ASSERT_NE(nullptr, outData.alloc); auto firstAlloc = outData.alloc; auto expectedGpuAddress = static_cast(alignDown(outData.alloc->getGpuAddress(), MemoryConstants::pageSize)); @@ -235,7 +235,7 @@ HWTEST2_F(CommandListCreate, expectedOffset = ptrDiff(offsetMemory, baseAddress); EXPECT_EQ(outData.offset + offset, expectedOffset); - outData = commandList->getAlignedAllocation(device, offsetMemory, 4u); + outData = commandList->getAlignedAllocation(device, offsetMemory, 4u, false); ASSERT_NE(nullptr, outData.alloc); EXPECT_EQ(firstAlloc, outData.alloc); EXPECT_EQ(startMemory, outData.alloc->getUnderlyingBuffer()); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp index 5ff2ef4394..e690231e7e 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp @@ -300,7 +300,7 @@ HWTEST2_F(HostPointerManagerCommandListTest, size_t offsetSize = 20; void *offsetPointer = ptrOffset(importPointer, allocOffset); - AlignedAllocationData outData = commandList->getAlignedAllocation(device, importPointer, importSize); + AlignedAllocationData outData = commandList->getAlignedAllocation(device, importPointer, importSize, false); auto gpuBaseAddress = static_cast(hostAllocation->getGpuAddress()); auto expectedAlignedAddress = alignDown(gpuBaseAddress, NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment()); size_t expectedOffset = gpuBaseAddress - expectedAlignedAddress; @@ -309,7 +309,7 @@ HWTEST2_F(HostPointerManagerCommandListTest, EXPECT_EQ(hostAllocation, outData.alloc); EXPECT_EQ(expectedOffset, outData.offset); - outData = commandList->getAlignedAllocation(device, offsetPointer, offsetSize); + outData 
= commandList->getAlignedAllocation(device, offsetPointer, offsetSize, false); expectedOffset += allocOffset; EXPECT_EQ(importPointer, hostAllocation->getUnderlyingBuffer()); EXPECT_EQ(expectedAlignedAddress, outData.alignedAllocationPtr); @@ -336,7 +336,7 @@ HWTEST2_F(HostPointerManagerCommandListTest, auto hostAllocation = hostDriverHandle->findHostPointerAllocation(offsetPointer, pointerSize, device->getRootDeviceIndex()); ASSERT_NE(nullptr, hostAllocation); - AlignedAllocationData outData = commandList->getAlignedAllocation(device, offsetPointer, pointerSize); + AlignedAllocationData outData = commandList->getAlignedAllocation(device, offsetPointer, pointerSize, false); auto expectedAlignedAddress = static_cast(hostAllocation->getGpuAddress()); EXPECT_EQ(heapPointer, hostAllocation->getUnderlyingBuffer()); EXPECT_EQ(expectedAlignedAddress, outData.alignedAllocationPtr); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp index ba8da563f5..276828d8c3 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp @@ -22,7 +22,7 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamily>() {} - AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize) override { + AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize, bool allowHostCopy) override { return {0, 0, nullptr, true}; } ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, @@ -125,8 +125,8 @@ using AppendMemoryCopy = CommandListCreate; template class MockAppendMemoryCopy : public MockCommandListHw { public: - AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize) override { - return 
L0::CommandListCoreFamily::getAlignedAllocation(device, buffer, bufferSize); + AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize, bool allowHostCopy) override { + return L0::CommandListCoreFamily::getAlignedAllocation(device, buffer, bufferSize, allowHostCopy); } ze_result_t appendMemoryCopyKernel2d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation, Builtin builtin, const ze_copy_region_t *dstRegion, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp index c80df3f611..1f37858e53 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp @@ -25,7 +25,7 @@ class MockCommandListForMemFill : public WhiteBox<::L0::CommandListCoreFamily>() {} - AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize) override { + AlignedAllocationData getAlignedAllocation(L0::Device *device, const void *buffer, uint64_t bufferSize, bool allowHostCopy) override { return {0, 0, nullptr, true}; } ze_result_t appendMemoryCopyBlit(uintptr_t dstPtr, diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp index 3696ca45a7..758fb2d19d 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp @@ -90,6 +90,33 @@ struct MultiDeviceCommandQueueExecuteCommandLists : public TestgetDefaultEngine().commandStreamReceiver, + &desc, + false, + false, + returnValue)); + ASSERT_NE(nullptr, commandQueue->commandStream); + + auto commandList1 = whitebox_cast(CommandList::fromHandle(commandLists[0])); + auto commandList2 = 
whitebox_cast(CommandList::fromHandle(commandLists[1])); + commandList1->requiresUncachedMOCS = true; + commandList2->requiresUncachedMOCS = true; + auto result = commandQueue->executeCommandLists(numCommandLists, commandLists, nullptr, true); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + ASSERT_EQ(commandQueue->cachedMOCSAllowed, false); + + commandQueue->destroy(); +} + HWTEST_F(CommandQueueExecuteCommandLists, whenASecondLevelBatchBufferPerCommandListAddedThenProperSizeExpected) { using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END; diff --git a/level_zero/core/test/unit_tests/sources/device/test_device.cpp b/level_zero/core/test/unit_tests/sources/device/test_device.cpp index c9109dcbaf..42a7cd44ab 100644 --- a/level_zero/core/test/unit_tests/sources/device/test_device.cpp +++ b/level_zero/core/test/unit_tests/sources/device/test_device.cpp @@ -456,7 +456,7 @@ TEST_F(DeviceTest, givenEmptySVmAllocStorageWhenAllocateMemoryFromHostPtrThenVal constexpr auto allocationSize = sizeof(int) * dataSize; - auto allocation = device->allocateMemoryFromHostPtr(data.get(), allocationSize); + auto allocation = device->allocateMemoryFromHostPtr(data.get(), allocationSize, false); EXPECT_NE(nullptr, allocation); EXPECT_EQ(NEO::GraphicsAllocation::AllocationType::EXTERNAL_HOST_PTR, allocation->getAllocationType()); EXPECT_EQ(rootDeviceIndex, allocation->getRootDeviceIndex()); @@ -565,7 +565,7 @@ TEST_F(DeviceHostPointerTest, givenHostPointerNotAcceptedByKernelThenNewAllocati buffer[i] = i + 10; } - auto allocation = device->allocateMemoryFromHostPtr(buffer, size); + auto allocation = device->allocateMemoryFromHostPtr(buffer, size, true); EXPECT_NE(nullptr, allocation); EXPECT_EQ(NEO::GraphicsAllocation::AllocationType::INTERNAL_HOST_MEMORY, allocation->getAllocationType()); EXPECT_EQ(rootDeviceIndex, allocation->getRootDeviceIndex()); @@ -577,6 +577,18 @@ TEST_F(DeviceHostPointerTest, 
givenHostPointerNotAcceptedByKernelThenNewAllocati delete[] buffer; } +TEST_F(DeviceHostPointerTest, givenHostPointerNotAcceptedByKernelAndHostPointerCopyIsNotAllowedThenAllocationIsNull) { + size_t size = 55; + uint64_t *buffer = new uint64_t[size]; + for (uint32_t i = 0; i < size; i++) { + buffer[i] = i + 10; + } + + auto allocation = device->allocateMemoryFromHostPtr(buffer, size, false); + EXPECT_EQ(nullptr, allocation); + delete[] buffer; +} + TEST_F(DeviceTest, givenKernelExtendedPropertiesStructureWhenKernelPropertiesCalledThenSuccessIsReturnedAndPropertiesAreSet) { ze_device_module_properties_t kernelProperties = {}; diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 24403f106c..a08bbd0bde 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -1500,6 +1500,38 @@ HWTEST2_F(KernelImpPatchBindlessTest, GivenKernelImpWhenSetSurfaceStateBindfulTh EXPECT_TRUE(memcmp(&surfaceStateAfter, &surfaceStateBefore, size) == 0); } +using KernelImpL3CachingTests = Test; + +HWTEST2_F(KernelImpL3CachingTests, GivenKernelImpWhenSetSurfaceStateWithUnalignedMemoryThenL3CachingIsDisabled, MatchAny) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + ze_kernel_desc_t desc = {}; + desc.pKernelName = kernelName.c_str(); + + WhiteBoxKernelHw mockKernel; + mockKernel.module = module.get(); + mockKernel.initialize(&desc); + + auto &arg = const_cast(mockKernel.kernelImmData->getDescriptor().payloadMappings.explicitArgs[0].template as()); + arg.bindless = undefined; + arg.bindful = 0x40; + + neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()]->createBindlessHeapsHelper(neoDevice->getMemoryManager(), + neoDevice->getNumGenericSubDevices() > 1, + neoDevice->getRootDeviceIndex(), + neoDevice->getDeviceBitfield()); + auto &hwHelper = 
NEO::HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily); + size_t size = hwHelper.getRenderSurfaceStateSize(); + uint64_t gpuAddress = 0x2000; + void *buffer = reinterpret_cast(0x20123); + + NEO::MockGraphicsAllocation mockAllocation(buffer, gpuAddress, size); + auto expectedSsInHeap = device->getNEODevice()->getBindlessHeapsHelper()->allocateSSInHeap(size, &mockAllocation, NEO::BindlessHeapsHelper::GLOBAL_SSH); + + memset(expectedSsInHeap.ssPtr, 0, size); + mockKernel.setBufferSurfaceState(0, buffer, &mockAllocation); + EXPECT_EQ(mockKernel.getKernelRequiresUncachedMocs(), true); +} + struct MyMockKernel : public Mock { void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override { setSurfaceStateCalled = true; diff --git a/level_zero/core/test/unit_tests/sources/memory/test_memory.cpp b/level_zero/core/test/unit_tests/sources/memory/test_memory.cpp index a2f7bb2ff6..6f07665b67 100644 --- a/level_zero/core/test/unit_tests/sources/memory/test_memory.cpp +++ b/level_zero/core/test/unit_tests/sources/memory/test_memory.cpp @@ -1889,7 +1889,7 @@ HWTEST2_F(MultipleDevicePeerAllocationTest, auto commandList = std::make_unique<::L0::ult::CommandListCoreFamily>(); commandList->initialize(device1, NEO::EngineGroupType::RenderCompute, 0u); - EXPECT_THROW(commandList->getAlignedAllocation(device1, ptr, size), std::exception); + EXPECT_THROW(commandList->getAlignedAllocation(device1, ptr, size, false), std::exception); result = context->freeMem(ptr); ASSERT_EQ(result, ZE_RESULT_SUCCESS); @@ -1915,7 +1915,7 @@ HWTEST2_F(MultipleDevicePeerAllocationTest, auto commandList = std::make_unique<::L0::ult::CommandListCoreFamily>(); commandList->initialize(device1, NEO::EngineGroupType::RenderCompute, 0u); - AlignedAllocationData outData = commandList->getAlignedAllocation(device1, ptr, size); + AlignedAllocationData outData = commandList->getAlignedAllocation(device1, ptr, size, false); EXPECT_NE(outData.alignedAllocationPtr, 0u); 
result = context->freeMem(ptr); @@ -1942,7 +1942,7 @@ HWTEST2_F(MultipleDevicePeerAllocationTest, auto commandList = std::make_unique<::L0::ult::CommandListCoreFamily>(); commandList->initialize(device0, NEO::EngineGroupType::RenderCompute, 0u); - AlignedAllocationData outData = commandList->getAlignedAllocation(device0, ptr, size); + AlignedAllocationData outData = commandList->getAlignedAllocation(device0, ptr, size, false); EXPECT_NE(outData.alignedAllocationPtr, 0u); result = context->freeMem(ptr);