diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 83fed699db..af1451731c 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -593,17 +593,21 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( previousTimestampPacketNodes->makeResident(getGpgpuCommandStreamReceiver()); } + bool anyUncacheableArgs = false; auto requiresCoherency = false; for (auto surface : CreateRange(surfaces, surfaceCount)) { surface->makeResident(getGpgpuCommandStreamReceiver()); requiresCoherency |= surface->IsCoherent; + if (!surface->allowsL3Caching()) { + anyUncacheableArgs = true; + } } auto mediaSamplerRequired = false; uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber; auto specialPipelineSelectMode = false; Kernel *kernel = nullptr; - bool anyUncacheableArgs = false; + for (auto &dispatchInfo : multiDispatchInfo) { if (kernel != dispatchInfo.getKernel()) { kernel = dispatchInfo.getKernel(); @@ -659,10 +663,6 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( } } - if (anyUncacheableArgs) { - getGpgpuCommandStreamReceiver().setDisableL3Cache(true); - } - DispatchFlags dispatchFlags; dispatchFlags.blocking = blocking; dispatchFlags.dcFlush = shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC; @@ -685,6 +685,10 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( dispatchFlags.multiEngineQueue = this->multiEngineQueue; DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); + if (anyUncacheableArgs) { + dispatchFlags.l3CacheSettings = L3CachingSettings::l3CacheOff; + } + if (gtpinIsGTPinInitialized()) { gtpinNotifyPreFlushTask(this); } diff --git a/runtime/command_stream/command_stream_receiver.cpp b/runtime/command_stream/command_stream_receiver.cpp index 5bb58dde2d..47b9ee3d16 100644 --- a/runtime/command_stream/command_stream_receiver.cpp +++ b/runtime/command_stream/command_stream_receiver.cpp @@ -104,9 +104,6 @@ void CommandStreamReceiver::makeSurfacePackNonResident(ResidencyContainer &alloc void CommandStreamReceiver::makeResidentHostPtrAllocation(GraphicsAllocation *gfxAllocation) { makeResident(*gfxAllocation); - if (!isL3Capable(*gfxAllocation)) { - setDisableL3Cache(true); - } } void CommandStreamReceiver::waitForTaskCountAndCleanAllocationList(uint32_t requiredTaskCount, uint32_t allocationUsage) { diff --git a/runtime/command_stream/command_stream_receiver.h b/runtime/command_stream/command_stream_receiver.h index 342849b0d7..692ed42c7f 100644 --- a/runtime/command_stream/command_stream_receiver.h +++ b/runtime/command_stream/command_stream_receiver.h @@ -169,9 +169,6 @@ class CommandStreamReceiver { virtual cl_int expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation); - void setDisableL3Cache(bool val) { - disableL3Cache = val; - } bool isMultiOsContextCapable() const; void setLatestSentTaskCount(uint32_t latestSentTaskCount) { @@ -246,7 +243,6 @@ class CommandStreamReceiver { bool bindingTableBaseAddressRequired = false; bool mediaVfeStateDirty = true; bool lastVmeSubslicesConfig = false; - bool disableL3Cache = false; bool stallingPipeControlOnNextFlushRequired = false; bool timestampPacketWriteEnabled = false; bool nTo1SubmissionModelEnabled = false; diff --git a/runtime/command_stream/command_stream_receiver_hw_base.inl b/runtime/command_stream/command_stream_receiver_hw_base.inl index c47e910daa..61a887115c 100644 --- a/runtime/command_stream/command_stream_receiver_hw_base.inl +++ b/runtime/command_stream/command_stream_receiver_hw_base.inl @@ -271,9 +271,8 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto isStateBaseAddressDirty = dshDirty || iohDirty || sshDirty || stateBaseAddressDirty; auto requiredL3Index = CacheSettings::l3CacheOn; - if (this->disableL3Cache) { + if (dispatchFlags.l3CacheSettings == L3CachingSettings::l3CacheOff) { requiredL3Index = CacheSettings::l3CacheOff; - this->disableL3Cache = false; } if (requiredL3Index != latestSentStatelessMocsConfig) { diff --git a/runtime/command_stream/csr_definitions.h b/runtime/command_stream/csr_definitions.h index daf1d41b50..25cf99a191 100644 --- a/runtime/command_stream/csr_definitions.h +++ b/runtime/command_stream/csr_definitions.h @@ -30,6 +30,10 @@ constexpr auto csOverfetchSize = MemoryConstants::pageSize; namespace TimeoutControls { constexpr int64_t maxTimeout = std::numeric_limits::max(); } +namespace L3CachingSettings { +constexpr uint32_t l3CacheOn = 0u; +constexpr uint32_t l3CacheOff = 1u; +} // namespace L3CachingSettings struct DispatchFlags { CsrDependencies csrDependencies; @@ -37,6 +41,7 @@ struct DispatchFlags { QueueThrottle throttle = QueueThrottle::MEDIUM; PreemptionMode preemptionMode = PreemptionMode::Disabled; uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber; + uint32_t l3CacheSettings = L3CachingSettings::l3CacheOn; bool blocking = false; bool dcFlush = false; bool useSLM = false; diff --git a/runtime/helpers/task_information.cpp b/runtime/helpers/task_information.cpp index 3ffc59a9b6..952de32e2b 100644 --- a/runtime/helpers/task_information.cpp +++ b/runtime/helpers/task_information.cpp @@ -126,10 +126,14 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate IndirectHeap *ssh = kernelOperation->ssh.get(); auto requiresCoherency = false; + auto anyUncacheableArgs = false; for (auto &surface : surfaces) { DEBUG_BREAK_IF(!surface); surface->makeResident(commandStreamReceiver); requiresCoherency |= surface->IsCoherent; + if (!surface->allowsL3Caching()) { + anyUncacheableArgs = true; + } } if (printfHandler) { @@ -187,6 +191,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate } dispatchFlags.specialPipelineSelectMode = kernel->requiresSpecialPipelineSelectMode(); + if (anyUncacheableArgs) { + dispatchFlags.l3CacheSettings = L3CachingSettings::l3CacheOff; + } + DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); gtpinNotifyPreFlushTask(&commandQueue); diff --git a/runtime/memory_manager/surface.h b/runtime/memory_manager/surface.h index 6a3d46baf7..4cde9c88c8 100644 --- a/runtime/memory_manager/surface.h +++ b/runtime/memory_manager/surface.h @@ -7,6 +7,7 @@ #pragma once #include "runtime/command_stream/command_stream_receiver.h" +#include "runtime/helpers/cache_policy.h" #include "runtime/mem_obj/mem_obj.h" #include "runtime/memory_manager/graphics_allocation.h" @@ -18,6 +19,7 @@ class Surface { virtual ~Surface() = default; virtual void makeResident(CommandStreamReceiver &csr) = 0; virtual Surface *duplicate() = 0; + virtual bool allowsL3Caching() { return true; } bool IsCoherent; }; @@ -73,6 +75,10 @@ class HostPtrSurface : public Surface { return isPtrCopyAllowed; } + virtual bool allowsL3Caching() override { + return isL3Capable(*gfxAllocation); + } + protected: void *memoryPointer; size_t surfaceSize; diff --git a/unit_tests/command_queue/enqueue_read_buffer_tests.cpp b/unit_tests/command_queue/enqueue_read_buffer_tests.cpp index 5cc9c4ddc5..6267cd17a9 100644 --- a/unit_tests/command_queue/enqueue_read_buffer_tests.cpp +++ b/unit_tests/command_queue/enqueue_read_buffer_tests.cpp @@ -313,7 +313,6 @@ HWTEST_F(EnqueueReadBufferTypeTest, givenNotAlignedPointerAndAlignedSizeWhenRead EXPECT_EQ(CL_SUCCESS, retVal); auto &csr = pDevice->getUltCommandStreamReceiver(); EXPECT_EQ(CacheSettings::l3CacheOff, csr.latestSentStatelessMocsConfig); - EXPECT_FALSE(csr.disableL3Cache); void *ptr2 = (void *)0x1040; @@ -328,7 +327,6 @@ HWTEST_F(EnqueueReadBufferTypeTest, givenNotAlignedPointerAndAlignedSizeWhenRead nullptr); EXPECT_EQ(CacheSettings::l3CacheOn, csr.latestSentStatelessMocsConfig); - EXPECT_FALSE(csr.disableL3Cache); } HWTEST_F(EnqueueReadBufferTypeTest, givenOOQWithEnabledSupportCpuCopiesAndDstPtrEqualSrcPtrAndZeroCopyBufferWhenReadBufferIsExecutedThenTaskLevelNotIncreased) { diff --git a/unit_tests/command_stream/command_stream_receiver_tests.cpp b/unit_tests/command_stream/command_stream_receiver_tests.cpp index 8c87db41c7..84925d46a3 100644 --- a/unit_tests/command_stream/command_stream_receiver_tests.cpp +++ b/unit_tests/command_stream/command_stream_receiver_tests.cpp @@ -154,36 +154,6 @@ TEST_F(CommandStreamReceiverTest, givenCommandStreamReceiverWhenGetCSIsCalledThe EXPECT_EQ(GraphicsAllocation::AllocationType::COMMAND_BUFFER, commandStreamAllocation->getAllocationType()); } -HWTEST_F(CommandStreamReceiverTest, givenPtrAndSizeThatMeetL3CriteriaWhenMakeResidentHostPtrThenCsrEnableL3) { - void *hostPtr = reinterpret_cast(0xF000); - auto size = 0x2000u; - - auto memoryManager = commandStreamReceiver->getMemoryManager(); - GraphicsAllocation *graphicsAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{false, size}, hostPtr); - ASSERT_NE(nullptr, graphicsAllocation); - commandStreamReceiver->makeResidentHostPtrAllocation(graphicsAllocation); - - auto &csr = pDevice->getUltCommandStreamReceiver(); - - EXPECT_FALSE(csr.disableL3Cache); - memoryManager->freeGraphicsMemory(graphicsAllocation); -} - -HWTEST_F(CommandStreamReceiverTest, givenPtrAndSizeThatDoNotMeetL3CriteriaWhenMakeResidentHostPtrThenCsrDisableL3) { - void *hostPtr = reinterpret_cast(0xF001); - auto size = 0x2001u; - - auto memoryManager = commandStreamReceiver->getMemoryManager(); - GraphicsAllocation *graphicsAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{false, size}, hostPtr); - ASSERT_NE(nullptr, graphicsAllocation); - commandStreamReceiver->makeResidentHostPtrAllocation(graphicsAllocation); - - auto &csr = pDevice->getUltCommandStreamReceiver(); - - EXPECT_TRUE(csr.disableL3Cache); - memoryManager->freeGraphicsMemory(graphicsAllocation); -} - TEST_F(CommandStreamReceiverTest, memoryManagerHasAccessToCSR) { auto *memoryManager = commandStreamReceiver->getMemoryManager(); EXPECT_EQ(commandStreamReceiver, memoryManager->getDefaultCommandStreamReceiver(0)); @@ -206,7 +176,6 @@ HWTEST_F(CommandStreamReceiverTest, whenStoreAllocationThenStoredAllocationHasTa HWTEST_F(CommandStreamReceiverTest, givenCommandStreamReceiverWhenCheckedForInitialStatusOfStatelessMocsIndexThenUnknownMocsIsReturend) { auto &csr = pDevice->getUltCommandStreamReceiver(); EXPECT_EQ(CacheSettings::unknownMocs, csr.latestSentStatelessMocsConfig); - EXPECT_FALSE(csr.disableL3Cache); } TEST_F(CommandStreamReceiverTest, makeResidentPushesAllocationToMemoryManagerResidencyList) { diff --git a/unit_tests/libult/ult_command_stream_receiver.h b/unit_tests/libult/ult_command_stream_receiver.h index b21230281c..d2e27ef8e1 100644 --- a/unit_tests/libult/ult_command_stream_receiver.h +++ b/unit_tests/libult/ult_command_stream_receiver.h @@ -35,7 +35,6 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::CommandStreamReceiver::bindingTableBaseAddressRequired; using BaseClass::CommandStreamReceiver::cleanupResources; using BaseClass::CommandStreamReceiver::commandStream; - using BaseClass::CommandStreamReceiver::disableL3Cache; using BaseClass::CommandStreamReceiver::dispatchMode; using BaseClass::CommandStreamReceiver::executionEnvironment; using BaseClass::CommandStreamReceiver::experimentalCmdBuffer;