From 09d2ffb9ed23cb2a31cde050423e4f788017c478 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Mon, 6 Dec 2021 10:01:46 +0000 Subject: [PATCH] Add missing cache flush Resolves: NEO-6505 Signed-off-by: Lukasz Jobczyk --- .../source/cmdlist/cmdlist_hw_immediate.inl | 3 +- opencl/source/command_queue/command_queue.cpp | 4 ++ opencl/source/command_queue/command_queue.h | 19 +++--- opencl/source/command_queue/enqueue_common.h | 7 ++- opencl/source/helpers/task_information.cpp | 9 ++- .../command_queue/command_queue_tests.cpp | 23 ++++++++ .../command_stream/compute_mode_tests.h | 2 +- .../test/unit_test/mocks/mock_command_queue.h | 1 + .../command_stream_receiver_hw_base.inl | 1 + .../source/command_stream/csr_definitions.h | 58 ++++++++++--------- .../common/helpers/dispatch_flags_helper.h | 3 +- 11 files changed, 85 insertions(+), 45 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 12ca497a86..21668fe84d 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -54,7 +54,8 @@ ze_result_t CommandListCoreFamilyImmediate::executeCommandListImm false, //useSingleSubdevice false, //useGlobalAtomics this->device->getNEODevice()->getNumGenericSubDevices() > 1, //areMultipleSubDevicesInContext - false //memoryMigrationRequired + false, //memoryMigrationRequired + false //textureCacheFlush ); this->commandContainer.removeDuplicatesFromResidencyContainer(); diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 3931845e92..41731478d5 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -685,6 +685,10 @@ uint32_t CommandQueue::peekBcsTaskCount(aub_stream::EngineType bcsEngineType) co return state.taskCount; } +bool CommandQueue::isTextureCacheFlushNeeded(uint32_t commandType) const { + return commandType == CL_COMMAND_COPY_IMAGE && getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled(); +} + IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType, size_t minRequiredSize) { return getGpgpuCommandStreamReceiver().getIndirectHeap(heapType, minRequiredSize); } diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index d5c21a73d1..64f3aff962 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -45,14 +45,6 @@ enum class QueuePriority { HIGH }; -inline bool shouldFlushDC(uint32_t commandType, PrintfHandler *printfHandler) { - return (commandType == CL_COMMAND_READ_BUFFER || - commandType == CL_COMMAND_READ_BUFFER_RECT || - commandType == CL_COMMAND_READ_IMAGE || - commandType == CL_COMMAND_SVM_MAP || - printfHandler); -} - template <> struct OpenCLObjectMapper<_cl_command_queue> { typedef class CommandQueue DerivedType; @@ -371,6 +363,17 @@ class CommandQueue : public BaseObject<_cl_command_queue> { void providePerformanceHint(TransferProperties &transferProperties); bool queueDependenciesClearRequired() const; bool blitEnqueueAllowed(const CsrSelectionArgs &args) const; + + bool isTextureCacheFlushNeeded(uint32_t commandType) const; + inline bool shouldFlushDC(uint32_t commandType, PrintfHandler *printfHandler) const { + return (commandType == CL_COMMAND_READ_BUFFER || + commandType == CL_COMMAND_READ_BUFFER_RECT || + commandType == CL_COMMAND_READ_IMAGE || + commandType == CL_COMMAND_SVM_MAP || + printfHandler || + isTextureCacheFlushNeeded(commandType)); + } + MOCKABLE_VIRTUAL bool blitEnqueueImageAllowed(const size_t *origin, const size_t *region, const Image &image) const; void aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo); virtual bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const = 0; diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 2d5df69fe8..d6abd24c7d 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -892,8 +892,8 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( kernel->isSingleSubdevicePreferred(), //useSingleSubdevice useGlobalAtomics, //useGlobalAtomics kernel->areMultipleSubDevicesInContext(), //areMultipleSubDevicesInContext - kernel->requiresMemoryMigration() //memoryMigrationRequired - ); + kernel->requiresMemoryMigration(), //memoryMigrationRequired + isTextureCacheFlushNeeded(commandType)); //textureCacheFlush dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired; dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode; @@ -1116,7 +1116,8 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( false, //useSingleSubdevice false, //useGlobalAtomics context->containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext - false); //memoryMigrationRequired + false, //memoryMigrationRequired + false); //textureCacheFlush if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr); diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 13cfa0bee8..af9bc46e38 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -79,7 +79,8 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) { false, //useSingleSubdevice false, //useGlobalAtomics false, //areMultipleSubDevicesInContext - false); //memoryMigrationRequired + false, //memoryMigrationRequired + false); //textureCacheFlush DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady); @@ -246,7 +247,8 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate kernel->isSingleSubdevicePreferred(), //useSingleSubdevice kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, //useGlobalAtomics kernel->areMultipleSubDevicesInContext(), //areMultipleSubDevicesInContext - kernel->requiresMemoryMigration()); //memoryMigrationRequired + kernel->requiresMemoryMigration(), //memoryMigrationRequired + false); //textureCacheFlush if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); @@ -388,7 +390,8 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate false, //useSingleSubdevice false, //useGlobalAtomics commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext - false); //memoryMigrationRequired + false, //memoryMigrationRequired + false); //textureCacheFlush if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index e15a985ae3..fb827a6e34 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -379,6 +379,29 @@ HWTEST_F(CommandQueueCommandStreamTest, givenCommandQueueThatWaitsOnAbortedUserE EXPECT_EQ(100u, cmdQ.taskLevel); } +HWTEST_F(CommandQueueCommandStreamTest, WhenCheckIsTextureCacheFlushNeededThenReturnProperValue) { + MockContext context; + auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(nullptr)); + MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false); + auto &commandStreamReceiver = mockDevice->getUltCommandStreamReceiver(); + + EXPECT_FALSE(cmdQ.isTextureCacheFlushNeeded(CL_COMMAND_COPY_BUFFER_RECT)); + + for (auto i = CL_COMMAND_NDRANGE_KERNEL; i < CL_COMMAND_RELEASE_GL_OBJECTS; i++) { + if (i == CL_COMMAND_COPY_IMAGE) { + commandStreamReceiver.directSubmissionAvailable = true; + EXPECT_TRUE(cmdQ.isTextureCacheFlushNeeded(i)); + commandStreamReceiver.directSubmissionAvailable = false; + EXPECT_FALSE(cmdQ.isTextureCacheFlushNeeded(i)); + } else { + commandStreamReceiver.directSubmissionAvailable = true; + EXPECT_FALSE(cmdQ.isTextureCacheFlushNeeded(i)); + commandStreamReceiver.directSubmissionAvailable = false; + EXPECT_FALSE(cmdQ.isTextureCacheFlushNeeded(i)); + } + } +} + TEST_F(CommandQueueCommandStreamTest, GivenValidCommandQueueWhenGettingCommandStreamThenValidObjectIsReturned) { const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0}; MockCommandQueue commandQueue(context.get(), pClDevice, props, false); diff --git a/opencl/test/unit_test/command_stream/compute_mode_tests.h b/opencl/test/unit_test/command_stream/compute_mode_tests.h index b628be1ff3..2f14c04541 100644 --- a/opencl/test/unit_test/command_stream/compute_mode_tests.h +++ b/opencl/test/unit_test/command_stream/compute_mode_tests.h @@ -83,6 +83,6 @@ struct ComputeModeRequirements : public ::testing::Test { CommandStreamReceiver *csr = nullptr; std::unique_ptr device; - DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}; + DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}; GraphicsAllocation *alloc = nullptr; }; diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 11e9522ac4..237e868205 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -26,6 +26,7 @@ class MockCommandQueue : public CommandQueue { using CommandQueue::device; using CommandQueue::gpgpuEngine; using CommandQueue::isCopyOnly; + using CommandQueue::isTextureCacheFlushNeeded; using CommandQueue::obtainNewTimestampPacketNodes; using CommandQueue::overrideEngine; using CommandQueue::queueCapabilities; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 93356336e6..4ca53b45f0 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -217,6 +217,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( PipeControlArgs args(dispatchFlags.dcFlush); args.notifyEnable = isUsedNotifyEnableForPostSync(); args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired; + args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush; args.workloadPartitionOffset = this->activePartitions > 1 && this->staticWorkPartitioningEnabled; MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( commandStreamTask, diff --git a/shared/source/command_stream/csr_definitions.h b/shared/source/command_stream/csr_definitions.h index d7f7e3e14d..2a509adbe2 100644 --- a/shared/source/command_stream/csr_definitions.h +++ b/shared/source/command_stream/csr_definitions.h @@ -55,34 +55,35 @@ struct DispatchFlags { KernelExecutionType kernelExecutionTypeP, MemoryCompressionState memoryCompressionStateP, uint64_t sliceCountP, bool blockingP, bool dcFlushP, bool useSLMP, bool guardCommandBufferWithPipeControlP, bool gsba32BitRequiredP, bool requiresCoherencyP, bool lowPriorityP, bool implicitFlushP, bool outOfOrderExecutionAllowedP, bool epilogueRequiredP, - bool usePerDSSbackedBufferP, bool useSingleSubdeviceP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP) : csrDependencies(csrDependenciesP), - barrierTimestampPacketNodes(barrierTimestampPacketNodesP), - pipelineSelectArgs(pipelineSelectArgsP), - flushStampReference(flushStampReferenceP), - throttle(throttleP), - preemptionMode(preemptionModeP), - numGrfRequired(numGrfRequiredP), - l3CacheSettings(l3CacheSettingsP), - threadArbitrationPolicy(threadArbitrationPolicyP), - additionalKernelExecInfo(additionalKernelExecInfoP), - kernelExecutionType(kernelExecutionTypeP), - memoryCompressionState(memoryCompressionStateP), - sliceCount(sliceCountP), - blocking(blockingP), - dcFlush(dcFlushP), - useSLM(useSLMP), - guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP), - gsba32BitRequired(gsba32BitRequiredP), - requiresCoherency(requiresCoherencyP), - lowPriority(lowPriorityP), - implicitFlush(implicitFlushP), - outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP), - epilogueRequired(epilogueRequiredP), - usePerDssBackedBuffer(usePerDSSbackedBufferP), - useSingleSubdevice(useSingleSubdeviceP), - useGlobalAtomics(useGlobalAtomicsP), - areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP), - memoryMigrationRequired(memoryMigrationRequiredP){}; + bool usePerDSSbackedBufferP, bool useSingleSubdeviceP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP, bool textureCacheFlush) : csrDependencies(csrDependenciesP), + barrierTimestampPacketNodes(barrierTimestampPacketNodesP), + pipelineSelectArgs(pipelineSelectArgsP), + flushStampReference(flushStampReferenceP), + throttle(throttleP), + preemptionMode(preemptionModeP), + numGrfRequired(numGrfRequiredP), + l3CacheSettings(l3CacheSettingsP), + threadArbitrationPolicy(threadArbitrationPolicyP), + additionalKernelExecInfo(additionalKernelExecInfoP), + kernelExecutionType(kernelExecutionTypeP), + memoryCompressionState(memoryCompressionStateP), + sliceCount(sliceCountP), + blocking(blockingP), + dcFlush(dcFlushP), + useSLM(useSLMP), + guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP), + gsba32BitRequired(gsba32BitRequiredP), + requiresCoherency(requiresCoherencyP), + lowPriority(lowPriorityP), + implicitFlush(implicitFlushP), + outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP), + epilogueRequired(epilogueRequiredP), + usePerDssBackedBuffer(usePerDSSbackedBufferP), + useSingleSubdevice(useSingleSubdeviceP), + useGlobalAtomics(useGlobalAtomicsP), + areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP), + memoryMigrationRequired(memoryMigrationRequiredP), + textureCacheFlush(textureCacheFlush){}; CsrDependencies csrDependencies; TimestampPacketContainer *barrierTimestampPacketNodes = nullptr; @@ -113,6 +114,7 @@ struct DispatchFlags { bool useGlobalAtomics = false; bool areMultipleSubDevicesInContext = false; bool memoryMigrationRequired = false; + bool textureCacheFlush = false; }; struct CsrSizeRequestFlags { diff --git a/shared/test/common/helpers/dispatch_flags_helper.h b/shared/test/common/helpers/dispatch_flags_helper.h index 8e3d7c4485..e7725f24cc 100644 --- a/shared/test/common/helpers/dispatch_flags_helper.h +++ b/shared/test/common/helpers/dispatch_flags_helper.h @@ -41,7 +41,8 @@ struct DispatchFlagsHelper { false, //useSingleSubdevice false, //useGlobalAtomics false, //areMultipleSubDevicesInContext - false //memoryMigrationRequired + false, //memoryMigrationRequired + false //textureCacheFlush ); } };