From f94528097af06fe9506ee19baf207ebfad10e276 Mon Sep 17 00:00:00 2001 From: "Cencelewska, Katarzyna" Date: Thu, 12 Jan 2023 16:58:18 +0000 Subject: [PATCH] fix: set stateCacheInvalidationEnable to flush the cache when a blit operation is enqueued and dcflush is needed. This resolves the problem with corruptions visible when switching between CCS and BCS on platforms without engine coherency. Resolves: NEO-7577 Signed-off-by: Cencelewska, Katarzyna --- .../source/cmdlist/cmdlist_hw_immediate.inl | 4 +- opencl/source/command_queue/enqueue_common.h | 10 ++- opencl/source/helpers/task_information.cpp | 9 ++- .../enqueue_command_without_kernel_tests.cpp | 37 +++++++++++ .../command_stream_receiver_hw_base.inl | 2 +- .../source/command_stream/csr_definitions.h | 64 ++++++++++--------- .../common/helpers/dispatch_flags_helper.h | 5 +- .../command_stream/compute_mode_tests.h | 2 +- 8 files changed, 91 insertions(+), 42 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 94164c8b3d..03ed09c933 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -117,8 +117,8 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate::flushRegular false, // memoryMigrationRequired false, // textureCacheFlush hasStallingCmds, // hasStallingCmds - hasRelaxedOrderingDependencies // hasRelaxedOrderingDependencies - ); + hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies + false); // stateCacheInvalidation this->updateDispatchFlagsWithRequiredStreamState(dispatchFlags); this->csr->setRequiredScratchSizes(this->getCommandListPerThreadScratchSize(), this->getCommandListPerThreadPrivateScratchSize()); diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 633eaaec34..55b2c1c440 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -844,7 
+844,8 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( kernel->requiresMemoryMigration(), // memoryMigrationRequired isTextureCacheFlushNeeded(commandType), // textureCacheFlush false, // hasStallingCmds - false); // hasRelaxedOrderingDependencies + false, // hasRelaxedOrderingDependencies + false); // stateCacheInvalidation dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired; dispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode = systolicPipelineSelectMode; @@ -1060,6 +1061,10 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( for (auto surface : createRange(surfaces, surfaceCount)) { surface->makeResident(getGpgpuCommandStreamReceiver()); } + bool stateCacheInvalidationNeeded = false; + if (getGpgpuCommandStreamReceiver().getDcFlushSupport() && enqueueProperties.operation == EnqueueProperties::Operation::Blit) { + stateCacheInvalidationNeeded = true; + } auto rootDeviceIndex = getDevice().getRootDeviceIndex(); DispatchFlags dispatchFlags( @@ -1092,7 +1097,8 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( false, // memoryMigrationRequired false, // textureCacheFlush false, // hasStallingCmds - false); // hasRelaxedOrderingDependencies + false, // hasRelaxedOrderingDependencies + stateCacheInvalidationNeeded); // stateCacheInvalidation const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 5a7e823581..8e827e5eef 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -81,7 +81,8 @@ CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminate false, // memoryMigrationRequired false, // textureCacheFlush false, // hasStallingCmds - false); // hasRelaxedOrderingDependencies + false, // hasRelaxedOrderingDependencies + false); // stateCacheInvalidation DEBUG_BREAK_IF(taskLevel >= 
CompletionStamp::notReady); @@ -215,7 +216,8 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term kernel->requiresMemoryMigration(), // memoryMigrationRequired commandQueue.isTextureCacheFlushNeeded(this->commandType), // textureCacheFlush false, // hasStallingCmds - false); // hasRelaxedOrderingDependencies + false, // hasRelaxedOrderingDependencies + false); // stateCacheInvalidation if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); @@ -390,7 +392,8 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term false, // memoryMigrationRequired false, // textureCacheFlush false, // hasStallingCmds - false); // hasRelaxedOrderingDependencies + false, // hasRelaxedOrderingDependencies + false); // stateCacheInvalidation if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp index 81306777d4..35eb00168f 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -366,6 +366,43 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitEnqueueWhenDispatchingCommandsWithoutK EXPECT_EQ(GrfConfig::NotApplicable, mockCsr->passedDispatchFlags.numGrfRequired); } +HWTEST_F(DispatchFlagsBlitTests, givenBlitOperationWhenEnqueueCommandWithoutKernelThenDispatchFlagStateCacheInvalidationInFlushTaskIsSetCorrectly) { + using CsrType = MockCsrHw2; + setUpImpl(); + REQUIRE_FULL_BLITTER_OR_SKIP(device->getRootDeviceEnvironment()); + + auto mockCmdQ = std::make_unique>(context.get(), device.get(), nullptr); + auto mockCsr = 
static_cast(&mockCmdQ->getGpgpuCommandStreamReceiver()); + mockCsr->skipBlitCalls = true; + cl_int retVal = CL_SUCCESS; + auto buffer = std::unique_ptr(Buffer::create(context.get(), 0, 1, nullptr, retVal)); + auto &bcsCsr = *mockCmdQ->bcsEngines[0]->commandStreamReceiver; + + auto blocking = true; + TimestampPacketDependencies timestampPacketDependencies; + EventsRequest eventsRequest(0, nullptr, nullptr); + EventBuilder eventBuilder; + BuiltinOpParams builtinOpParams; + builtinOpParams.srcMemObj = buffer.get(); + builtinOpParams.dstPtr = reinterpret_cast(0x1234); + MultiDispatchInfo multiDispatchInfo; + multiDispatchInfo.setBuiltinOpParams(builtinOpParams); + CsrDependencies csrDeps; + + BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, + eventsRequest, &mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false, nullptr); + + BlitPropertiesContainer blitPropertiesContainer; + blitPropertiesContainer.push_back(blitProperties); + + EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer); + mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, + eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr); + + auto expectedValue = mockCmdQ->getGpgpuCommandStreamReceiver().getDcFlushSupport(); + EXPECT_EQ(expectedValue, mockCsr->passedDispatchFlags.stateCacheInvalidation); +} + HWTEST_F(DispatchFlagsBlitTests, givenN1EnabledWhenDispatchingWithoutKernelThenAllowOutOfOrderExecution) { using CsrType = MockCsrHw2; DebugManager.flags.EnableTimestampPacket.set(1); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 161228b435..f41487055e 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -325,6 +325,7 @@ 
CompletionStamp CommandStreamReceiverHw::flushTask( args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired; args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush; args.workloadPartitionOffset = isMultiTileOperationEnabled(); + args.stateCacheInvalidationEnable = dispatchFlags.stateCacheInvalidation; MemorySynchronizationCommands::addBarrierWithPostSyncOperation( commandStreamTask, PostSyncMode::ImmediateData, @@ -550,7 +551,6 @@ CompletionStamp CommandStreamReceiverHw::flushTask( programStateSip(commandStreamCSR, device); DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "this->taskLevel", (uint32_t)this->taskLevel); - if (executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo()->workaroundTable.flags.waSamplerCacheFlushBetweenRedescribedSurfaceReads) { if (this->samplerCacheFlushRequired != SamplerCacheFlushState::samplerCacheFlushNotRequired) { PipeControlArgs args; diff --git a/shared/source/command_stream/csr_definitions.h b/shared/source/command_stream/csr_definitions.h index 45cfd82455..7e883d4402 100644 --- a/shared/source/command_stream/csr_definitions.h +++ b/shared/source/command_stream/csr_definitions.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -68,36 +68,37 @@ struct DispatchFlags { uint64_t sliceCountP, bool blockingP, bool dcFlushP, bool useSLMP, bool guardCommandBufferWithPipeControlP, bool gsba32BitRequiredP, bool requiresCoherencyP, bool lowPriorityP, bool implicitFlushP, bool outOfOrderExecutionAllowedP, bool epilogueRequiredP, bool usePerDSSbackedBufferP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP, bool textureCacheFlush, - bool hasStallingCmds, bool hasRelaxedOrderingDependencies) : csrDependencies(csrDependenciesP), - barrierTimestampPacketNodes(barrierTimestampPacketNodesP), - pipelineSelectArgs(pipelineSelectArgsP), - 
flushStampReference(flushStampReferenceP), - throttle(throttleP), - preemptionMode(preemptionModeP), - numGrfRequired(numGrfRequiredP), - l3CacheSettings(l3CacheSettingsP), - threadArbitrationPolicy(threadArbitrationPolicyP), - additionalKernelExecInfo(additionalKernelExecInfoP), - kernelExecutionType(kernelExecutionTypeP), - memoryCompressionState(memoryCompressionStateP), - sliceCount(sliceCountP), - blocking(blockingP), - dcFlush(dcFlushP), - useSLM(useSLMP), - guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP), - gsba32BitRequired(gsba32BitRequiredP), - requiresCoherency(requiresCoherencyP), - lowPriority(lowPriorityP), - implicitFlush(implicitFlushP), - outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP), - epilogueRequired(epilogueRequiredP), - usePerDssBackedBuffer(usePerDSSbackedBufferP), - useGlobalAtomics(useGlobalAtomicsP), - areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP), - memoryMigrationRequired(memoryMigrationRequiredP), - textureCacheFlush(textureCacheFlush), - hasStallingCmds(hasStallingCmds), - hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies){}; + bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool stateCacheInvalidation) : csrDependencies(csrDependenciesP), + barrierTimestampPacketNodes(barrierTimestampPacketNodesP), + pipelineSelectArgs(pipelineSelectArgsP), + flushStampReference(flushStampReferenceP), + throttle(throttleP), + preemptionMode(preemptionModeP), + numGrfRequired(numGrfRequiredP), + l3CacheSettings(l3CacheSettingsP), + threadArbitrationPolicy(threadArbitrationPolicyP), + additionalKernelExecInfo(additionalKernelExecInfoP), + kernelExecutionType(kernelExecutionTypeP), + memoryCompressionState(memoryCompressionStateP), + sliceCount(sliceCountP), + blocking(blockingP), + dcFlush(dcFlushP), + useSLM(useSLMP), + guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP), + gsba32BitRequired(gsba32BitRequiredP), + requiresCoherency(requiresCoherencyP), + 
lowPriority(lowPriorityP), + implicitFlush(implicitFlushP), + outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP), + epilogueRequired(epilogueRequiredP), + usePerDssBackedBuffer(usePerDSSbackedBufferP), + useGlobalAtomics(useGlobalAtomicsP), + areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP), + memoryMigrationRequired(memoryMigrationRequiredP), + textureCacheFlush(textureCacheFlush), + hasStallingCmds(hasStallingCmds), + hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies), + stateCacheInvalidation(stateCacheInvalidation){}; CsrDependencies csrDependencies; TimestampPacketContainer *barrierTimestampPacketNodes = nullptr; @@ -131,6 +132,7 @@ struct DispatchFlags { bool hasStallingCmds = false; bool hasRelaxedOrderingDependencies = false; bool disableEUFusion = false; + bool stateCacheInvalidation = false; }; struct CsrSizeRequestFlags { diff --git a/shared/test/common/helpers/dispatch_flags_helper.h b/shared/test/common/helpers/dispatch_flags_helper.h index dbfdc3aa23..50fe58e051 100644 --- a/shared/test/common/helpers/dispatch_flags_helper.h +++ b/shared/test/common/helpers/dispatch_flags_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2022 Intel Corporation + * Copyright (C) 2019-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -43,7 +43,8 @@ struct DispatchFlagsHelper { false, // memoryMigrationRequired false, // textureCacheFlush false, // hasStallingCmds - false // hasRelaxedOrderingDependencies + false, // hasRelaxedOrderingDependencies + false // stateCacheInvalidation ); } }; diff --git a/shared/test/unit_test/command_stream/compute_mode_tests.h b/shared/test/unit_test/command_stream/compute_mode_tests.h index faf34408d0..5fd8f25512 100644 --- a/shared/test/unit_test/command_stream/compute_mode_tests.h +++ b/shared/test/unit_test/command_stream/compute_mode_tests.h @@ -97,6 +97,6 @@ struct ComputeModeRequirements : public ::testing::Test { CommandStreamReceiver *csr = nullptr; std::unique_ptr device; - 
DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}; + DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}; GraphicsAllocation *alloc = nullptr; };