diff --git a/core/command_stream/csr_definitions.h b/core/command_stream/csr_definitions.h index 101def9273..58b9497069 100644 --- a/core/command_stream/csr_definitions.h +++ b/core/command_stream/csr_definitions.h @@ -50,26 +50,27 @@ struct DispatchFlags { uint32_t l3CacheSettings, uint32_t threadArbitrationPolicy, uint64_t sliceCount, bool blocking, bool dcFlush, bool useSLM, bool guardCommandBufferWithPipeControl, bool gsba32BitRequired, bool requiresCoherency, bool lowPriority, bool implicitFlush, - bool outOfOrderExecutionAllowed, bool epilogueRequired) : csrDependencies(csrDependencies), - barrierTimestampPacketNodes(barrierTimestampPacketNodes), - pipelineSelectArgs(pipelineSelectArgs), - flushStampReference(flushStampReference), - throttle(throttle), - preemptionMode(preemptionMode), - numGrfRequired(numGrfRequired), - l3CacheSettings(l3CacheSettings), - threadArbitrationPolicy(threadArbitrationPolicy), - sliceCount(sliceCount), - blocking(blocking), - dcFlush(dcFlush), - useSLM(useSLM), - guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControl), - gsba32BitRequired(gsba32BitRequired), - requiresCoherency(requiresCoherency), - lowPriority(lowPriority), - implicitFlush(implicitFlush), - outOfOrderExecutionAllowed(outOfOrderExecutionAllowed), - epilogueRequired(epilogueRequired){}; + bool outOfOrderExecutionAllowed, bool epilogueRequired, bool usePerDSSbackedBuffer) : csrDependencies(csrDependencies), + barrierTimestampPacketNodes(barrierTimestampPacketNodes), + pipelineSelectArgs(pipelineSelectArgs), + flushStampReference(flushStampReference), + throttle(throttle), + preemptionMode(preemptionMode), + numGrfRequired(numGrfRequired), + l3CacheSettings(l3CacheSettings), + threadArbitrationPolicy(threadArbitrationPolicy), + sliceCount(sliceCount), + blocking(blocking), + dcFlush(dcFlush), + useSLM(useSLM), + guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControl), + gsba32BitRequired(gsba32BitRequired), + requiresCoherency(requiresCoherency), + lowPriority(lowPriority), + implicitFlush(implicitFlush), + outOfOrderExecutionAllowed(outOfOrderExecutionAllowed), + epilogueRequired(epilogueRequired), + usePerDssBackedBuffer(usePerDSSbackedBuffer){}; CsrDependencies csrDependencies; TimestampPacketContainer *barrierTimestampPacketNodes = nullptr; PipelineSelectArgs pipelineSelectArgs; @@ -91,6 +92,7 @@ struct DispatchFlags { bool implicitFlush = false; bool outOfOrderExecutionAllowed = false; bool epilogueRequired = false; + bool usePerDssBackedBuffer = false; }; struct CsrSizeRequestFlags { diff --git a/core/helpers/preamble_base.inl b/core/helpers/preamble_base.inl index 0780d73ea8..b2c0cd2319 100644 --- a/core/helpers/preamble_base.inl +++ b/core/helpers/preamble_base.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -75,7 +75,7 @@ void PreambleHelper::programPreamble(LinearStream *pCommandStream, De programKernelDebugging(pCommandStream); } programGenSpecificPreambleWorkArounds(pCommandStream, device.getHardwareInfo()); - if (DebugManager.flags.ForcePerDssBackedBufferProgramming.get()) { + if (perDssBackedBuffer != nullptr) { programPerDssBackedBuffer(pCommandStream, device.getHardwareInfo(), perDssBackedBuffer); } } diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 790cfa5082..96fc867acd 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -673,6 +673,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber; auto specialPipelineSelectMode = false; Kernel *kernel = nullptr; + bool usePerDssBackedBuffer = false; for (auto &dispatchInfo : multiDispatchInfo) { if (kernel != dispatchInfo.getKernel()) { @@ -689,6 +690,10 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( if (kernel->hasUncacheableStatelessArgs()) { anyUncacheableArgs = true; } + + if (kernel->requiresPerDssBackedBuffer()) { + usePerDssBackedBuffer = true; + } } if (mediaSamplerRequired) { @@ -747,7 +752,8 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( (QueuePriority::LOW == priority), //lowPriority implicitFlush, //implicitFlush !eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed - false //epilogueRequired + false, //epilogueRequired + usePerDssBackedBuffer //usePerDssBackedBuffer ); dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired; @@ -942,7 +948,8 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( false, //lowPriority (enqueueProperties.operation == EnqueueProperties::Operation::Blit), //implicitFlush getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed - false //epilogueRequired + false, //epilogueRequired + false //usePerDssBackedBuffer ); if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { diff --git a/runtime/command_stream/command_stream_receiver_hw_base.inl b/runtime/command_stream/command_stream_receiver_hw_base.inl index 73e5f01aaf..77318b634d 100644 --- a/runtime/command_stream/command_stream_receiver_hw_base.inl +++ b/runtime/command_stream/command_stream_receiver_hw_base.inl @@ -242,7 +242,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( } } - if (DebugManager.flags.ForcePerDssBackedBufferProgramming.get()) { + if (dispatchFlags.usePerDssBackedBuffer) { if (!perDssBackedBuffer) { createPerDssBackedBuffer(device); } @@ -730,7 +730,8 @@ inline void CommandStreamReceiverHw::programStateSip(LinearStream &cm template inline void CommandStreamReceiverHw::programPreamble(LinearStream &csr, Device &device, DispatchFlags &dispatchFlags, uint32_t &newL3Config) { if (!this->isPreambleSent) { - PreambleHelper::programPreamble(&csr, device, newL3Config, this->requiredThreadArbitrationPolicy, this->preemptionAllocation, this->perDssBackedBuffer); + GraphicsAllocation *perDssBackedBufferToUse = dispatchFlags.usePerDssBackedBuffer ? this->perDssBackedBuffer : nullptr; + PreambleHelper::programPreamble(&csr, device, newL3Config, this->requiredThreadArbitrationPolicy, this->preemptionAllocation, perDssBackedBufferToUse); this->isPreambleSent = true; this->lastSentL3Config = newL3Config; this->lastSentThreadArbitrationPolicy = this->requiredThreadArbitrationPolicy; diff --git a/runtime/helpers/task_information.cpp b/runtime/helpers/task_information.cpp index 1219c6e5d4..67f4036c7a 100644 --- a/runtime/helpers/task_information.cpp +++ b/runtime/helpers/task_information.cpp @@ -69,7 +69,8 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) { commandQueue.getPriority() == QueuePriority::LOW, //lowPriority false, //implicitFlush commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed - false //epilogueRequired + false, //epilogueRequired + false //usePerDssBackedBuffer ); DEBUG_BREAK_IF(taskLevel >= CompletionStamp::levelNotReady); @@ -227,7 +228,8 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate commandQueue.getPriority() == QueuePriority::LOW, //lowPriority false, //implicitFlush commandQueue.getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed - false //epilogueRequired + false, //epilogueRequired + kernel->requiresPerDssBackedBuffer() //usePerDssBackedBuffer ); if (timestampPacketDependencies) { @@ -332,7 +334,8 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate commandQueue.getPriority() == QueuePriority::LOW, //lowPriority false, //implicitFlush commandStreamReceiver.isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed - false //epilogueRequired + false, //epilogueRequired + false //usePerDssBackedBuffer ); UNRECOVERABLE_IF(!commandStreamReceiver.peekTimestampPacketWriteEnabled()); diff --git a/runtime/kernel/kernel.h b/runtime/kernel/kernel.h index b28e065ff1..7775099731 100644 --- a/runtime/kernel/kernel.h +++ b/runtime/kernel/kernel.h @@ -408,6 +408,8 @@ class Kernel : public BaseObject<_cl_kernel> { const bool kernelUsesLocalIds, const bool isCssUsed) const; + bool requiresPerDssBackedBuffer() const; + protected: struct ObjectCounts { uint32_t imageCount; diff --git a/runtime/kernel/kernel_extra.cpp b/runtime/kernel/kernel_extra.cpp index ce6c0f77e3..a059ba9e72 100644 --- a/runtime/kernel/kernel_extra.cpp +++ b/runtime/kernel/kernel_extra.cpp @@ -5,6 +5,7 @@ * */ +#include "core/debug_settings/debug_settings_manager.h" #include "runtime/kernel/kernel.h" namespace NEO { @@ -26,4 +27,9 @@ int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) { } return CL_SUCCESS; } + +bool Kernel::requiresPerDssBackedBuffer() const { + return DebugManager.flags.ForcePerDssBackedBufferProgramming.get(); +} + } // namespace NEO \ No newline at end of file diff --git a/unit_tests/command_queue/command_queue_hw_tests.cpp b/unit_tests/command_queue/command_queue_hw_tests.cpp index eec8da9a9b..92ebbd31ef 100644 --- a/unit_tests/command_queue/command_queue_hw_tests.cpp +++ b/unit_tests/command_queue/command_queue_hw_tests.cpp @@ -1308,3 +1308,20 @@ HWTEST_F(CommandQueueHwTest, givenFinishWhenFlushBatchedSubmissionsFailsThenErro HWTEST_F(CommandQueueHwTest, givenEmptyDispatchGlobalsArgsWhenEnqueueInitDispatchGlobalsCalledThenErrorIsReturned) { EXPECT_EQ(CL_INVALID_VALUE, pCmdQ->enqueueInitDispatchGlobals(nullptr, 0, nullptr, nullptr)); } + +HWTEST_F(CommandQueueHwTest, WhenForcePerDssBackedBufferProgrammingSetThenDispatchFlagsAreSetAccordingly) { + DebugManagerStateRestore restore; + DebugManager.flags.ForcePerDssBackedBufferProgramming = true; + + MockKernelWithInternals mockKernelWithInternals(*pClDevice); + auto mockKernel = mockKernelWithInternals.mockKernel; + auto &csr = pDevice->getUltCommandStreamReceiver(); + + size_t offset = 0; + size_t gws = 64; + size_t lws = 16; + + cl_int status = pCmdQ->enqueueKernel(mockKernel, 1, &offset, &gws, &lws, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, status); + EXPECT_TRUE(csr.recordedDispatchFlags.usePerDssBackedBuffer); +} diff --git a/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp index 7942a4c136..2bfe07398e 100644 --- a/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp +++ b/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp @@ -1578,11 +1578,10 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, whenPerDssBackBufferIsAllocatedItI } HWTEST_F(CommandStreamReceiverFlushTaskTests, whenPerDssBackBufferProgrammingEnabledThenAllocationIsCreated) { - DebugManagerStateRestore restore; - DebugManager.flags.ForcePerDssBackedBufferProgramming.set(true); auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.usePerDssBackedBuffer = true; commandStreamReceiver.flushTask(commandStream, 0, @@ -1598,14 +1597,12 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, whenPerDssBackBufferProgrammingEna } HWTEST_F(CommandStreamReceiverFlushTaskTests, whenPerDssBackBufferProgrammingEnabledAndPerDssBackedBufferAlreadyPresentThenNewAllocationIsNotCreated) { - DebugManagerStateRestore restore; - DebugManager.flags.ForcePerDssBackedBufferProgramming.set(true); - auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); auto memoryManager = pDevice->getMemoryManager(); commandStreamReceiver.perDssBackedBuffer = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{MemoryConstants::pageSize}); DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.usePerDssBackedBuffer = true; commandStreamReceiver.flushTask(commandStream, 0, diff --git a/unit_tests/command_stream/compute_mode_tests.h b/unit_tests/command_stream/compute_mode_tests.h index 1eee9b4b63..d9be49ad6f 100644 --- a/unit_tests/command_stream/compute_mode_tests.h +++ b/unit_tests/command_stream/compute_mode_tests.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -73,6 +73,6 @@ struct ComputeModeRequirements : public ::testing::Test { CommandStreamReceiver *csr = nullptr; std::unique_ptr device; - DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false}; + DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false}; GraphicsAllocation *alloc = nullptr; }; diff --git a/unit_tests/helpers/dispatch_flags_helper.h b/unit_tests/helpers/dispatch_flags_helper.h index f6fb351930..406fa2c50d 100644 --- a/unit_tests/helpers/dispatch_flags_helper.h +++ b/unit_tests/helpers/dispatch_flags_helper.h @@ -32,7 +32,8 @@ struct DispatchFlagsHelper { false, //lowPriority false, //implicitFlush false, //outOfOrderExecutionAllowed - false //epilogueRequired + false, //epilogueRequired + false //usePerDssBackedBuffer ); } }; diff --git a/unit_tests/kernel/kernel_tests.cpp b/unit_tests/kernel/kernel_tests.cpp index 5a99b484f5..5c8626d1aa 100644 --- a/unit_tests/kernel/kernel_tests.cpp +++ b/unit_tests/kernel/kernel_tests.cpp @@ -2993,6 +2993,23 @@ TEST(KernelTest, givenKernelLocalIdGenerationByRuntimeFalseAndLocalIdsNotUsedWhe device->getMemoryManager()->freeGraphicsMemory(mockKernel.kernelInfo.getGraphicsAllocation()); } +TEST(KernelTest, givenKernelWhenForcePerDssBackedBufferProgrammingIsSetThenKernelRequiresPerDssBackedBuffer) { + DebugManagerStateRestore restore; + DebugManager.flags.ForcePerDssBackedBufferProgramming.set(true); + + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(platformDevices[0])); + MockKernelWithInternals kernel(*device); + + EXPECT_TRUE(kernel.mockKernel->requiresPerDssBackedBuffer()); +} + +TEST(KernelTest, givenKernelWhenForcePerDssBackedBufferProgrammingIsNotSetThenKernelDoesntRequirePerDssBackedBuffer) { + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(platformDevices[0])); + MockKernelWithInternals kernel(*device); + + EXPECT_FALSE(kernel.mockKernel->requiresPerDssBackedBuffer()); +} + namespace NEO { template