diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index 6a540fad30..c8540dbf86 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -143,12 +143,6 @@ void HardwareInterface::dispatchWalker( DebugPauseState::hasUserStartConfirmation, hwInfo); } - mainKernel->performKernelTuning(commandQueue.getGpgpuCommandStreamReceiver(), - multiDispatchInfo.begin()->getLocalWorkgroupSize(), - multiDispatchInfo.begin()->getActualWorkgroupSize(), - multiDispatchInfo.begin()->getOffset(), - walkerArgs.currentTimestampPacketNodes); - walkerArgs.currentDispatchIndex = 0; for (auto &dispatchInfo : multiDispatchInfo) { diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 9799083843..2b95da2894 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -1230,88 +1230,10 @@ inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceive } } -void Kernel::performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3 &lws, const Vec3 &gws, const Vec3 &offsets, TimestampPacketContainer *timestampContainer) { - auto performTunning = TunningType::disabled; - - if (debugManager.flags.EnableKernelTunning.get() != -1) { - performTunning = static_cast(debugManager.flags.EnableKernelTunning.get()); - } - - if (performTunning == TunningType::full) { - KernelConfig config{gws, lws, offsets}; - - auto submissionDataIt = this->kernelSubmissionMap.find(config); - if (submissionDataIt == this->kernelSubmissionMap.end()) { - KernelSubmissionData submissionData; - submissionData.kernelStandardTimestamps = std::make_unique(); - submissionData.kernelSubdeviceTimestamps = std::make_unique(); - submissionData.status = TunningStatus::standardTunningInProgress; - submissionData.kernelStandardTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer); - this->kernelSubmissionMap[config] = std::move(submissionData); - this->singleSubdevicePreferredInCurrentEnqueue = false; - return; - } - - auto &submissionData = submissionDataIt->second; - - if (submissionData.status == TunningStatus::tunningDone) { - this->singleSubdevicePreferredInCurrentEnqueue = submissionData.singleSubdevicePreferred; - } - - if (submissionData.status == TunningStatus::subdeviceTunningInProgress) { - if (this->hasTunningFinished(submissionData)) { - submissionData.status = TunningStatus::tunningDone; - submissionData.kernelStandardTimestamps.reset(); - submissionData.kernelSubdeviceTimestamps.reset(); - this->singleSubdevicePreferredInCurrentEnqueue = submissionData.singleSubdevicePreferred; - } else { - this->singleSubdevicePreferredInCurrentEnqueue = false; - } - } - - if (submissionData.status == TunningStatus::standardTunningInProgress) { - submissionData.status = TunningStatus::subdeviceTunningInProgress; - submissionData.kernelSubdeviceTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer); - this->singleSubdevicePreferredInCurrentEnqueue = true; - } - } -} - -bool Kernel::hasTunningFinished(KernelSubmissionData &submissionData) { - if (!this->hasRunFinished(submissionData.kernelStandardTimestamps.get()) || - !this->hasRunFinished(submissionData.kernelSubdeviceTimestamps.get())) { - return false; - } - - uint64_t globalStartTS = 0u; - uint64_t globalEndTS = 0u; - - Event::getBoundaryTimestampValues(submissionData.kernelStandardTimestamps.get(), globalStartTS, globalEndTS); - auto standardTSDiff = globalEndTS - globalStartTS; - - Event::getBoundaryTimestampValues(submissionData.kernelSubdeviceTimestamps.get(), globalStartTS, globalEndTS); - auto subdeviceTSDiff = globalEndTS - globalStartTS; - - submissionData.singleSubdevicePreferred = standardTSDiff > subdeviceTSDiff; - - return true; -} - -bool Kernel::hasRunFinished(TimestampPacketContainer *timestampContainer) { - for (const auto &node : timestampContainer->peekNodes()) { - for (uint32_t i = 0; i < node->getPacketsUsed(); i++) { - if (node->getContextEndValue(i) == 1) { - return false; - } - } - } - return true; -} - bool Kernel::isSingleSubdevicePreferred() const { auto &gfxCoreHelper = this->getGfxCoreHelper(); - return this->singleSubdevicePreferredInCurrentEnqueue || gfxCoreHelper.singleTileExecImplicitScalingRequired(this->usesSyncBuffer()); + return gfxCoreHelper.singleTileExecImplicitScalingRequired(this->usesSyncBuffer()); } void Kernel::setInlineSamplers() { diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h index 152cdf2e9b..269f0f5e18 100644 --- a/opencl/source/kernel/kernel.h +++ b/opencl/source/kernel/kernel.h @@ -74,18 +74,6 @@ class Kernel : public ReferenceTrackedObject, NEO::NonCopyableAndNonMova bool isSetToNullptr = false; }; - enum class TunningStatus { - standardTunningInProgress, - subdeviceTunningInProgress, - tunningDone - }; - - enum class TunningType { - disabled, - simple, - full - }; - typedef int32_t (Kernel::*KernelArgHandler)(uint32_t argIndex, size_t argSize, const void *argVal); @@ -279,7 +267,6 @@ class Kernel : public ReferenceTrackedObject, NEO::NonCopyableAndNonMova bool isVmeKernel() const { return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesVme; } bool requiresSystolicPipelineSelectMode() const { return systolicPipelineSelectMode; } - void performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3 &lws, const Vec3 &gws, const Vec3 &offsets, TimestampPacketContainer *timestampContainer); MOCKABLE_VIRTUAL bool isSingleSubdevicePreferred() const; void setInlineSamplers(); @@ -405,41 +392,6 @@ class Kernel : public ReferenceTrackedObject, NEO::NonCopyableAndNonMova } protected: - struct KernelConfig { - Vec3 gws; - Vec3 lws; - Vec3 offsets; - bool operator==(const KernelConfig &other) const { return this->gws == other.gws && this->lws == other.lws && this->offsets == other.offsets; } - }; - struct KernelConfigHash { - size_t operator()(KernelConfig const &config) const { - auto hash = std::hash{}; - size_t gwsHashX = hash(config.gws.x); - size_t gwsHashY = hash(config.gws.y); - size_t gwsHashZ = hash(config.gws.z); - size_t gwsHash = hashCombine(gwsHashX, gwsHashY, gwsHashZ); - size_t lwsHashX = hash(config.lws.x); - size_t lwsHashY = hash(config.lws.y); - size_t lwsHashZ = hash(config.lws.z); - size_t lwsHash = hashCombine(lwsHashX, lwsHashY, lwsHashZ); - size_t offsetsHashX = hash(config.offsets.x); - size_t offsetsHashY = hash(config.offsets.y); - size_t offsetsHashZ = hash(config.offsets.z); - size_t offsetsHash = hashCombine(offsetsHashX, offsetsHashY, offsetsHashZ); - return hashCombine(gwsHash, lwsHash, offsetsHash); - } - - size_t hashCombine(size_t hash1, size_t hash2, size_t hash3) const { - return (hash1 ^ (hash2 << 1u)) ^ (hash3 << 2u); - } - }; - struct KernelSubmissionData { - std::unique_ptr kernelStandardTimestamps; - std::unique_ptr kernelSubdeviceTimestamps; - TunningStatus status; - bool singleSubdevicePreferred = false; - }; - Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice); void makeArgsResident(CommandStreamReceiver &commandStreamReceiver); @@ -462,9 +414,6 @@ class Kernel : public ReferenceTrackedObject, NEO::NonCopyableAndNonMova } cl_int patchPrivateSurface(); - bool hasTunningFinished(KernelSubmissionData &submissionData); - bool hasRunFinished(TimestampPacketContainer *timestampContainer); - void initializeLocalIdsCache(); std::unique_ptr localIdsCache; @@ -472,8 +421,6 @@ class Kernel : public ReferenceTrackedObject, NEO::NonCopyableAndNonMova std::map migratableArgsMap{}; - std::unordered_map kernelSubmissionMap; - std::vector kernelArguments; std::vector kernelArgHandlers; std::vector kernelSvmGfxAllocations; @@ -522,7 +469,6 @@ class Kernel : public ReferenceTrackedObject, NEO::NonCopyableAndNonMova bool auxTranslationRequired = false; bool systolicPipelineSelectMode = false; bool isUnifiedMemorySyncRequired = true; - bool singleSubdevicePreferredInCurrentEnqueue = false; bool kernelHasIndirectAccess = true; bool anyKernelArgumentUsingSystemMemory = false; bool isDestinationAllocationInSystemMemory = false; diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index 7cd86b0748..a17b32695f 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -423,13 +423,9 @@ TEST_F(KernelTests, WhenIsSingleSubdevicePreferredIsCalledThenCorrectValuesAreRe std::unique_ptr kernel{MockKernel::create(pClDevice->getDevice(), pProgram)}; for (auto usesSyncBuffer : ::testing::Bool()) { kernel->getAllocatedKernelInfo()->kernelDescriptor.kernelAttributes.flags.usesSyncBuffer = usesSyncBuffer; - for (auto singleSubdevicePreferredInCurrentEnqueue : ::testing::Bool()) { - kernel->singleSubdevicePreferredInCurrentEnqueue = singleSubdevicePreferredInCurrentEnqueue; - EXPECT_EQ(usesSyncBuffer, kernel->usesSyncBuffer()); - auto expectedSingleSubdevicePreferredInCurrentEnqueue = singleSubdevicePreferredInCurrentEnqueue || helper.singleTileExecImplicitScalingRequired(usesSyncBuffer); - EXPECT_EQ(expectedSingleSubdevicePreferredInCurrentEnqueue, kernel->isSingleSubdevicePreferred()); - } + EXPECT_EQ(usesSyncBuffer, kernel->usesSyncBuffer()); + EXPECT_EQ(helper.singleTileExecImplicitScalingRequired(usesSyncBuffer), kernel->isSingleSubdevicePreferred()); } } @@ -2371,118 +2367,6 @@ HWTEST_F(KernelResidencyTest, givenKernelWithNoKernelArgAtomicAndImplicitArgsHas memoryManager->freeGraphicsMemory(pKernelInfo->kernelAllocation); } -TEST(KernelConfigTests, givenTwoKernelConfigsWhenCompareThenResultsAreCorrect) { - Vec3 lws{1, 1, 1}; - Vec3 gws{1, 1, 1}; - Vec3 offsets{1, 1, 1}; - MockKernel::KernelConfig config{gws, lws, offsets}; - MockKernel::KernelConfig config2{gws, lws, offsets}; - EXPECT_TRUE(config == config2); - - config2.offsets.z = 2; - EXPECT_FALSE(config == config2); - - config2.lws.z = 2; - config2.offsets.z = 1; - EXPECT_FALSE(config == config2); - - config2.lws.z = 1; - config2.gws.z = 2; - EXPECT_FALSE(config == config2); -} - -HWTEST_F(KernelResidencyTest, givenEnableFullKernelTuningWhenPerformTunningThenKernelConfigDataIsTracked) { - using TimestampPacketType = typename FamilyType::TimestampPacketType; - DebugManagerStateRestore restorer; - debugManager.flags.EnableKernelTunning.set(2u); - - auto &commandStreamReceiver = this->pDevice->getUltCommandStreamReceiver(); - MockKernelWithInternals mockKernel(*this->pClDevice); - - Vec3 lws{1, 1, 1}; - Vec3 gws{1, 1, 1}; - Vec3 offsets{1, 1, 1}; - MockKernel::KernelConfig config{gws, lws, offsets}; - - MockTimestampPacketContainer container(*commandStreamReceiver.getTimestampPacketAllocator(), 1); - MockTimestampPacketContainer subdeviceContainer(*commandStreamReceiver.getTimestampPacketAllocator(), 2); - - auto result = mockKernel.mockKernel->kernelSubmissionMap.find(config); - EXPECT_EQ(result, mockKernel.mockKernel->kernelSubmissionMap.end()); - - mockKernel.mockKernel->performKernelTuning(commandStreamReceiver, lws, gws, offsets, &container); - - result = mockKernel.mockKernel->kernelSubmissionMap.find(config); - EXPECT_NE(result, mockKernel.mockKernel->kernelSubmissionMap.end()); - EXPECT_EQ(result->second.status, MockKernel::TunningStatus::standardTunningInProgress); - EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferredInCurrentEnqueue); - - mockKernel.mockKernel->performKernelTuning(commandStreamReceiver, lws, gws, offsets, &subdeviceContainer); - - result = mockKernel.mockKernel->kernelSubmissionMap.find(config); - EXPECT_NE(result, mockKernel.mockKernel->kernelSubmissionMap.end()); - EXPECT_EQ(result->second.status, MockKernel::TunningStatus::subdeviceTunningInProgress); - EXPECT_TRUE(mockKernel.mockKernel->singleSubdevicePreferredInCurrentEnqueue); - - mockKernel.mockKernel->performKernelTuning(commandStreamReceiver, lws, gws, offsets, &container); - - result = mockKernel.mockKernel->kernelSubmissionMap.find(config); - EXPECT_NE(result, mockKernel.mockKernel->kernelSubmissionMap.end()); - EXPECT_EQ(result->second.status, MockKernel::TunningStatus::subdeviceTunningInProgress); - EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferredInCurrentEnqueue); - - TimestampPacketType data[4] = {static_cast(container.getNode(0u)->getContextStartValue(0)), - static_cast(container.getNode(0u)->getGlobalStartValue(0)), - 2, 2}; - - container.getNode(0u)->assignDataToAllTimestamps(0, data); - - mockKernel.mockKernel->performKernelTuning(commandStreamReceiver, lws, gws, offsets, &container); - - result = mockKernel.mockKernel->kernelSubmissionMap.find(config); - EXPECT_NE(result, mockKernel.mockKernel->kernelSubmissionMap.end()); - EXPECT_EQ(result->second.status, MockKernel::TunningStatus::subdeviceTunningInProgress); - EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferredInCurrentEnqueue); - - data[0] = static_cast(subdeviceContainer.getNode(0u)->getContextStartValue(0)); - data[1] = static_cast(subdeviceContainer.getNode(0u)->getGlobalStartValue(0)); - data[2] = 2; - data[3] = 2; - - subdeviceContainer.getNode(0u)->assignDataToAllTimestamps(0, data); - - mockKernel.mockKernel->performKernelTuning(commandStreamReceiver, lws, gws, offsets, &container); - - result = mockKernel.mockKernel->kernelSubmissionMap.find(config); - EXPECT_NE(result, mockKernel.mockKernel->kernelSubmissionMap.end()); - EXPECT_NE(result->second.kernelStandardTimestamps.get(), nullptr); - EXPECT_NE(result->second.kernelSubdeviceTimestamps.get(), nullptr); - EXPECT_EQ(result->second.status, MockKernel::TunningStatus::subdeviceTunningInProgress); - EXPECT_FALSE(mockKernel.mockKernel->singleSubdevicePreferredInCurrentEnqueue); - - data[0] = static_cast(subdeviceContainer.getNode(1u)->getContextStartValue(0)); - data[1] = static_cast(subdeviceContainer.getNode(1u)->getGlobalStartValue(0)); - data[2] = 2; - data[3] = 2; - - subdeviceContainer.getNode(1u)->assignDataToAllTimestamps(0, data); - - mockKernel.mockKernel->performKernelTuning(commandStreamReceiver, lws, gws, offsets, &container); - - result = mockKernel.mockKernel->kernelSubmissionMap.find(config); - EXPECT_NE(result, mockKernel.mockKernel->kernelSubmissionMap.end()); - EXPECT_EQ(result->second.kernelStandardTimestamps.get(), nullptr); - EXPECT_EQ(result->second.kernelSubdeviceTimestamps.get(), nullptr); - EXPECT_EQ(result->second.status, MockKernel::TunningStatus::tunningDone); - EXPECT_EQ(result->second.singleSubdevicePreferred, mockKernel.mockKernel->singleSubdevicePreferredInCurrentEnqueue); - - mockKernel.mockKernel->performKernelTuning(commandStreamReceiver, lws, gws, offsets, &container); - result = mockKernel.mockKernel->kernelSubmissionMap.find(config); - EXPECT_NE(result, mockKernel.mockKernel->kernelSubmissionMap.end()); - EXPECT_EQ(result->second.status, MockKernel::TunningStatus::tunningDone); - EXPECT_EQ(result->second.singleSubdevicePreferred, mockKernel.mockKernel->singleSubdevicePreferredInCurrentEnqueue); -} - HWTEST_F(KernelResidencyTest, givenSimpleKernelWhenExecEnvDoesNotHavePageFaultManagerThenPageFaultDoesNotMoveAllocation) { auto mockPageFaultManager = std::make_unique(); MockKernelWithInternals mockKernel(*this->pClDevice); diff --git a/opencl/test/unit_test/mocks/mock_kernel.h b/opencl/test/unit_test/mocks/mock_kernel.h index c90838a11c..beead0909f 100644 --- a/opencl/test/unit_test/mocks/mock_kernel.h +++ b/opencl/test/unit_test/mocks/mock_kernel.h @@ -111,9 +111,7 @@ class MockKernel : public Kernel { using Kernel::isUnifiedMemorySyncRequired; using Kernel::kernelArgHandlers; using Kernel::kernelArguments; - using Kernel::KernelConfig; using Kernel::kernelHasIndirectAccess; - using Kernel::kernelSubmissionMap; using Kernel::kernelSvmGfxAllocations; using Kernel::kernelUnifiedMemoryGfxAllocations; using Kernel::localBindingTableOffset; @@ -129,7 +127,6 @@ class MockKernel : public Kernel { using Kernel::preferredWkgMultipleOffset; using Kernel::privateSurface; using Kernel::setInlineSamplers; - using Kernel::singleSubdevicePreferredInCurrentEnqueue; using Kernel::unifiedMemoryControls; using Kernel::implicitArgsVersion;