mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-20 00:24:58 +08:00
performance: Cache suggest group size
Resolves: NEO-7968
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
1146f42bcb
commit
0cf975605b
@@ -374,6 +374,16 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
|
||||
uint32_t dim = (globalSizeY > 1U) ? 2 : 1U;
|
||||
dim = (globalSizeZ > 1U) ? 3 : dim;
|
||||
|
||||
auto cachedGroupSize = std::find_if(this->suggestGroupSizeCache.begin(), this->suggestGroupSizeCache.end(), [&](const auto &other) {
|
||||
return other.first == workItems;
|
||||
});
|
||||
if (cachedGroupSize != this->suggestGroupSizeCache.end()) {
|
||||
*groupSizeX = static_cast<uint32_t>(cachedGroupSize->second.x);
|
||||
*groupSizeY = static_cast<uint32_t>(cachedGroupSize->second.y);
|
||||
*groupSizeZ = static_cast<uint32_t>(cachedGroupSize->second.z);
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
if (NEO::DebugManager.flags.EnableComputeWorkSizeND.get()) {
|
||||
auto usesImages = kernelDescriptor.kernelAttributes.flags.usesImages;
|
||||
auto neoDevice = module->getDevice()->getNEODevice();
|
||||
@@ -402,6 +412,7 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
|
||||
*groupSizeX = static_cast<uint32_t>(retGroupSize[0]);
|
||||
*groupSizeY = static_cast<uint32_t>(retGroupSize[1]);
|
||||
*groupSizeZ = static_cast<uint32_t>(retGroupSize[2]);
|
||||
this->suggestGroupSizeCache.push_back(std::make_pair(Vec3(workItems), Vec3(retGroupSize)));
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "shared/source/command_stream/thread_arbitration_policy.h"
|
||||
#include "shared/source/helpers/vec.h"
|
||||
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
|
||||
#include "shared/source/memory_manager/unified_memory_manager.h"
|
||||
#include "shared/source/unified_memory/unified_memory.h"
|
||||
@@ -16,6 +17,7 @@
|
||||
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
|
||||
namespace L0 {
|
||||
|
||||
@@ -236,6 +238,9 @@ struct KernelImp : Kernel {
|
||||
|
||||
std::unique_ptr<KernelExt> pExtension;
|
||||
std::mutex printfLock;
|
||||
|
||||
using SuggestGroupSizeCacheT = std::vector<std::pair<Vec3<size_t>, Vec3<size_t>>>;
|
||||
SuggestGroupSizeCacheT suggestGroupSizeCache;
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -63,6 +63,7 @@ struct WhiteBox<::L0::Kernel> : public ::L0::KernelImp {
|
||||
using ::L0::KernelImp::requiredWorkgroupOrder;
|
||||
using ::L0::KernelImp::residencyContainer;
|
||||
using ::L0::KernelImp::setAssertBuffer;
|
||||
using ::L0::KernelImp::suggestGroupSizeCache;
|
||||
using ::L0::KernelImp::surfaceStateHeapData;
|
||||
using ::L0::KernelImp::surfaceStateHeapDataSize;
|
||||
using ::L0::KernelImp::unifiedMemoryControls;
|
||||
|
||||
@@ -112,6 +112,71 @@ TEST_F(KernelImp, WhenSuggestingGroupSizeThenClampToMaxGroupSize) {
|
||||
EXPECT_EQ(1U, groupSize[2]);
|
||||
}
|
||||
|
||||
// Checks that KernelImp::suggestGroupSize stores one entry per distinct global
// size in suggestGroupSizeCache, that repeating a query does not add a second
// entry, and that the values written to the output pointers match the cached
// suggestion for that query.
TEST_F(KernelImp, WhenSuggestingGroupSizeThenCacheValues) {
    DebugManagerStateRestore restorer;

    WhiteBox<KernelImmutableData> kernelInfo = {};
    NEO::KernelDescriptor descriptor;
    kernelInfo.kernelDescriptor = &descriptor;

    NEO::DebugManager.flags.EnableComputeWorkSizeND.set(false);

    Mock<Module> module(device, nullptr);
    module.getMaxGroupSizeResult = 8;

    Mock<Kernel> kernel;
    kernel.kernelImmData = &kernelInfo;
    kernel.module = &module;

    uint32_t groupSize[3];

    // Verifies the key (global size) and value (suggested group size) stored at
    // the given cache position. Every query in this test is one-dimensional and
    // is expected to yield a group size of {8, 1, 1}.
    auto expectCacheEntry = [&](size_t index, size_t globalSizeX) {
        auto &entry = kernel.suggestGroupSizeCache[index];
        EXPECT_EQ(entry.first[0], globalSizeX);
        EXPECT_EQ(entry.first[1], 1u);
        EXPECT_EQ(entry.first[2], 1u);
        EXPECT_EQ(entry.second[0], 8u);
        EXPECT_EQ(entry.second[1], 1u);
        EXPECT_EQ(entry.second[2], 1u);
    };

    // Verifies that the values returned through the output pointers are the
    // ones held by the given cache entry.
    auto expectReturnedSizeMatchesEntry = [&](size_t index) {
        auto &entry = kernel.suggestGroupSizeCache[index];
        EXPECT_EQ(entry.second[0], groupSize[0]);
        EXPECT_EQ(entry.second[1], groupSize[1]);
        EXPECT_EQ(entry.second[2], groupSize[2]);
    };

    EXPECT_EQ(kernel.suggestGroupSizeCache.size(), 0u);

    // First query: nothing cached yet, so a new entry must be appended.
    kernel.KernelImp::suggestGroupSize(256, 1, 1, groupSize, groupSize + 1, groupSize + 2);

    // ASSERT (not EXPECT) so a wrong size aborts before the cache is indexed.
    ASSERT_EQ(kernel.suggestGroupSizeCache.size(), 1u);
    expectCacheEntry(0, 256u);
    expectReturnedSizeMatchesEntry(0);

    // Same query again: the cache must not grow.
    kernel.KernelImp::suggestGroupSize(256, 1, 1, groupSize, groupSize + 1, groupSize + 2);

    ASSERT_EQ(kernel.suggestGroupSizeCache.size(), 1u);
    expectCacheEntry(0, 256u);
    expectReturnedSizeMatchesEntry(0);

    // Different global size: a second entry is appended after the first.
    kernel.KernelImp::suggestGroupSize(2048, 1, 1, groupSize, groupSize + 1, groupSize + 2);

    ASSERT_EQ(kernel.suggestGroupSizeCache.size(), 2u);
    expectCacheEntry(0, 256u);
    expectCacheEntry(1, 2048u);
    // The returned values belong to the latest query, i.e. the second entry.
    expectReturnedSizeMatchesEntry(1);
}
|
||||
|
||||
class KernelImpSuggestGroupSize : public DeviceFixture, public ::testing::TestWithParam<uint32_t> {
|
||||
public:
|
||||
void SetUp() override {
|
||||
@@ -201,24 +266,14 @@ TEST_P(KernelImpSuggestGroupSize, WhenSlmSizeExceedsLocalMemorySizeAndSuggesting
|
||||
function.kernelImmData = &funcInfo;
|
||||
function.module = &module;
|
||||
uint32_t groupSize[3];
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, function.KernelImp::suggestGroupSize(size, 1, 1, groupSize, groupSize + 1, groupSize + 2));
|
||||
|
||||
::testing::internal::CaptureStderr();
|
||||
|
||||
auto localMemSize = static_cast<uint32_t>(device->getNEODevice()->getDeviceInfo().localMemSize);
|
||||
|
||||
::testing::internal::CaptureStderr();
|
||||
|
||||
funcInfo.kernelDescriptor->kernelAttributes.slmInlineSize = localMemSize - 10u;
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, function.KernelImp::suggestGroupSize(size, 1, 1, groupSize, groupSize + 1, groupSize + 2));
|
||||
|
||||
std::string output = testing::internal::GetCapturedStderr();
|
||||
EXPECT_EQ(std::string(""), output);
|
||||
|
||||
::testing::internal::CaptureStderr();
|
||||
|
||||
funcInfo.kernelDescriptor->kernelAttributes.slmInlineSize = localMemSize + 10u;
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, function.KernelImp::suggestGroupSize(size, 1, 1, groupSize, groupSize + 1, groupSize + 2));
|
||||
|
||||
output = testing::internal::GetCapturedStderr();
|
||||
auto output = testing::internal::GetCapturedStderr();
|
||||
const auto &slmInlineSize = funcInfo.kernelDescriptor->kernelAttributes.slmInlineSize;
|
||||
std::string expectedOutput = "Size of SLM (" + std::to_string(slmInlineSize) + ") larger than available (" + std::to_string(localMemSize) + ")\n";
|
||||
EXPECT_EQ(expectedOutput, output);
|
||||
|
||||
Reference in New Issue
Block a user