From 2778043d67ee09d94cf9fe748ed06ee4ec86cd0d Mon Sep 17 00:00:00 2001 From: Maciej Bielski Date: Tue, 31 Jan 2023 12:56:03 +0000 Subject: [PATCH] fix(l0): check for largeGRF when computing maxWorkGroupSize Sizing context (PVC): When using LargeGRF (a.k.a. GRF256) there are only 4 HW threads per EU (instead of the default 8). Together with SIMD16 that means that there can be at most 64 work-items per EU. With 8 EUs per subslice this gives 512 work-items on a single subslice. For correct intra-WG synchronization, all WIs of a work-group must be executed on the same subslice (to access the same SLM, where the synchronization primitives are stored). Thus, with SIMD16 and LargeGRF the work-group size must not exceed 512 (PVC example). So far `maxWorkGroupSize` has been taken solely from the DeviceInfo structure, both in `ModuleTranslationUnit::processUnpackedBinary()` and in `ModuleImp::initialize()`. This approach does not take kernel parameters (LargeGRF) into account, so it allows submitting a kernel that uses LargeGRF with SIMD16 and a work-group size of 1024, which leads to a hang. Fix the `.maxWorkGroupSize` computation so that it takes the kernel parameters into consideration. Add new tests (for discrete platforms >= XeHP), adapt the existing ones, and fix cosmetics along the way.
Similar check for OCL: https://github.com/intel/compute-runtime/blob/master/opencl/source/command_queue/enqueue_kernel.h#L130 Related-To: NEO-7684 Signed-off-by: Maciej Bielski --- level_zero/core/source/kernel/kernel_imp.cpp | 18 ++- level_zero/core/source/module/module.h | 8 +- level_zero/core/source/module/module_imp.cpp | 10 +- level_zero/core/source/module/module_imp.h | 10 +- .../unit_tests/fixtures/module_fixture.cpp | 11 +- .../test/unit_tests/fixtures/module_fixture.h | 6 +- .../core/test/unit_tests/mocks/mock_module.h | 80 +++++++--- .../debugger/test_module_with_debug.cpp | 36 ++++- .../unit_tests/sources/kernel/test_kernel.cpp | 19 ++- .../unit_tests/sources/module/test_module.cpp | 145 +++++++++++++++++- .../sources/module/test_module_2.cpp | 6 + .../xe_hpg_core/test_module_xe_hpg_core.cpp | 8 +- shared/source/helpers/gfx_core_helper.h | 4 + .../helpers/gfx_core_helper_bdw_and_later.inl | 5 + .../gfx_core_helper_xehp_and_later.inl | 9 ++ .../helpers/gfx_core_helper_tests.cpp | 20 +++ .../gfx_core_helper_tests_dg2_and_later.cpp | 22 +++ 17 files changed, 360 insertions(+), 57 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index a3fb7c6783..38e8a0ebc8 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -282,7 +282,11 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, Vec3 groupSize{groupSizeX, groupSizeY, groupSizeZ}; auto itemsInGroup = Math::computeTotalElementsCount(groupSize); - if (itemsInGroup > module->getMaxGroupSize()) { + const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor(); + if (auto maxGroupSize = module->getMaxGroupSize(kernelDescriptor); itemsInGroup > maxGroupSize) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, + "Requested work-group size (%lu) exceeds maximum value (%u) for the kernel \"%s\" \n", + itemsInGroup, 
maxGroupSize, kernelDescriptor.kernelMetadata.kernelName.c_str()); DEBUG_BREAK_IF(true); return ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION; } @@ -290,7 +294,6 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, this->groupSize[0] = groupSizeX; this->groupSize[1] = groupSizeY; this->groupSize[2] = groupSizeZ; - const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor(); for (uint32_t i = 0u; i < 3u; i++) { if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[i] != 0 && kernelDescriptor.kernelAttributes.requiredWorkgroupSize[i] != this->groupSize[i]) { @@ -349,14 +352,15 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz uint32_t globalSizeZ, uint32_t *groupSizeX, uint32_t *groupSizeY, uint32_t *groupSizeZ) { size_t retGroupSize[3] = {}; - auto maxWorkGroupSize = module->getMaxGroupSize(); - auto simd = kernelImmData->getDescriptor().kernelAttributes.simdSize; + const auto &kernelDescriptor = this->getImmutableData()->getDescriptor(); + auto maxWorkGroupSize = module->getMaxGroupSize(kernelDescriptor); + auto simd = kernelDescriptor.kernelAttributes.simdSize; size_t workItems[3] = {globalSizeX, globalSizeY, globalSizeZ}; uint32_t dim = (globalSizeY > 1U) ? 2 : 1U; dim = (globalSizeZ > 1U) ? 
3 : dim; if (NEO::DebugManager.flags.EnableComputeWorkSizeND.get()) { - auto usesImages = getImmutableData()->getDescriptor().kernelAttributes.flags.usesImages; + auto usesImages = kernelDescriptor.kernelAttributes.flags.usesImages; auto neoDevice = module->getDevice()->getNEODevice(); const auto &deviceInfo = neoDevice->getDeviceInfo(); uint32_t numThreadsPerSubSlice = (uint32_t)deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU; @@ -367,9 +371,9 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; } - NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelImmData->getDescriptor().kernelAttributes.usesBarriers(), simd, this->getSlmTotalSize(), + NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelDescriptor.kernelAttributes.usesBarriers(), simd, this->getSlmTotalSize(), neoDevice->getRootDeviceEnvironment(), numThreadsPerSubSlice, localMemSize, - usesImages, false, kernelImmData->getDescriptor().kernelAttributes.flags.requiresDisabledEUFusion); + usesImages, false, kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion); NEO::computeWorkgroupSizeND(wsInfo, retGroupSize, workItems, dim); } else { if (1U == dim) { diff --git a/level_zero/core/source/module/module.h b/level_zero/core/source/module/module.h index 26deb7d7fa..4c83735af8 100644 --- a/level_zero/core/source/module/module.h +++ b/level_zero/core/source/module/module.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -15,6 +15,10 @@ struct _ze_module_handle_t {}; +namespace NEO { +struct KernelDescriptor; +} + namespace L0 { struct Device; struct ModuleBuildLog; @@ -48,7 +52,7 @@ struct Module : _ze_module_handle_t { virtual const KernelImmutableData *getKernelImmutableData(const char *kernelName) const = 0; virtual const std::vector> &getKernelImmutableDataVector() const = 0; - virtual uint32_t getMaxGroupSize() const = 
0; + virtual uint32_t getMaxGroupSize(const NEO::KernelDescriptor &kernelDescriptor) const = 0; virtual bool isDebugEnabled() const = 0; virtual bool shouldAllocatePrivateMemoryPerDispatch() const = 0; virtual uint32_t getProfileFlags() const = 0; diff --git a/level_zero/core/source/module/module_imp.cpp b/level_zero/core/source/module/module_imp.cpp index 94c02d7bbe..cce921bcd2 100644 --- a/level_zero/core/source/module/module_imp.cpp +++ b/level_zero/core/source/module/module_imp.cpp @@ -26,8 +26,10 @@ #include "shared/source/helpers/api_specific_config.h" #include "shared/source/helpers/compiler_product_helper.h" #include "shared/source/helpers/constants.h" +#include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/kernel_helpers.h" #include "shared/source/helpers/string.h" +#include "shared/source/kernel/kernel_descriptor.h" #include "shared/source/memory_manager/memory_manager.h" #include "shared/source/memory_manager/memory_operations_handler.h" #include "shared/source/memory_manager/unified_memory_manager.h" @@ -367,7 +369,6 @@ ze_result_t ModuleTranslationUnit::processUnpackedBinary() { size_t slmAvailable = 0U; NEO::DeviceInfoKernelPayloadConstants deviceInfoConstants; slmAvailable = static_cast(device->getDeviceInfo().localMemSize); - deviceInfoConstants.maxWorkGroupSize = static_cast(device->getDeviceInfo().maxWorkGroupSize); deviceInfoConstants.computeUnitsUsedForScratch = static_cast(device->getDeviceInfo().computeUnitsUsedForScratch); deviceInfoConstants.slmWindowSize = static_cast(device->getDeviceInfo().localMemSize); if (NEO::requiresLocalMemoryWindowVA(programInfo)) { @@ -390,6 +391,7 @@ ze_result_t ModuleTranslationUnit::processUnpackedBinary() { } for (auto &kernelInfo : this->programInfo.kernelInfos) { + deviceInfoConstants.maxWorkGroupSize = gfxCoreHelper.calculateMaxWorkGroupSize(kernelInfo->kernelDescriptor, static_cast(device->getDeviceInfo().maxWorkGroupSize)); kernelInfo->apply(deviceInfoConstants); } @@ -617,7 
+619,7 @@ ze_result_t ModuleImp::initialize(const ze_module_desc_t *desc, NEO::Device *neo registerElfInDebuggerL0(); - this->maxGroupSize = static_cast(neoDevice->getDeviceInfo().maxWorkGroupSize); + this->defaultMaxGroupSize = static_cast(neoDevice->getDeviceInfo().maxWorkGroupSize); checkIfPrivateMemoryPerDispatchIsNeeded(); @@ -709,6 +711,10 @@ const KernelImmutableData *ModuleImp::getKernelImmutableData(const char *kernelN return nullptr; } +uint32_t ModuleImp::getMaxGroupSize(const NEO::KernelDescriptor &kernelDescriptor) const { + return this->device->getGfxCoreHelper().calculateMaxWorkGroupSize(kernelDescriptor, this->defaultMaxGroupSize); +} + void ModuleImp::createBuildOptions(const char *pBuildFlags, std::string &apiOptions, std::string &internalBuildOptions) { if (pBuildFlags != nullptr) { std::string buildFlags(pBuildFlags); diff --git a/level_zero/core/source/module/module_imp.h b/level_zero/core/source/module/module_imp.h index e760b7efa1..974631809b 100644 --- a/level_zero/core/source/module/module_imp.h +++ b/level_zero/core/source/module/module_imp.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -20,9 +20,11 @@ #include namespace NEO { +struct KernelDescriptor; + namespace Debug { struct Segments; -} +} // namespace Debug } // namespace NEO namespace L0 { @@ -115,7 +117,7 @@ struct ModuleImp : public Module { const std::vector> &getKernelImmutableDataVector() const override { return kernelImmDatas; } - uint32_t getMaxGroupSize() const override { return maxGroupSize; } + uint32_t getMaxGroupSize(const NEO::KernelDescriptor &kernelDescriptor) const override; void createBuildOptions(const char *pBuildFlags, std::string &buildOptions, std::string &internalBuildOptions); bool moveOptLevelOption(std::string &dstOptionsSet, std::string &srcOptionSet); @@ -160,7 +162,7 @@ struct ModuleImp : public Module { std::unique_ptr translationUnit; ModuleBuildLog 
*moduleBuildLog = nullptr; NEO::GraphicsAllocation *exportedFunctionsSurface = nullptr; - uint32_t maxGroupSize = 0U; + uint32_t defaultMaxGroupSize = 0U; std::vector> kernelImmDatas; NEO::Linker::RelocatedSymbolsMap symbols; diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp index ff8fddf6c9..8832238931 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -10,6 +10,8 @@ #include "shared/source/command_container/implicit_scaling.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" +#include "level_zero/core/test/unit_tests/mocks/mock_module.h" + #include "gtest/gtest.h" namespace L0 { @@ -55,7 +57,8 @@ ModuleImmutableDataFixture::MockModule::MockModule(L0::Device *device, L0::ModuleType type, uint32_t perHwThreadPrivateMemorySize, MockImmutableData *inMockKernelImmData) : ModuleImp(device, moduleBuildLog, type), mockKernelImmData(inMockKernelImmData) { - mockKernelImmData->setDevice(device); + this->mockKernelImmData->setDevice(device); + this->translationUnit.reset(new MockModuleTranslationUnit(this->translationUnit.get())); } void ModuleImmutableDataFixture::MockModule::checkIfPrivateMemoryPerDispatchIsNeeded() { @@ -110,8 +113,8 @@ void ModuleImmutableDataFixture::tearDown() { DeviceFixture::tearDown(); } -Module *ModuleFixture::ProxyModuleImp::create(L0::Device *device, const ze_module_desc_t *desc, - ModuleBuildLog *moduleBuildLog, ModuleType type, ze_result_t *result) { +L0::Module *ModuleFixture::ProxyModuleImp::create(L0::Device *device, const ze_module_desc_t *desc, + ModuleBuildLog *moduleBuildLog, ModuleType type, ze_result_t *result) { auto module = new ProxyModuleImp(device, moduleBuildLog, type); *result = 
module->initialize(desc, device->getNEODevice()); diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.h b/level_zero/core/test/unit_tests/fixtures/module_fixture.h index 96f2c138c1..c52575b1c8 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.h @@ -51,9 +51,9 @@ struct ModuleImmutableDataFixture : public DeviceFixture { struct MockModule : public L0::ModuleImp { using ModuleImp::allocatePrivateMemoryPerDispatch; + using ModuleImp::defaultMaxGroupSize; using ModuleImp::getKernelImmutableDataVector; using ModuleImp::kernelImmDatas; - using ModuleImp::maxGroupSize; using ModuleImp::translationUnit; using ModuleImp::type; @@ -124,8 +124,8 @@ struct ModuleFixture : public DeviceFixture { return kernelImmDatas; } - static Module *create(L0::Device *device, const ze_module_desc_t *desc, - ModuleBuildLog *moduleBuildLog, ModuleType type, ze_result_t *result); + static L0::Module *create(L0::Device *device, const ze_module_desc_t *desc, + ModuleBuildLog *moduleBuildLog, ModuleType type, ze_result_t *result); }; void setUp(); diff --git a/level_zero/core/test/unit_tests/mocks/mock_module.h b/level_zero/core/test/unit_tests/mocks/mock_module.h index 1ca78fbcf6..6d6dc1adf8 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_module.h +++ b/level_zero/core/test/unit_tests/mocks/mock_module.h @@ -1,11 +1,13 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once +#include "shared/source/compiler_interface/external_functions.h" +#include "shared/source/program/kernel_info.h" #include "shared/test/common/mocks/mock_cif.h" #include "shared/test/common/mocks/mock_compiler_interface.h" #include "shared/test/common/test_macros/mock_method_macros.h" @@ -17,6 +19,52 @@ namespace L0 { namespace ult { +struct MockModuleTranslationUnit : public L0::ModuleTranslationUnit { + using BaseClass = 
L0::ModuleTranslationUnit; + + MockModuleTranslationUnit(L0::Device *device) : BaseClass{device} {} + + MockModuleTranslationUnit(L0::ModuleTranslationUnit *orig) : BaseClass{orig->device} { + std::swap(this->globalConstBuffer, orig->globalConstBuffer); + std::swap(this->globalVarBuffer, orig->globalVarBuffer); + std::swap(this->programInfo, orig->programInfo); + std::swap(this->options, orig->options); + std::swap(this->shouldSuppressRebuildWarning, orig->shouldSuppressRebuildWarning); + std::swap(this->buildLog, orig->buildLog); + std::swap(this->irBinary, orig->irBinary); + std::swap(this->irBinarySize, orig->irBinarySize); + std::swap(this->unpackedDeviceBinary, orig->unpackedDeviceBinary); + std::swap(this->unpackedDeviceBinarySize, orig->unpackedDeviceBinarySize); + std::swap(this->packedDeviceBinary, orig->packedDeviceBinary); + std::swap(this->packedDeviceBinarySize, orig->packedDeviceBinarySize); + std::swap(this->debugData, orig->debugData); + std::swap(this->debugDataSize, orig->debugDataSize); + std::swap(this->alignedvIsas, orig->alignedvIsas); + std::swap(this->specConstantsValues, orig->specConstantsValues); + std::swap(this->isBuiltIn, orig->isBuiltIn); + } + + ADDMETHOD(processUnpackedBinary, ze_result_t, true, ZE_RESULT_SUCCESS, (), ()); + + ze_result_t compileGenBinary(NEO::TranslationInput inputArgs, bool staticLink) override { + if (unpackedDeviceBinarySize && unpackedDeviceBinary) { + return ZE_RESULT_SUCCESS; + } else { + return ModuleTranslationUnit::compileGenBinary(inputArgs, staticLink); + } + } + + void setDummyKernelInfo() { + this->programInfo.kernelInfos.push_back(dummyKernelInfo.get()); + } + + std::unique_ptr dummyKernelInfo = {}; +}; + +constexpr inline MockModuleTranslationUnit *toMockPtr(L0::ModuleTranslationUnit *tu) { + return static_cast(tu); +} + template <> struct WhiteBox<::L0::Module> : public ::L0::ModuleImp { using BaseClass = ::L0::ModuleImp; @@ -30,11 +78,16 @@ struct WhiteBox<::L0::Module> : public ::L0::ModuleImp { 
using BaseClass::isFunctionSymbolExportEnabled; using BaseClass::isGlobalSymbolExportEnabled; using BaseClass::kernelImmDatas; - using BaseClass::maxGroupSize; using BaseClass::symbols; using BaseClass::translationUnit; using BaseClass::type; using BaseClass::unresolvedExternalsInfo; + uint32_t &maxGroupSize{BaseClass::defaultMaxGroupSize}; + + WhiteBox(Device *device, ModuleBuildLog *moduleBuildLog, ModuleType type) + : ::L0::ModuleImp{device, moduleBuildLog, type} { + this->translationUnit.reset(new MockModuleTranslationUnit{device}); + } }; using Module = WhiteBox<::L0::Module>; @@ -50,7 +103,7 @@ struct Mock : public Module { ADDMETHOD_NOBASE(getFunctionPointer, ze_result_t, ZE_RESULT_SUCCESS, (const char *pKernelName, void **pfnFunction)); ADDMETHOD_NOBASE(getNativeBinary, ze_result_t, ZE_RESULT_SUCCESS, (size_t * pSize, uint8_t *pModuleNativeBinary)); ADDMETHOD_CONST_NOBASE(getKernelImmutableData, const L0::KernelImmutableData *, nullptr, (const char *kernelName)); - ADDMETHOD_CONST_NOBASE(getMaxGroupSize, uint32_t, 256, ()); + ADDMETHOD_CONST_NOBASE(getMaxGroupSize, uint32_t, 256, (const NEO::KernelDescriptor &)); ADDMETHOD_NOBASE(getKernelNames, ze_result_t, ZE_RESULT_SUCCESS, (uint32_t * pCount, const char **pNames)); ADDMETHOD_NOBASE(performDynamicLink, ze_result_t, ZE_RESULT_SUCCESS, (uint32_t numModules, ze_module_handle_t *phModules, ze_module_build_log_handle_t *phLinkLog)); @@ -59,23 +112,6 @@ struct Mock : public Module { ADDMETHOD_CONST_NOBASE(isDebugEnabled, bool, false, ()); }; -struct MockModuleTranslationUnit : public L0::ModuleTranslationUnit { - MockModuleTranslationUnit(L0::Device *device) : L0::ModuleTranslationUnit(device) { - } - - ze_result_t processUnpackedBinary() override { - return ZE_RESULT_SUCCESS; - } - - ze_result_t compileGenBinary(NEO::TranslationInput inputArgs, bool staticLink) override { - if (unpackedDeviceBinarySize && unpackedDeviceBinary) { - return ZE_RESULT_SUCCESS; - } else { - return 
ModuleTranslationUnit::compileGenBinary(inputArgs, staticLink); - } - } -}; - struct MockModule : public L0::ModuleImp { using ModuleImp::debugEnabled; using ModuleImp::debugModuleHandle; @@ -86,11 +122,13 @@ struct MockModule : public L0::ModuleImp { using ModuleImp::populateHostGlobalSymbolsMap; using ModuleImp::symbols; using ModuleImp::translationUnit; + uint32_t &maxGroupSize = ModuleImp::defaultMaxGroupSize; MockModule(L0::Device *device, L0::ModuleBuildLog *moduleBuildLog, L0::ModuleType type) : ModuleImp(device, moduleBuildLog, type) { - maxGroupSize = 32; + this->translationUnit.reset(new MockModuleTranslationUnit{device}); + this->maxGroupSize = 32u; }; ~MockModule() override = default; diff --git a/level_zero/core/test/unit_tests/sources/debugger/test_module_with_debug.cpp b/level_zero/core/test/unit_tests/sources/debugger/test_module_with_debug.cpp index de8a1fbe80..7d1aa79967 100644 --- a/level_zero/core/test/unit_tests/sources/debugger/test_module_with_debug.cpp +++ b/level_zero/core/test/unit_tests/sources/debugger/test_module_with_debug.cpp @@ -181,6 +181,8 @@ TEST_F(ModuleWithSLDTest, GivenNoDebugDataWhenInitializingModuleThenRelocatedDeb moduleBuildLog, ModuleType::User); module->translationUnit = std::make_unique(device); + auto mockTranslationUnit = toMockPtr(module->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); @@ -203,6 +205,7 @@ TEST_F(ModuleWithSLDTest, GivenNoDebugDataWhenInitializingModuleThenRelocatedDeb result = module->initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_EQ(nullptr, kernelInfo->kernelDescriptor.external.relocatedDebugData); } @@ -222,6 +225,8 @@ TEST_F(ModuleWithSLDTest, GivenDebugDataWithSingleRelocationWhenInitializingModu std::unique_ptr moduleMock = std::make_unique(device, moduleBuildLog, ModuleType::User); 
moduleMock->translationUnit = std::make_unique(device); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); @@ -252,6 +257,7 @@ TEST_F(ModuleWithSLDTest, GivenDebugDataWithSingleRelocationWhenInitializingModu result = moduleMock->initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_EQ(nullptr, kernelInfo->kernelDescriptor.external.relocatedDebugData); } @@ -271,6 +277,8 @@ TEST_F(ModuleWithSLDTest, GivenDebugDataWithMultipleRelocationsWhenInitializingM std::unique_ptr moduleMock = std::make_unique(device, moduleBuildLog, ModuleType::User); moduleMock->translationUnit = std::make_unique(device); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); @@ -298,6 +306,7 @@ TEST_F(ModuleWithSLDTest, GivenDebugDataWithMultipleRelocationsWhenInitializingM result = moduleMock->initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_NE(nullptr, kernelInfo->kernelDescriptor.external.relocatedDebugData); } @@ -444,6 +453,8 @@ HWTEST_F(ModuleWithDebuggerL0MultiTileTest, GivenSubDeviceWhenCreatingModuleThen std::unique_ptr moduleMock = std::make_unique(subDevice0, moduleBuildLog, ModuleType::User); moduleMock->translationUnit = std::make_unique(subDevice0); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); @@ -471,6 +482,7 @@ HWTEST_F(ModuleWithDebuggerL0MultiTileTest, GivenSubDeviceWhenCreatingModuleThen EXPECT_EQ(1u, debuggerL0Hw->notifyModuleCreateCount); 
EXPECT_EQ(subDevice0->getNEODevice(), debuggerL0Hw->notifyModuleLoadAllocationsCapturedDevice); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_EQ(1, memoryOperationsHandler->makeResidentCalledCount); } @@ -488,6 +500,8 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenDebugDataWithRelocationsWhenInitializing std::unique_ptr moduleMock = std::make_unique(device, moduleBuildLog, ModuleType::User); moduleMock->translationUnit = std::make_unique(device); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); @@ -515,6 +529,7 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenDebugDataWithRelocationsWhenInitializing EXPECT_EQ(1u, getMockDebuggerL0Hw()->registerElfAndLinkCount); EXPECT_EQ(1u, getMockDebuggerL0Hw()->notifyModuleCreateCount); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_NE(nullptr, kernelInfo->kernelDescriptor.external.relocatedDebugData.get()); EXPECT_EQ(reinterpret_cast(kernelInfo->kernelDescriptor.external.relocatedDebugData.get()), getMockDebuggerL0Hw()->lastReceivedElf); } @@ -542,9 +557,11 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenBuiltinModuleWhenInitializingModuleThenM kernelMock.module = moduleMock.get(); kernelMock.immutableData.kernelInfo = kernelInfo; kernelInfo->kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful = 0; + kernelInfo->kernelDescriptor.external.debugData = std::make_unique(); moduleMock->kernelImmData = &kernelMock.immutableData; moduleMock->translationUnit->programInfo.kernelInfos.push_back(kernelInfo); - kernelInfo->kernelDescriptor.external.debugData = std::make_unique(); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; auto debugData = MockElfEncoder<>::createRelocateableDebugDataElf(); 
kernelInfo->kernelDescriptor.external.debugData->vIsaSize = static_cast(debugData.size()); @@ -557,6 +574,7 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenBuiltinModuleWhenInitializingModuleThenM EXPECT_EQ(0u, getMockDebuggerL0Hw()->registerElfAndLinkCount); EXPECT_EQ(0u, getMockDebuggerL0Hw()->notifyModuleCreateCount); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_NE(nullptr, kernelInfo->kernelDescriptor.external.relocatedDebugData.get()); EXPECT_EQ(nullptr, getMockDebuggerL0Hw()->lastReceivedElf); } @@ -575,6 +593,8 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenDebugDataWithoutRelocationsWhenInitializ std::unique_ptr moduleMock = std::make_unique(device, moduleBuildLog, ModuleType::User); moduleMock->translationUnit = std::make_unique(device); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); @@ -608,6 +628,7 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenDebugDataWithoutRelocationsWhenInitializ EXPECT_EQ(1u, getMockDebuggerL0Hw()->registerElfAndLinkCount); EXPECT_EQ(1u, getMockDebuggerL0Hw()->notifyModuleCreateCount); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_EQ(nullptr, kernelInfo->kernelDescriptor.external.relocatedDebugData.get()); EXPECT_EQ(kernelInfo->kernelDescriptor.external.debugData->vIsa, getMockDebuggerL0Hw()->lastReceivedElf); } @@ -626,6 +647,8 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenNoDebugDataWhenInitializingModuleThenDoN std::unique_ptr moduleMock = std::make_unique(device, moduleBuildLog, ModuleType::User); moduleMock->translationUnit = std::make_unique(device); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); @@ -642,6 +665,7 @@ HWTEST_F(ModuleWithDebuggerL0Test, 
GivenNoDebugDataWhenInitializingModuleThenDoN EXPECT_EQ(0u, getMockDebuggerL0Hw()->registerElfCount); EXPECT_EQ(moduleMock->initialize(&moduleDesc, neoDevice), ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_EQ(0u, getMockDebuggerL0Hw()->registerElfCount); EXPECT_EQ(1u, getMockDebuggerL0Hw()->notifyModuleCreateCount); } @@ -699,9 +723,12 @@ HWTEST_F(ModuleWithZebinAndL0DebuggerTest, GivenZebinNoDebugDataWhenInitializing std::unique_ptr moduleMock = std::make_unique(device, nullptr, ModuleType::User); moduleMock->translationUnit = std::make_unique(device); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; EXPECT_EQ(0u, getMockDebuggerL0Hw()->registerElfCount); EXPECT_EQ(moduleMock->initialize(&moduleDesc, neoDevice), ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_EQ(0u, getMockDebuggerL0Hw()->registerElfCount); EXPECT_EQ(0u, getMockDebuggerL0Hw()->notifyModuleCreateCount); } @@ -779,6 +806,8 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenNonZebinBinaryWhenDestroyModuleThenModul moduleMock->kernelImmData = &kernelMock.immutableData; moduleMock->translationUnit->programInfo.kernelInfos.push_back(kernelInfo); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; kernelInfo->kernelDescriptor.external.debugData = std::make_unique(); @@ -795,6 +824,7 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenNonZebinBinaryWhenDestroyModuleThenModul kernelInfo->kernelDescriptor.external.debugData->genIsaSize = 0; EXPECT_EQ(moduleMock->initialize(&moduleDesc, neoDevice), ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); moduleMock->destroy(); moduleMock.release(); EXPECT_EQ(1u, getMockDebuggerL0Hw()->notifyModuleDestroyCount); @@ -814,6 +844,8 @@ HWTEST_F(ModuleWithDebuggerL0Test, 
GivenNoDebugDataWhenDestroyingModuleThenNotif std::unique_ptr moduleMock = std::make_unique(device, moduleBuildLog, ModuleType::User); moduleMock->translationUnit = std::make_unique(device); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); @@ -829,6 +861,7 @@ HWTEST_F(ModuleWithDebuggerL0Test, GivenNoDebugDataWhenDestroyingModuleThenNotif moduleMock->translationUnit->programInfo.kernelInfos.push_back(kernelInfo); EXPECT_EQ(moduleMock->initialize(&moduleDesc, neoDevice), ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); moduleMock->destroy(); moduleMock.release(); EXPECT_EQ(1u, getMockDebuggerL0Hw()->notifyModuleDestroyCount); @@ -853,7 +886,6 @@ HWTEST_F(ModuleWithZebinAndL0DebuggerTest, GivenModuleDebugHandleZeroWhenInitial auto kernelImmutableData = ::std::make_unique(device); kernelImmutableData->initialize(kernelInfo.get(), device, 0, nullptr, nullptr, false); std::unique_ptr moduleMock = std::make_unique(device, nullptr, ModuleType::User); - moduleMock->translationUnit = std::make_unique(device); moduleMock->kernelImmDatas.push_back(std::move(kernelImmutableData)); auto zebin = ZebinTestData::ValidEmptyProgram<>(); diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index da04e3a7f2..72cb80ac03 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -331,7 +331,7 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeDisabledWhenSet EXPECT_EQ(nullptr, mockKernel.perThreadDataForWholeThreadGroup); } -TEST_F(KernelImpSetGroupSizeTest, givenIncorrectGroupSizeWhenSettingGroupSizeThenInvalidGroupSizeDimensionErrorIsReturned) { +TEST_F(KernelImpSetGroupSizeTest, 
givenIncorrectGroupSizeDimensionWhenSettingGroupSizeThenInvalidGroupSizeDimensionErrorIsReturned) { Mock mockKernel; Mock mockModule(this->device, nullptr); for (auto i = 0u; i < 3u; i++) { @@ -800,6 +800,8 @@ TEST_F(KernelImmutableDataTests, givenInternalModuleWhenKernelIsCreatedIsaIsNotC std::unique_ptr moduleMock = std::make_unique(device, moduleBuildLog, ModuleType::Builtin); moduleMock->translationUnit = std::make_unique(device); moduleMock->translationUnit->programInfo.linkerInput = std::move(linkerInput); + auto mockTranslationUnit = toMockPtr(moduleMock->translationUnit.get()); + mockTranslationUnit->processUnpackedBinaryCallBase = false; uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); @@ -821,6 +823,7 @@ TEST_F(KernelImmutableDataTests, givenInternalModuleWhenKernelIsCreatedIsaIsNotC ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = moduleMock->initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); size_t expectedPreviouscopyMemoryToAllocationCalledTimes = previouscopyMemoryToAllocationCalledTimes; EXPECT_EQ(expectedPreviouscopyMemoryToAllocationCalledTimes, mockMemoryManager->copyMemoryToAllocationCalledTimes); @@ -941,7 +944,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenRayTracingIsInitialized ModuleType::User, 32u, mockKernelImmutableData.get()); - module->maxGroupSize = 10; + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -983,7 +986,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndPatchTokenPointerSizeIsZ ModuleType::User, 32u, mockKernelImmutableData.get()); - module->maxGroupSize = 10; + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1024,7 +1027,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndNoRTDispatchGlobalsIs ModuleType::User, 32u, mockKernelImmutableData.get()); - 
module->maxGroupSize = 10; + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1066,7 +1069,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTStackAllocationFail ModuleType::User, 32u, mockKernelImmutableData.get()); - module->maxGroupSize = 10; + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1109,7 +1112,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTDispatchGlobalsArra ModuleType::User, 32u, mockKernelImmutableData.get()); - module->maxGroupSize = 10; + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1144,7 +1147,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsFalseThenRayTracingIsNotInitial ModuleType::User, 32u, mockKernelImmutableData.get()); - module->maxGroupSize = 10; + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1185,7 +1188,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenCrossThreadDataIsPatche ModuleType::User, 32u, mockKernelImmutableData.get()); - module->maxGroupSize = 10; + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); diff --git a/level_zero/core/test/unit_tests/sources/module/test_module.cpp b/level_zero/core/test/unit_tests/sources/module/test_module.cpp index d63f216345..7fa47d7800 100644 --- a/level_zero/core/test/unit_tests/sources/module/test_module.cpp +++ b/level_zero/core/test/unit_tests/sources/module/test_module.cpp @@ -589,9 +589,11 @@ struct ModuleSpecConstantsFixture : public DeviceFixture { auto module = new Module(device, nullptr, ModuleType::User); module->translationUnit.reset(mockTranslationUnit); + mockTranslationUnit->processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = module->initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); 
+ EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); for (uint32_t i = 0; i < mockCompiler->moduleNumSpecConstants / 2; i++) { EXPECT_EQ(static_cast(module->translationUnit->specConstantsValues[mockCompiler->moduleSpecConstantsIds[2 * i]]), static_cast(mockCompiler->moduleSpecConstantsValuesT2[i])); EXPECT_EQ(static_cast(module->translationUnit->specConstantsValues[mockCompiler->moduleSpecConstantsIds[2 * i + 1]]), static_cast(mockCompiler->moduleSpecConstantsValuesT1[i])); @@ -646,9 +648,11 @@ struct ModuleSpecConstantsFixture : public DeviceFixture { auto module = new Module(device, nullptr, ModuleType::User); module->translationUnit.reset(mockTranslationUnit); + mockTranslationUnit->processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = module->initialize(&combinedModuleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); for (uint32_t i = 0; i < mockCompiler->moduleNumSpecConstants / 2; i++) { EXPECT_EQ(static_cast(module->translationUnit->specConstantsValues[mockCompiler->moduleSpecConstantsIds[2 * i]]), static_cast(mockCompiler->moduleSpecConstantsValuesT2[i])); EXPECT_EQ(static_cast(module->translationUnit->specConstantsValues[mockCompiler->moduleSpecConstantsIds[2 * i + 1]]), static_cast(mockCompiler->moduleSpecConstantsValuesT1[i])); @@ -937,6 +941,7 @@ struct ModuleStaticLinkFixture : public DeviceFixture { auto rootDeviceEnvironment = neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[0].get(); rootDeviceEnvironment->compilerInterface.reset(mockCompiler); mockTranslationUnit = new MockModuleTranslationUnit(device); + mockTranslationUnit->processUnpackedBinaryCallBase = false; loadModules(testMultiple); @@ -955,6 +960,7 @@ struct ModuleStaticLinkFixture : public DeviceFixture { ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = module->initialize(&combinedModuleDesc, neoDevice); 
EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); module->destroy(); } void runSprivLinkBuildWithOneModule() { @@ -963,6 +969,7 @@ struct ModuleStaticLinkFixture : public DeviceFixture { auto rootDeviceEnvironment = neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[0].get(); rootDeviceEnvironment->compilerInterface.reset(mockCompiler); mockTranslationUnit = new MockModuleTranslationUnit(device); + mockTranslationUnit->processUnpackedBinaryCallBase = false; loadModules(testSingle); @@ -979,6 +986,7 @@ struct ModuleStaticLinkFixture : public DeviceFixture { ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = module->initialize(&combinedModuleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); module->destroy(); } std::unique_ptr zebinData; @@ -1045,6 +1053,7 @@ HWTEST_F(ModuleLinkingTest, givenFailureDuringLinkingWhenCreatingModuleThenModul rootDeviceEnvironment->compilerInterface.reset(mockCompiler); auto mockTranslationUnit = new MockModuleTranslationUnit(device); + mockTranslationUnit->processUnpackedBinaryCallBase = false; auto linkerInput = std::make_unique<::WhiteBox>(); linkerInput->valid = false; @@ -1063,6 +1072,7 @@ HWTEST_F(ModuleLinkingTest, givenFailureDuringLinkingWhenCreatingModuleThenModul ze_result_t result = ZE_RESULT_SUCCESS; result = module.initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_ERROR_MODULE_LINK_FAILURE); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); } HWTEST_F(ModuleLinkingTest, givenRemainingUnresolvedSymbolsDuringLinkingWhenCreatingModuleThenModuleIsNotLinkedFully) { @@ -1071,6 +1081,7 @@ HWTEST_F(ModuleLinkingTest, givenRemainingUnresolvedSymbolsDuringLinkingWhenCrea rootDeviceEnvironment->compilerInterface.reset(mockCompiler); auto mockTranslationUnit = new MockModuleTranslationUnit(device); + mockTranslationUnit->processUnpackedBinaryCallBase = 
false; auto linkerInput = std::make_unique<::WhiteBox>(); @@ -1093,6 +1104,7 @@ HWTEST_F(ModuleLinkingTest, givenRemainingUnresolvedSymbolsDuringLinkingWhenCrea ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = module.initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_FALSE(module.isFullyLinked); } @@ -1102,6 +1114,7 @@ HWTEST_F(ModuleLinkingTest, givenModuleCompiledThenCachingIsTrue) { rootDeviceEnvironment->compilerInterface.reset(mockCompiler); auto mockTranslationUnit = new MockModuleTranslationUnit(device); + mockTranslationUnit->processUnpackedBinaryCallBase = false; auto linkerInput = std::make_unique<::WhiteBox>(); @@ -1126,6 +1139,7 @@ HWTEST_F(ModuleLinkingTest, givenModuleCompiledThenCachingIsTrue) { ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = module.initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_TRUE(mockCompiler->cachingPassed); } @@ -2304,10 +2318,14 @@ HWTEST_F(ModuleTranslationUnitTest, WhenCreatingFromNativeBinaryThenSetsUpPacked target.maxPointerSizeInBytes = programTokens.header->GPUPointerSizeInBytes; auto arData = encoder.encode(); - L0::ModuleTranslationUnit moduleTuValid(this->device); + auto moduleTuValid = MockModuleTranslationUnit{this->device}; + moduleTuValid.processUnpackedBinaryCallBase = false; + moduleTuValid.setDummyKernelInfo(); + ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = moduleTuValid.createFromNativeBinary(reinterpret_cast(arData.data()), arData.size()); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTuValid.processUnpackedBinaryCalled, 1u); EXPECT_NE(moduleTuValid.packedDeviceBinarySize, arData.size()); } @@ -2326,6 +2344,113 @@ HWTEST_F(ModuleTranslationUnitTest, WhenCreatingFromZebinThenAppendAllowZebinFla EXPECT_STREQ(expectedOptions.c_str(), 
moduleTu.options.c_str()); } +HWTEST2_F(ModuleTranslationUnitTest, givenLargeGrfAndSimd16WhenProcessingBinaryThenKernelGroupSizeReducedToFitWithinSubslice, IsWithinXeGfxFamily) { + std::string validZeInfo = std::string("version :\'") + versionToString(zeInfoDecoderVersion) + R"===(' +kernels: + - name : kernel_with_default_maxWGS + execution_env : + simd_size : 8 + grf_count: )===" + + std::to_string(GrfConfig::DefaultGrfNumber) + R"===( + - name : kernel_with_reduced_maxWGS + execution_env : + simd_size : 16 + grf_count: )===" + + std::to_string(GrfConfig::LargeGrfNumber) + "\n"; + + uint8_t kernelIsa[8]{0U}; + ZebinTestData::ValidEmptyProgram zebin; + zebin.removeSection(NEO::Elf::SHT_ZEBIN::SHT_ZEBIN_ZEINFO, NEO::Elf::SectionsNamesZebin::zeInfo); + zebin.appendSection(NEO::Elf::SHT_ZEBIN::SHT_ZEBIN_ZEINFO, NEO::Elf::SectionsNamesZebin::zeInfo, ArrayRef::fromAny(validZeInfo.data(), validZeInfo.size())); + zebin.appendSection(NEO::Elf::SHT_PROGBITS, NEO::Elf::SectionsNamesZebin::textPrefix.str() + "kernel_with_default_maxWGS", {kernelIsa, sizeof(kernelIsa)}); + zebin.appendSection(NEO::Elf::SHT_PROGBITS, NEO::Elf::SectionsNamesZebin::textPrefix.str() + "kernel_with_reduced_maxWGS", {kernelIsa, sizeof(kernelIsa)}); + zebin.elfHeader->machine = this->device->getNEODevice()->getHardwareInfo().platform.eProductFamily; + + MockModule mockModule{this->device, nullptr, ModuleType::User}; + mockModule.maxGroupSize = static_cast(this->device->getDeviceInfo().maxWorkGroupSize); + auto mockTU = mockModule.translationUnit.get(); + auto result = mockTU->createFromNativeBinary(reinterpret_cast(zebin.storage.data()), zebin.storage.size()); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + + auto &defaultKernelDescriptor = mockTU->programInfo.kernelInfos[0]->kernelDescriptor; + auto &reducedKernelDescriptor = mockTU->programInfo.kernelInfos[1]->kernelDescriptor; + EXPECT_EQ(mockModule.getMaxGroupSize(defaultKernelDescriptor), mockModule.maxGroupSize); + 
EXPECT_EQ(mockModule.getMaxGroupSize(reducedKernelDescriptor), (mockModule.maxGroupSize >> 1)); + + uint32_t groupSize[3] = {8, 4, (mockModule.maxGroupSize >> 5)}; // default max WGS + Mock defaultKernel; + defaultKernel.module = &mockModule; + defaultKernel.descriptor.kernelAttributes = defaultKernelDescriptor.kernelAttributes; + EXPECT_EQ(ZE_RESULT_SUCCESS, defaultKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2])); + + Mock reducedKernel; + reducedKernel.module = &mockModule; + reducedKernel.descriptor.kernelAttributes = reducedKernelDescriptor.kernelAttributes; + EXPECT_EQ(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION, reducedKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2])); + groupSize[2] >>= 2; // align to max WGS reduced due to SIMD16 + LargeGrf + EXPECT_EQ(ZE_RESULT_SUCCESS, reducedKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2])); +} + +HWTEST2_F(ModuleTranslationUnitTest, givenLargeGrfAndSimd16WhenProcessingBinaryThenSuggestedKernelGroupSizeFitsWithinSubslice, IsWithinXeGfxFamily) { + std::string validZeInfo = std::string("version :\'") + versionToString(zeInfoDecoderVersion) + R"===(' +kernels: + - name : kernel_with_default_maxWGS + execution_env : + simd_size : 8 + grf_count: )===" + + std::to_string(GrfConfig::DefaultGrfNumber) + R"===( + - name : kernel_with_reduced_maxWGS + execution_env : + simd_size : 16 + grf_count: )===" + + std::to_string(GrfConfig::LargeGrfNumber) + "\n"; + + uint8_t kernelIsa[8]{0U}; + ZebinTestData::ValidEmptyProgram zebin; + zebin.removeSection(NEO::Elf::SHT_ZEBIN::SHT_ZEBIN_ZEINFO, NEO::Elf::SectionsNamesZebin::zeInfo); + zebin.appendSection(NEO::Elf::SHT_ZEBIN::SHT_ZEBIN_ZEINFO, NEO::Elf::SectionsNamesZebin::zeInfo, ArrayRef::fromAny(validZeInfo.data(), validZeInfo.size())); + zebin.appendSection(NEO::Elf::SHT_PROGBITS, NEO::Elf::SectionsNamesZebin::textPrefix.str() + "kernel_with_default_maxWGS", {kernelIsa, sizeof(kernelIsa)}); + zebin.appendSection(NEO::Elf::SHT_PROGBITS, 
NEO::Elf::SectionsNamesZebin::textPrefix.str() + "kernel_with_reduced_maxWGS", {kernelIsa, sizeof(kernelIsa)}); + zebin.elfHeader->machine = this->device->getNEODevice()->getHardwareInfo().platform.eProductFamily; + + MockModule mockModule{this->device, nullptr, ModuleType::User}; + mockModule.maxGroupSize = static_cast(device->getDeviceInfo().maxWorkGroupSize); + auto mockTU = mockModule.translationUnit.get(); + auto result = mockTU->createFromNativeBinary(reinterpret_cast(zebin.storage.data()), zebin.storage.size()); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + + auto &defaultKernelDescriptor = mockTU->programInfo.kernelInfos[0]->kernelDescriptor; + auto &reducedKernelDescriptor = mockTU->programInfo.kernelInfos[1]->kernelDescriptor; + EXPECT_EQ(mockModule.getMaxGroupSize(defaultKernelDescriptor), mockModule.maxGroupSize); + EXPECT_EQ(mockModule.getMaxGroupSize(reducedKernelDescriptor), (mockModule.maxGroupSize >> 1)); + + uint32_t groupSize[3] = {0u, 0u, 0u}; + Mock defaultKernel; + defaultKernel.module = &mockModule; + defaultKernel.descriptor.kernelAttributes = defaultKernelDescriptor.kernelAttributes; + EXPECT_EQ(ZE_RESULT_SUCCESS, defaultKernel.suggestGroupSize(4096u, 4096u, 4096u, &groupSize[0], &groupSize[1], &groupSize[2])); + EXPECT_GT(groupSize[0] * groupSize[1] * groupSize[2], 0u); + EXPECT_LE(groupSize[0] * groupSize[1] * groupSize[2], mockModule.maxGroupSize); + + groupSize[0] = groupSize[1] = groupSize[2] = 0u; + EXPECT_EQ(ZE_RESULT_SUCCESS, defaultKernel.suggestGroupSize(mockModule.maxGroupSize, 1u, 1u, &groupSize[0], &groupSize[1], &groupSize[2])); + EXPECT_GT(groupSize[0] * groupSize[1] * groupSize[2], 0u); + EXPECT_LE(groupSize[0] * groupSize[1] * groupSize[2], mockModule.maxGroupSize); + + groupSize[0] = groupSize[1] = groupSize[2] = 0u; + Mock reducedKernel; + reducedKernel.module = &mockModule; + reducedKernel.descriptor.kernelAttributes = reducedKernelDescriptor.kernelAttributes; + EXPECT_EQ(ZE_RESULT_SUCCESS, 
reducedKernel.suggestGroupSize(mockModule.maxGroupSize, 1u, 1u, &groupSize[0], &groupSize[1], &groupSize[2])); + EXPECT_GT(groupSize[0] * groupSize[1] * groupSize[2], 0u); + EXPECT_LE(groupSize[0] * groupSize[1] * groupSize[2], mockModule.getMaxGroupSize(reducedKernelDescriptor)); + + groupSize[0] = groupSize[1] = groupSize[2] = 0u; + EXPECT_EQ(ZE_RESULT_SUCCESS, reducedKernel.suggestGroupSize(4096u, 4096u, 4096u, &groupSize[0], &groupSize[1], &groupSize[2])); + EXPECT_GT(groupSize[0] * groupSize[1] * groupSize[2], 0u); + EXPECT_LE(groupSize[0] * groupSize[1] * groupSize[2], mockModule.getMaxGroupSize(reducedKernelDescriptor)); +} + TEST_F(ModuleTranslationUnitTest, WhenCreatingFromZeBinaryAndGlobalsAreExportedThenTheirAllocationTypeIsUSMDevice) { std::string zeInfo = std::string("version :\'") + versionToString(zeInfoDecoderVersion) + R"===(' kernels: @@ -2476,9 +2601,11 @@ HWTEST_F(ModuleTranslationUnitTest, WhenBuildOptionsAreNullThenReuseExistingOpti DebugManager.flags.DisableStatelessToStatefulOptimization.set(1); MockModuleTranslationUnit moduleTu(this->device); + moduleTu.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = moduleTu.buildFromSpirV("", 0U, nullptr, "", nullptr); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTu.processUnpackedBinaryCalled, 1u); EXPECT_NE(pMockCompilerInterface->inputInternalOptions.find("cl-intel-greater-than-4GB-buffer-required"), std::string::npos); } @@ -2487,10 +2614,12 @@ HWTEST_F(ModuleTranslationUnitTest, givenInternalOptionsThenLSCCachePolicyIsSet) auto &rootDeviceEnvironment = this->neoDevice->executionEnvironment->rootDeviceEnvironments[this->neoDevice->getRootDeviceIndex()]; rootDeviceEnvironment->compilerInterface.reset(pMockCompilerInterface); MockModuleTranslationUnit moduleTu(this->device); + moduleTu.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = moduleTu.buildFromSpirV("", 0U, nullptr, 
"", nullptr); const auto &compilerProductHelper = rootDeviceEnvironment->getHelper(); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTu.processUnpackedBinaryCalled, 1u); auto expectedPolicy = compilerProductHelper.getCachingPolicyOptions(false); if (expectedPolicy != nullptr) { EXPECT_NE(pMockCompilerInterface->inputInternalOptions.find(expectedPolicy), std::string::npos); @@ -2507,9 +2636,11 @@ HWTEST2_F(ModuleTranslationUnitTest, givenDebugFlagSetToWbWhenGetInternalOptions auto &rootDeviceEnvironment = this->neoDevice->executionEnvironment->rootDeviceEnvironments[this->neoDevice->getRootDeviceIndex()]; rootDeviceEnvironment->compilerInterface.reset(pMockCompilerInterface); MockModuleTranslationUnit moduleTu(this->device); + moduleTu.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = moduleTu.buildFromSpirV("", 0U, nullptr, "", nullptr); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTu.processUnpackedBinaryCalled, 1u); EXPECT_NE(pMockCompilerInterface->inputInternalOptions.find("-cl-store-cache-default=7 -cl-load-cache-default=4"), std::string::npos); } @@ -2521,9 +2652,11 @@ HWTEST2_F(ModuleTranslationUnitTest, givenDebugFlagSetForceAllResourcesUncachedW auto &rootDeviceEnvironment = this->neoDevice->executionEnvironment->rootDeviceEnvironments[this->neoDevice->getRootDeviceIndex()]; rootDeviceEnvironment->compilerInterface.reset(pMockCompilerInterface); MockModuleTranslationUnit moduleTu(this->device); + moduleTu.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = moduleTu.buildFromSpirV("", 0U, nullptr, "", nullptr); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTu.processUnpackedBinaryCalled, 1u); EXPECT_NE(pMockCompilerInterface->inputInternalOptions.find("-cl-store-cache-default=1 -cl-load-cache-default=1"), std::string::npos); } @@ -2532,9 +2665,11 @@ HWTEST2_F(ModuleTranslationUnitTest, 
givenAtLeastXeHpgCoreWhenGetInternalOptions auto &rootDeviceEnvironment = this->neoDevice->executionEnvironment->rootDeviceEnvironments[this->neoDevice->getRootDeviceIndex()]; rootDeviceEnvironment->compilerInterface.reset(pMockCompilerInterface); MockModuleTranslationUnit moduleTu(this->device); + moduleTu.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = moduleTu.buildFromSpirV("", 0U, nullptr, "", nullptr); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTu.processUnpackedBinaryCalled, 1u); EXPECT_NE(pMockCompilerInterface->inputInternalOptions.find("-cl-store-cache-default=2 -cl-load-cache-default=4"), std::string::npos); } @@ -2544,9 +2679,11 @@ HWTEST_F(ModuleTranslationUnitTest, givenForceToStatelessRequiredWhenBuildingMod rootDeviceEnvironment->compilerInterface.reset(mockCompilerInterface); MockModuleTranslationUnit moduleTu(device); + moduleTu.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = moduleTu.buildFromSpirV("", 0U, nullptr, "", nullptr); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTu.processUnpackedBinaryCalled, 1u); const auto &compilerProductHelper = rootDeviceEnvironment->getHelper(); if (compilerProductHelper.isForceToStatelessRequired()) { @@ -2587,11 +2724,13 @@ HWTEST2_F(ModuleTranslationUnitTest, givenSourceLevelDebuggerAndAllowZebinBuildO rootDeviceEnvironment->compilerInterface.reset(mockCompilerInterface); MockModuleTranslationUnit moduleTu(device); + moduleTu.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_SUCCESS; auto buildOption = NEO::CompilerOptions::allowZebin.str(); result = moduleTu.buildFromSpirV("", 0U, buildOption.c_str(), "", nullptr); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTu.processUnpackedBinaryCalled, 1u); EXPECT_NE(mockCompilerInterface->receivedApiOptions.find(NEO::CompilerOptions::allowZebin.str()), std::string::npos); 
EXPECT_EQ(mockCompilerInterface->inputInternalOptions.find(NEO::CompilerOptions::disableZebin.str()), std::string::npos); } @@ -2602,11 +2741,13 @@ HWTEST_F(ModuleTranslationUnitTest, givenAllowZebinBuildOptionWhenBuildWithSpirv rootDeviceEnvironment->compilerInterface.reset(mockCompilerInterface); MockModuleTranslationUnit moduleTu(device); + moduleTu.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; auto buildOption = NEO::CompilerOptions::allowZebin.str(); result = moduleTu.buildFromSpirV("", 0U, buildOption.c_str(), "", nullptr); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTu.processUnpackedBinaryCalled, 1u); EXPECT_NE(mockCompilerInterface->receivedApiOptions.find(NEO::CompilerOptions::allowZebin.str()), std::string::npos); EXPECT_EQ(mockCompilerInterface->inputInternalOptions.find(NEO::CompilerOptions::disableZebin.str()), std::string::npos); } @@ -2623,10 +2764,12 @@ HWTEST_F(ModuleTranslationUnitTest, givenSourceLevelDebuggerWhenBuildWithSpirvTh rootDeviceEnvironment->compilerInterface.reset(mockCompilerInterface); MockModuleTranslationUnit moduleTu(device); + moduleTu.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = moduleTu.buildFromSpirV("", 0U, nullptr, "", nullptr); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(moduleTu.processUnpackedBinaryCalled, 1u); } TEST(ModuleBuildLog, WhenGreaterBufferIsPassedToGetStringThenOutputSizeIsOverridden) { diff --git a/level_zero/core/test/unit_tests/sources/module/test_module_2.cpp b/level_zero/core/test/unit_tests/sources/module/test_module_2.cpp index 393b337c17..cc26037fb0 100644 --- a/level_zero/core/test/unit_tests/sources/module/test_module_2.cpp +++ b/level_zero/core/test/unit_tests/sources/module/test_module_2.cpp @@ -299,6 +299,7 @@ TEST_F(ModuleTests, givenLargeGrfFlagSetWhenCreatingModuleThenOverrideInternalFl moduleDesc.inputSize = src.size(); auto mockTranslationUnit = new 
MockModuleTranslationUnit(device); + mockTranslationUnit->processUnpackedBinaryCallBase = false; Module module(device, nullptr, ModuleType::User); module.translationUnit.reset(mockTranslationUnit); @@ -306,6 +307,7 @@ TEST_F(ModuleTests, givenLargeGrfFlagSetWhenCreatingModuleThenOverrideInternalFl ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = module.initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_NE(pMockCompilerInterface->inputInternalOptions.find("-cl-intel-256-GRF-per-thread"), std::string::npos); EXPECT_EQ(pMockCompilerInterface->inputInternalOptions.find("-cl-intel-128-GRF-per-thread"), std::string::npos); @@ -328,6 +330,7 @@ TEST_F(ModuleTests, givenAutoGrfFlagSetWhenCreatingModuleThenOverrideInternalFla moduleDesc.inputSize = src.size(); auto mockTranslationUnit = new MockModuleTranslationUnit(device); + mockTranslationUnit->processUnpackedBinaryCallBase = false; Module module(device, nullptr, ModuleType::User); module.translationUnit.reset(mockTranslationUnit); @@ -335,6 +338,7 @@ TEST_F(ModuleTests, givenAutoGrfFlagSetWhenCreatingModuleThenOverrideInternalFla ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = module.initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_NE(pMockCompilerInterface->receivedApiOptions.find("-cl-intel-enable-auto-large-GRF-mode"), std::string::npos); EXPECT_EQ(pMockCompilerInterface->receivedApiOptions.find("-cl-intel-256-GRF-per-thread"), std::string::npos); @@ -357,6 +361,7 @@ TEST_F(ModuleTests, givenDefaultGrfFlagSetWhenCreatingModuleThenOverrideInternal moduleDesc.inputSize = src.size(); auto mockTranslationUnit = new MockModuleTranslationUnit(device); + mockTranslationUnit->processUnpackedBinaryCallBase = false; Module module(device, nullptr, ModuleType::User); 
module.translationUnit.reset(mockTranslationUnit); @@ -364,6 +369,7 @@ TEST_F(ModuleTests, givenDefaultGrfFlagSetWhenCreatingModuleThenOverrideInternal ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; result = module.initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); EXPECT_EQ(pMockCompilerInterface->inputInternalOptions.find("-cl-intel-256-GRF-per-thread"), std::string::npos); EXPECT_NE(pMockCompilerInterface->inputInternalOptions.find("-cl-intel-128-GRF-per-thread"), std::string::npos); diff --git a/level_zero/core/test/unit_tests/xe_hpg_core/test_module_xe_hpg_core.cpp b/level_zero/core/test/unit_tests/xe_hpg_core/test_module_xe_hpg_core.cpp index b0217b2d9c..920f7beb37 100644 --- a/level_zero/core/test/unit_tests/xe_hpg_core/test_module_xe_hpg_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpg_core/test_module_xe_hpg_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -50,10 +50,12 @@ HWTEST2_F(KernelPropertyTest, givenDG2WhenGetInternalOptionsThenWriteBackBuildOp auto pMockCompilerInterface = new MockCompilerInterface; auto &rootDeviceEnvironment = this->neoDevice->executionEnvironment->rootDeviceEnvironments[this->neoDevice->getRootDeviceIndex()]; rootDeviceEnvironment->compilerInterface.reset(pMockCompilerInterface); - MockModuleTranslationUnit moduleTu(this->device); + MockModuleTranslationUnit mockTranslationUnit(this->device); + mockTranslationUnit.processUnpackedBinaryCallBase = false; ze_result_t result = ZE_RESULT_ERROR_MODULE_BUILD_FAILURE; - result = moduleTu.buildFromSpirV("", 0U, nullptr, "", nullptr); + result = mockTranslationUnit.buildFromSpirV("", 0U, nullptr, "", nullptr); EXPECT_EQ(result, ZE_RESULT_SUCCESS); + EXPECT_EQ(mockTranslationUnit.processUnpackedBinaryCalled, 1u); 
EXPECT_NE(pMockCompilerInterface->inputInternalOptions.find("-cl-store-cache-default=7 -cl-load-cache-default=4"), std::string::npos); } diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index e937972175..b28dc4f4ab 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -41,6 +41,7 @@ struct AllocationProperties; struct EncodeSurfaceStateArgs; struct RootDeviceEnvironment; struct PipeControlArgs; +struct KernelDescriptor; class ProductHelper; class GfxCoreHelper; @@ -96,6 +97,7 @@ class GfxCoreHelper { virtual bool isLinearStoragePreferred(bool isSharedContext, bool isImage1d, bool forceLinearStorage) const = 0; virtual uint8_t getBarriersCountFromHasBarriers(uint8_t hasBarriers) const = 0; virtual uint32_t calculateAvailableThreadCount(const HardwareInfo &hwInfo, uint32_t grfCount) const = 0; + virtual uint32_t calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize) const = 0; virtual uint32_t alignSlmSize(uint32_t slmSize) const = 0; virtual uint32_t computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const = 0; @@ -276,6 +278,8 @@ class GfxCoreHelperHw : public GfxCoreHelper { uint32_t calculateAvailableThreadCount(const HardwareInfo &hwInfo, uint32_t grfCount) const override; + uint32_t calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize) const override; + uint32_t alignSlmSize(uint32_t slmSize) const override; uint32_t computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const override; diff --git a/shared/source/helpers/gfx_core_helper_bdw_and_later.inl b/shared/source/helpers/gfx_core_helper_bdw_and_later.inl index 2e5b161a49..a19a2c304c 100644 --- a/shared/source/helpers/gfx_core_helper_bdw_and_later.inl +++ b/shared/source/helpers/gfx_core_helper_bdw_and_later.inl @@ -89,6 +89,11 @@ uint32_t GfxCoreHelperHw::calculateAvailableThreadCount(const Hardwar return 
hwInfo.gtSystemInfo.ThreadCount; } +template +inline uint32_t GfxCoreHelperHw::calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize) const { + return defaultMaxGroupSize; +} + template uint64_t GfxCoreHelperHw::getGpuTimeStampInNS(uint64_t timeStamp, double frequency) const { return static_cast(timeStamp * frequency); diff --git a/shared/source/helpers/gfx_core_helper_xehp_and_later.inl b/shared/source/helpers/gfx_core_helper_xehp_and_later.inl index 9067ad2898..47ee9e0b63 100644 --- a/shared/source/helpers/gfx_core_helper_xehp_and_later.inl +++ b/shared/source/helpers/gfx_core_helper_xehp_and_later.inl @@ -12,6 +12,7 @@ #include "shared/source/helpers/heap_assigner.h" #include "shared/source/helpers/pipe_control_args.h" #include "shared/source/kernel/grf_config.h" +#include "shared/source/kernel/kernel_descriptor.h" #include "shared/source/memory_manager/memory_manager.h" #include "aubstream/engine_node.h" @@ -135,6 +136,14 @@ uint32_t GfxCoreHelperHw::calculateAvailableThreadCount(const Hardwar return hwInfo.gtSystemInfo.ThreadCount; } +template +inline uint32_t GfxCoreHelperHw::calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize) const { + if (kernelDescriptor.kernelAttributes.simdSize != 32 && kernelDescriptor.kernelAttributes.numGrfRequired == GrfConfig::LargeGrfNumber) { + defaultMaxGroupSize >>= 1; + } + return defaultMaxGroupSize; +} + template uint64_t GfxCoreHelperHw::getGpuTimeStampInNS(uint64_t timeStamp, double frequency) const { constexpr uint64_t mask = static_cast(std::numeric_limits::max()); diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index f74885ba7f..31ef2f708e 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -16,6 +16,7 @@ #include "shared/source/helpers/pipe_control_args.h" #include 
"shared/source/helpers/preamble.h" #include "shared/source/helpers/string.h" +#include "shared/source/kernel/kernel_descriptor.h" #include "shared/source/memory_manager/allocation_type.h" #include "shared/source/os_interface/os_interface.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" @@ -1436,6 +1437,25 @@ HWTEST_F(GfxCoreHelperTest, whenIsDynamicallyPopulatedisFalseThengetHighestEnabl EXPECT_EQ(maxSlice, hwInfo.gtSystemInfo.MaxSlicesSupported); } +HWTEST2_F(GfxCoreHelperTest, givenLargeGrfIsNotSupportedWhenCalculatingMaxWorkGroupSizeThenAlwaysReturnDeviceDefault, IsAtMostGen12lp) { + auto &gfxCoreHelper = getHelper(); + auto defaultMaxGroupSize = 42u; + + NEO::KernelDescriptor kernelDescriptor{}; + + kernelDescriptor.kernelAttributes.simdSize = 16; + kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::LargeGrfNumber; + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + + kernelDescriptor.kernelAttributes.simdSize = 32; + kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::LargeGrfNumber; + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + + kernelDescriptor.kernelAttributes.simdSize = 16; + kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::DefaultGrfNumber; + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); +} + HWTEST_F(GfxCoreHelperTest, whenIsDynamicallyPopulatedisTrueThengetHighestEnabledSliceReturnsHighestEnabledSliceInfo) { auto hwInfo = *defaultHwInfo; diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests_dg2_and_later.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests_dg2_and_later.cpp index c0d22151cf..079dac96e1 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests_dg2_and_later.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests_dg2_and_later.cpp @@ -9,6 +9,7 @@ #include 
"shared/source/command_stream/linear_stream.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/pipe_control_args.h" +#include "shared/source/kernel/kernel_descriptor.h" #include "shared/source/os_interface/hw_info_config.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/default_hw_info.h" @@ -62,6 +63,27 @@ HWTEST2_F(GfxCoreHelperDg2AndLaterTest, GivenUseL1CacheAsFalseWhenCallSetL1Cache EXPECT_NE(RENDER_SURFACE_STATE::L1_CACHE_POLICY_WB, surfaceState.getL1CachePolicyL1CacheControl()); } +using GfxCoreHelperWithLargeGrf = ::testing::Test; +HWTEST2_F(GfxCoreHelperWithLargeGrf, givenLargeGrfAndSimdSmallerThan32WhenCalculatingMaxWorkGroupSizeThenReturnHalfOfDeviceDefault, IsWithinXeGfxFamily) { + MockExecutionEnvironment mockExecutionEnvironment{}; + auto &gfxCoreHelper = mockExecutionEnvironment.rootDeviceEnvironments[0]->getHelper(); + auto defaultMaxGroupSize = 42u; + + NEO::KernelDescriptor kernelDescriptor{}; + + kernelDescriptor.kernelAttributes.simdSize = 16; + kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::LargeGrfNumber; + EXPECT_EQ((defaultMaxGroupSize >> 1), gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + + kernelDescriptor.kernelAttributes.simdSize = 32; + kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::LargeGrfNumber; + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + + kernelDescriptor.kernelAttributes.simdSize = 16; + kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::DefaultGrfNumber; + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); +} + using PipeControlHelperTestsDg2AndLater = ::testing::Test; HWTEST2_F(PipeControlHelperTestsDg2AndLater, WhenAddingPipeControlWAThenCorrectCommandsAreProgrammed, IsAtLeastXeHpgCore) {