diff --git a/level_zero/core/source/module/module_imp.cpp b/level_zero/core/source/module/module_imp.cpp index 10be269e6c..3781f2777b 100644 --- a/level_zero/core/source/module/module_imp.cpp +++ b/level_zero/core/source/module/module_imp.cpp @@ -624,6 +624,8 @@ ze_result_t ModuleImp::initialize(const ze_module_desc_t *desc, NEO::Device *neo registerElfInDebuggerL0(); + this->defaultMaxGroupSize = static_cast(neoDevice->getDeviceInfo().maxWorkGroupSize); + checkIfPrivateMemoryPerDispatchIsNeeded(); linkageSuccessful = this->linkBinary(); @@ -715,7 +717,7 @@ const KernelImmutableData *ModuleImp::getKernelImmutableData(const char *kernelN } uint32_t ModuleImp::getMaxGroupSize(const NEO::KernelDescriptor &kernelDescriptor) const { - return this->device->getGfxCoreHelper().calculateMaxWorkGroupSize(kernelDescriptor, static_cast(this->device->getDeviceInfo().maxWorkGroupSize)); + return this->device->getGfxCoreHelper().calculateMaxWorkGroupSize(kernelDescriptor, this->defaultMaxGroupSize); } void ModuleImp::createBuildOptions(const char *pBuildFlags, std::string &apiOptions, std::string &internalBuildOptions) { diff --git a/level_zero/core/source/module/module_imp.h b/level_zero/core/source/module/module_imp.h index 2cc26bb988..974631809b 100644 --- a/level_zero/core/source/module/module_imp.h +++ b/level_zero/core/source/module/module_imp.h @@ -162,6 +162,7 @@ struct ModuleImp : public Module { std::unique_ptr translationUnit; ModuleBuildLog *moduleBuildLog = nullptr; NEO::GraphicsAllocation *exportedFunctionsSurface = nullptr; + uint32_t defaultMaxGroupSize = 0U; std::vector> kernelImmDatas; NEO::Linker::RelocatedSymbolsMap symbols; diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp index 78c57d9298..8832238931 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp @@ -58,7 +58,7 @@ ModuleImmutableDataFixture::MockModule::MockModule(L0::Device *device, uint32_t perHwThreadPrivateMemorySize, MockImmutableData *inMockKernelImmData) : ModuleImp(device, moduleBuildLog, type), mockKernelImmData(inMockKernelImmData) { this->mockKernelImmData->setDevice(device); - this->translationUnit.reset(new MockModuleTranslationUnit(this->device)); + this->translationUnit.reset(new MockModuleTranslationUnit(this->translationUnit.get())); } void ModuleImmutableDataFixture::MockModule::checkIfPrivateMemoryPerDispatchIsNeeded() { diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.h b/level_zero/core/test/unit_tests/fixtures/module_fixture.h index 1cab446e2d..c52575b1c8 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.h @@ -51,6 +51,7 @@ struct ModuleImmutableDataFixture : public DeviceFixture { struct MockModule : public L0::ModuleImp { using ModuleImp::allocatePrivateMemoryPerDispatch; + using ModuleImp::defaultMaxGroupSize; using ModuleImp::getKernelImmutableDataVector; using ModuleImp::kernelImmDatas; using ModuleImp::translationUnit; diff --git a/level_zero/core/test/unit_tests/mocks/mock_module.h b/level_zero/core/test/unit_tests/mocks/mock_module.h index ce576abf14..6d6dc1adf8 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_module.h +++ b/level_zero/core/test/unit_tests/mocks/mock_module.h @@ -24,6 +24,26 @@ struct MockModuleTranslationUnit : public L0::ModuleTranslationUnit { MockModuleTranslationUnit(L0::Device *device) : BaseClass{device} {} + MockModuleTranslationUnit(L0::ModuleTranslationUnit *orig) : BaseClass{orig->device} { + std::swap(this->globalConstBuffer, orig->globalConstBuffer); + std::swap(this->globalVarBuffer, orig->globalVarBuffer); + std::swap(this->programInfo, orig->programInfo); + std::swap(this->options, orig->options); + std::swap(this->shouldSuppressRebuildWarning, orig->shouldSuppressRebuildWarning); + std::swap(this->buildLog, orig->buildLog); + std::swap(this->irBinary, orig->irBinary); + std::swap(this->irBinarySize, orig->irBinarySize); + std::swap(this->unpackedDeviceBinary, orig->unpackedDeviceBinary); + std::swap(this->unpackedDeviceBinarySize, orig->unpackedDeviceBinarySize); + std::swap(this->packedDeviceBinary, orig->packedDeviceBinary); + std::swap(this->packedDeviceBinarySize, orig->packedDeviceBinarySize); + std::swap(this->debugData, orig->debugData); + std::swap(this->debugDataSize, orig->debugDataSize); + std::swap(this->alignedvIsas, orig->alignedvIsas); + std::swap(this->specConstantsValues, orig->specConstantsValues); + std::swap(this->isBuiltIn, orig->isBuiltIn); + } + ADDMETHOD(processUnpackedBinary, ze_result_t, true, ZE_RESULT_SUCCESS, (), ()); ze_result_t compileGenBinary(NEO::TranslationInput inputArgs, bool staticLink) override { @@ -62,6 +82,7 @@ struct WhiteBox<::L0::Module> : public ::L0::ModuleImp { using BaseClass::translationUnit; using BaseClass::type; using BaseClass::unresolvedExternalsInfo; + uint32_t &maxGroupSize{BaseClass::defaultMaxGroupSize}; WhiteBox(Device *device, ModuleBuildLog *moduleBuildLog, ModuleType type) : ::L0::ModuleImp{device, moduleBuildLog, type} { @@ -101,11 +122,13 @@ struct MockModule : public L0::ModuleImp { using ModuleImp::populateHostGlobalSymbolsMap; using ModuleImp::symbols; using ModuleImp::translationUnit; + uint32_t &maxGroupSize = ModuleImp::defaultMaxGroupSize; MockModule(L0::Device *device, L0::ModuleBuildLog *moduleBuildLog, L0::ModuleType type) : ModuleImp(device, moduleBuildLog, type) { this->translationUnit.reset(new MockModuleTranslationUnit{device}); + this->maxGroupSize = 32u; }; ~MockModule() override = default; diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index d044f46d73..72cb80ac03 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -944,6 +944,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenRayTracingIsInitialized ModuleType::User, 32u, mockKernelImmutableData.get()); + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -985,6 +986,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndPatchTokenPointerSizeIsZ ModuleType::User, 32u, mockKernelImmutableData.get()); + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1025,6 +1027,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndNoRTDispatchGlobalsIs ModuleType::User, 32u, mockKernelImmutableData.get()); + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1066,6 +1069,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTStackAllocationFail ModuleType::User, 32u, mockKernelImmutableData.get()); + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1108,6 +1112,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTDispatchGlobalsArra ModuleType::User, 32u, mockKernelImmutableData.get()); + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1142,6 +1147,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsFalseThenRayTracingIsNotInitial ModuleType::User, 32u, mockKernelImmutableData.get()); + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); @@ -1182,6 +1188,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenCrossThreadDataIsPatche ModuleType::User, 32u, mockKernelImmutableData.get()); + module->defaultMaxGroupSize = 10; std::unique_ptr kernel; kernel = std::make_unique(module.get()); diff --git a/level_zero/core/test/unit_tests/sources/module/test_module.cpp b/level_zero/core/test/unit_tests/sources/module/test_module.cpp index ea78f1238b..5053bb5c10 100644 --- a/level_zero/core/test/unit_tests/sources/module/test_module.cpp +++ b/level_zero/core/test/unit_tests/sources/module/test_module.cpp @@ -2362,17 +2362,17 @@ kernels: zebin.elfHeader->machine = this->device->getNEODevice()->getHardwareInfo().platform.eProductFamily; MockModule mockModule{this->device, nullptr, ModuleType::User}; - auto maxWorkGroupSize = static_cast(this->neoDevice->deviceInfo.maxWorkGroupSize); + mockModule.maxGroupSize = static_cast(this->device->getDeviceInfo().maxWorkGroupSize); auto mockTU = mockModule.translationUnit.get(); auto result = mockTU->createFromNativeBinary(reinterpret_cast(zebin.storage.data()), zebin.storage.size()); EXPECT_EQ(result, ZE_RESULT_SUCCESS); auto &defaultKernelDescriptor = mockTU->programInfo.kernelInfos[0]->kernelDescriptor; auto &reducedKernelDescriptor = mockTU->programInfo.kernelInfos[1]->kernelDescriptor; - EXPECT_EQ(mockModule.getMaxGroupSize(defaultKernelDescriptor), maxWorkGroupSize); - EXPECT_EQ(mockModule.getMaxGroupSize(reducedKernelDescriptor), (maxWorkGroupSize >> 1)); + EXPECT_EQ(mockModule.getMaxGroupSize(defaultKernelDescriptor), mockModule.maxGroupSize); + EXPECT_EQ(mockModule.getMaxGroupSize(reducedKernelDescriptor), (mockModule.maxGroupSize >> 1)); - uint32_t groupSize[3] = {8, 4, (maxWorkGroupSize >> 5)}; // default max WGS + uint32_t groupSize[3] = {8, 4, (mockModule.maxGroupSize >> 5)}; // default max WGS Mock defaultKernel; defaultKernel.module = &mockModule; defaultKernel.descriptor.kernelAttributes = defaultKernelDescriptor.kernelAttributes; @@ -2409,15 +2409,15 @@ kernels: zebin.elfHeader->machine = this->device->getNEODevice()->getHardwareInfo().platform.eProductFamily; MockModule mockModule{this->device, nullptr, ModuleType::User}; - auto maxWorkGroupSize = static_cast(this->neoDevice->deviceInfo.maxWorkGroupSize); + mockModule.maxGroupSize = static_cast(device->getDeviceInfo().maxWorkGroupSize); auto mockTU = mockModule.translationUnit.get(); auto result = mockTU->createFromNativeBinary(reinterpret_cast(zebin.storage.data()), zebin.storage.size()); EXPECT_EQ(result, ZE_RESULT_SUCCESS); auto &defaultKernelDescriptor = mockTU->programInfo.kernelInfos[0]->kernelDescriptor; auto &reducedKernelDescriptor = mockTU->programInfo.kernelInfos[1]->kernelDescriptor; - EXPECT_EQ(mockModule.getMaxGroupSize(defaultKernelDescriptor), maxWorkGroupSize); - EXPECT_EQ(mockModule.getMaxGroupSize(reducedKernelDescriptor), (maxWorkGroupSize >> 1)); + EXPECT_EQ(mockModule.getMaxGroupSize(defaultKernelDescriptor), mockModule.maxGroupSize); + EXPECT_EQ(mockModule.getMaxGroupSize(reducedKernelDescriptor), (mockModule.maxGroupSize >> 1)); uint32_t groupSize[3] = {0u, 0u, 0u}; Mock defaultKernel; @@ -2425,18 +2425,18 @@ kernels: defaultKernel.descriptor.kernelAttributes = defaultKernelDescriptor.kernelAttributes; EXPECT_EQ(ZE_RESULT_SUCCESS, defaultKernel.suggestGroupSize(4096u, 4096u, 4096u, &groupSize[0], &groupSize[1], &groupSize[2])); EXPECT_GT(groupSize[0] * groupSize[1] * groupSize[2], 0u); - EXPECT_LE(groupSize[0] * groupSize[1] * groupSize[2], maxWorkGroupSize); + EXPECT_LE(groupSize[0] * groupSize[1] * groupSize[2], mockModule.maxGroupSize); groupSize[0] = groupSize[1] = groupSize[2] = 0u; - EXPECT_EQ(ZE_RESULT_SUCCESS, defaultKernel.suggestGroupSize(maxWorkGroupSize, 1u, 1u, &groupSize[0], &groupSize[1], &groupSize[2])); + EXPECT_EQ(ZE_RESULT_SUCCESS, defaultKernel.suggestGroupSize(mockModule.maxGroupSize, 1u, 1u, &groupSize[0], &groupSize[1], &groupSize[2])); EXPECT_GT(groupSize[0] * groupSize[1] * groupSize[2], 0u); - EXPECT_LE(groupSize[0] * groupSize[1] * groupSize[2], maxWorkGroupSize); + EXPECT_LE(groupSize[0] * groupSize[1] * groupSize[2], mockModule.maxGroupSize); groupSize[0] = groupSize[1] = groupSize[2] = 0u; Mock reducedKernel; reducedKernel.module = &mockModule; reducedKernel.descriptor.kernelAttributes = reducedKernelDescriptor.kernelAttributes; - EXPECT_EQ(ZE_RESULT_SUCCESS, reducedKernel.suggestGroupSize(maxWorkGroupSize, 1u, 1u, &groupSize[0], &groupSize[1], &groupSize[2])); + EXPECT_EQ(ZE_RESULT_SUCCESS, reducedKernel.suggestGroupSize(mockModule.maxGroupSize, 1u, 1u, &groupSize[0], &groupSize[1], &groupSize[2])); EXPECT_GT(groupSize[0] * groupSize[1] * groupSize[2], 0u); EXPECT_LE(groupSize[0] * groupSize[1] * groupSize[2], mockModule.getMaxGroupSize(reducedKernelDescriptor)); @@ -3412,6 +3412,8 @@ TEST_F(ModuleTests, givenConstDataStringSectionWhenLinkingModuleThenSegmentIsPat TEST_F(ModuleTests, givenImplicitArgsRelocationAndStackCallsWhenLinkingBuiltinModuleThenSegmentIsNotPatchedAndImplicitArgsAreNotRequired) { auto pModule = std::make_unique>(device, nullptr, ModuleType::Builtin); + pModule->maxGroupSize = 32; + char data[64]{}; auto kernelInfo = new KernelInfo(); kernelInfo->heapInfo.KernelHeapSize = 64; @@ -3452,6 +3454,7 @@ TEST_F(ModuleTests, givenFullyLinkedModuleAndSlmSizeExceedingLocalMemorySizeWhen DebugManager.flags.PrintDebugMessages.set(true); auto pModule = std::make_unique>(device, nullptr, ModuleType::Builtin); + pModule->maxGroupSize = 32; char data[64]{}; std::unique_ptr kernelInfo = std::make_unique(); @@ -3498,6 +3501,7 @@ TEST_F(ModuleTests, givenFullyLinkedModuleWhenCreatingKernelThenDebugMsgOnPrivat DebugManager.flags.PrintDebugMessages.set(true); auto pModule = std::make_unique>(device, nullptr, ModuleType::Builtin); + pModule->maxGroupSize = 32; char data[64]{}; std::unique_ptr kernelInfo = std::make_unique();