diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 35eaaeff63..27150d434d 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -258,7 +258,7 @@ struct CommandListCoreFamily : CommandListImp { const CmdListKernelLaunchParams &launchParams); ze_result_t prepareIndirectParams(const ze_group_count_t *threadGroupDimensions); - void updateStreamProperties(Kernel &kernel, bool isCooperative); + void updateStreamProperties(Kernel &kernel, bool isCooperative, const ze_group_count_t *threadGroupDimensions); void updateStateBaseAddressStreamProperties(Kernel &kernel, bool updateRequiredState, bool captureBaseAddressState); void clearCommandsToPatch(); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 2c5647f2b1..bb39757d43 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2371,15 +2371,21 @@ void CommandListCoreFamily::updateStateBaseAddressStreamPropertie } template -void CommandListCoreFamily::updateStreamProperties(Kernel &kernel, bool isCooperative) { +void CommandListCoreFamily::updateStreamProperties(Kernel &kernel, bool isCooperative, const ze_group_count_t *threadGroupDimensions) { using VFE_STATE_TYPE = typename GfxFamily::VFE_STATE_TYPE; auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment(); auto &kernelAttributes = kernel.getKernelDescriptor().kernelAttributes; bool captureBaseAddressState = containsAnyKernel; + bool fusedEuDisabled = kernelAttributes.flags.requiresDisabledEUFusion; + auto &productHelper = device->getProductHelper(); + if (threadGroupDimensions) { + uint32_t groupCount[3] = {threadGroupDimensions->groupCountX, threadGroupDimensions->groupCountY, threadGroupDimensions->groupCountZ}; + fusedEuDisabled |= 
productHelper.isFusedEuDisabledForDpas(kernelAttributes.flags.usesSystolicPipelineSelectMode, kernel.getGroupSize(), groupCount); + } if (!containsAnyKernel) { - requiredStreamState.frontEndState.setProperties(isCooperative, kernelAttributes.flags.requiresDisabledEUFusion, true, -1, rootDeviceEnvironment); + requiredStreamState.frontEndState.setProperties(isCooperative, fusedEuDisabled, true, -1, rootDeviceEnvironment); requiredStreamState.pipelineSelect.setProperties(true, false, kernelAttributes.flags.usesSystolicPipelineSelectMode, rootDeviceEnvironment); if (!this->isFlushTaskSubmissionEnabled) { @@ -2410,7 +2416,7 @@ void CommandListCoreFamily::updateStreamProperties(Kernel &kernel rootDeviceEnvironment); } - finalStreamState.frontEndState.setProperties(isCooperative, kernelAttributes.flags.requiresDisabledEUFusion, true, -1, rootDeviceEnvironment); + finalStreamState.frontEndState.setProperties(isCooperative, fusedEuDisabled, true, -1, rootDeviceEnvironment); bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get(); if (finalStreamState.frontEndState.isDirty() && logicalStateHelperBlock) { if (isPatchingVfeStateAllowed) { diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index 2af94b313f..f85cd32bf6 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -131,7 +131,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K std::list additionalCommands; - updateStreamProperties(*kernel, launchParams.isCooperative); + updateStreamProperties(*kernel, launchParams.isCooperative, threadGroupDimensions); NEO::EncodeDispatchKernelArgs dispatchKernelArgs{ 0, // eventAddress neoDevice, // device diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 4a9abb3fae..cd8975fa73 100644 --- 
a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -247,7 +247,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K this->containsStatelessUncachedResource |= kernelImp->getKernelRequiresUncachedMocs(); this->requiresQueueUncachedMocs |= kernelImp->getKernelRequiresQueueUncachedMocs(); - updateStreamProperties(*kernel, launchParams.isCooperative); + updateStreamProperties(*kernel, launchParams.isCooperative, threadGroupDimensions); auto localMemSize = static_cast(neoDevice->getDeviceInfo().localMemSize); auto slmTotalSize = kernelImp->getSlmTotalSize(); diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 717e97767e..087239766b 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -368,9 +368,12 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; } + bool requiresEuFusionDisable = kernelImmData->getDescriptor().kernelAttributes.flags.requiresDisabledEUFusion || + neoDevice->getProductHelper().isFusedEuDisabledForDpas(kernelImmData->getDescriptor().kernelAttributes.flags.usesSystolicPipelineSelectMode, nullptr, nullptr); + NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelImmData->getDescriptor().kernelAttributes.usesBarriers(), simd, this->getSlmTotalSize(), neoDevice->getRootDeviceEnvironment(), numThreadsPerSubSlice, localMemSize, - usesImages, false, kernelImmData->getDescriptor().kernelAttributes.flags.requiresDisabledEUFusion); + usesImages, false, requiresEuFusionDisable); NEO::computeWorkgroupSizeND(wsInfo, retGroupSize, workItems, dim); } else { if (1U == dim) { @@ -381,7 +384,6 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz NEO::computeWorkgroupSize2D(maxWorkGroupSize, retGroupSize, workItems, simd); } } - *groupSizeX = 
static_cast(retGroupSize[0]); *groupSizeY = static_cast(retGroupSize[1]); *groupSizeZ = static_cast(retGroupSize[2]); @@ -747,7 +749,12 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties) preferredGroupSizeProperties->preferredMultiple = this->kernelImmData->getKernelInfo()->getMaxSimdSize(); auto &gfxCoreHelper = this->module->getDevice()->getGfxCoreHelper(); - if (gfxCoreHelper.isFusedEuDispatchEnabled(this->module->getDevice()->getHwInfo(), kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion)) { + auto &productHelper = this->module->getDevice()->getProductHelper(); + + bool requiresEuFusionDisabled = kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion || + productHelper.isFusedEuDisabledForDpas(kernelImmData->getDescriptor().kernelAttributes.flags.usesSystolicPipelineSelectMode, nullptr, nullptr); + + if (gfxCoreHelper.isFusedEuDispatchEnabled(this->module->getDevice()->getHwInfo(), requiresEuFusionDisabled)) { preferredGroupSizeProperties->preferredMultiple *= 2; } } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp index 36af1cf36c..34cdf374d0 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp @@ -340,7 +340,8 @@ HWTEST2_F(CommandListAppendLaunchKernel, GivenComputeModePropertiesWhenUpdateStr ASSERT_EQ(ZE_RESULT_SUCCESS, result); const_cast(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x100; - commandList->updateStreamProperties(kernel, false); + const ze_group_count_t launchKernelArgs = {}; + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); if (commandList->stateComputeModeTracking) { EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); @@ -350,7 
+351,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, GivenComputeModePropertiesWhenUpdateStr } const_cast(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x80; - commandList->updateStreamProperties(kernel, false); + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_EQ(productHelper.isGrfNumReportedWithScm(), commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); } @@ -377,7 +378,8 @@ HWTEST2_F(CommandListAppendLaunchKernel, ASSERT_EQ(ZE_RESULT_SUCCESS, result); const_cast(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x100; - commandList->updateStreamProperties(kernel, false); + const ze_group_count_t launchKernelArgs = {}; + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); if (commandList->stateComputeModeTracking) { EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); if (productHelper.isGrfNumReportedWithScm()) { @@ -391,7 +393,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, } const_cast(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x80; - commandList->updateStreamProperties(kernel, false); + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_EQ(productHelper.isGrfNumReportedWithScm(), commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); } @@ -409,7 +411,8 @@ HWTEST2_F(CommandListAppendLaunchKernel, GivenComputeModePropertiesWhenPropertes ASSERT_EQ(ZE_RESULT_SUCCESS, result); const_cast(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x100; - commandList->updateStreamProperties(kernel, false); + const ze_group_count_t launchKernelArgs = {}; + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); if (commandList->stateComputeModeTracking) { 
EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); @@ -418,7 +421,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, GivenComputeModePropertiesWhenPropertes EXPECT_EQ(productHelper.isGrfNumReportedWithScm(), commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); } - commandList->updateStreamProperties(kernel, false); + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index 3171670cee..62faee53dc 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -1374,18 +1374,19 @@ HWTEST2_F(CommandListAppendLaunchKernel, GivenDebugToggleSetWhenUpdateStreamProp ASSERT_EQ(ZE_RESULT_SUCCESS, result); // initial kernel with no policy preference - pCommandList->updateStreamProperties(kernel, false); + const ze_group_count_t launchKernelArgs = {}; + pCommandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_EQ(defaultThreadArbitrationPolicy, pCommandList->finalStreamState.stateComputeMode.threadArbitrationPolicy.value); // policy changed to non-default state pCommandList->finalStreamState.stateComputeMode.threadArbitrationPolicy.value = nonDefaultThreadArbitrationPolicy; // another kernel with no policy preference - do not update policy - pCommandList->updateStreamProperties(kernel, false); + pCommandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_EQ(nonDefaultThreadArbitrationPolicy, 
pCommandList->finalStreamState.stateComputeMode.threadArbitrationPolicy.value); // another kernel with no policy preference, this time with debug toggle set - update policy back to default value DebugManager.flags.ForceDefaultThreadArbitrationPolicyIfNotSpecified.set(true); - pCommandList->updateStreamProperties(kernel, false); + pCommandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_EQ(defaultThreadArbitrationPolicy, pCommandList->finalStreamState.stateComputeMode.threadArbitrationPolicy.value); } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index cdb69a6b7a..166167f7bb 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -372,11 +372,12 @@ HWTEST2_F(CommandListAppendLaunchKernel, whenUpdateStreamPropertiesIsCalledThenR const auto &productHelper = device->getProductHelper(); int32_t expectedDisableOverdispatch = productHelper.isDisableOverdispatchAvailable(*defaultHwInfo) ? 
1 : -1; - pCommandList->updateStreamProperties(kernel, false); + const ze_group_count_t launchKernelArgs = {}; + pCommandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_EQ(expectedDisableOverdispatch, pCommandList->requiredStreamState.frontEndState.disableOverdispatch.value); EXPECT_EQ(expectedDisableOverdispatch, pCommandList->finalStreamState.frontEndState.disableOverdispatch.value); - pCommandList->updateStreamProperties(kernel, false); + pCommandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_EQ(expectedDisableOverdispatch, pCommandList->requiredStreamState.frontEndState.disableOverdispatch.value); EXPECT_EQ(expectedDisableOverdispatch, pCommandList->finalStreamState.frontEndState.disableOverdispatch.value); } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp index 628d91cdc7..bbc572cdad 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp @@ -217,23 +217,24 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenVariousKernelsWhenUpdateStreamProp auto &productHelper = device->getProductHelper(); int32_t expectedDispatchAllWalkerEnable = productHelper.isComputeDispatchAllWalkerEnableInCfeStateRequired(device->getHwInfo()) ? 
0 : -1; - pCommandList->updateStreamProperties(defaultKernel, false); + const ze_group_count_t launchKernelArgs = {}; + pCommandList->updateStreamProperties(defaultKernel, false, &launchKernelArgs); EXPECT_EQ(expectedDispatchAllWalkerEnable, pCommandList->requiredStreamState.frontEndState.computeDispatchAllWalkerEnable.value); EXPECT_EQ(expectedDispatchAllWalkerEnable, pCommandList->finalStreamState.frontEndState.computeDispatchAllWalkerEnable.value); EXPECT_EQ(0u, pCommandList->commandsToPatch.size()); pCommandList->reset(); - pCommandList->updateStreamProperties(cooperativeKernel, true); - pCommandList->updateStreamProperties(cooperativeKernel, true); + pCommandList->updateStreamProperties(cooperativeKernel, true, &launchKernelArgs); + pCommandList->updateStreamProperties(cooperativeKernel, true, &launchKernelArgs); expectedDispatchAllWalkerEnable = expectedDispatchAllWalkerEnable != -1 ? 1 : expectedDispatchAllWalkerEnable; EXPECT_EQ(expectedDispatchAllWalkerEnable, pCommandList->requiredStreamState.frontEndState.computeDispatchAllWalkerEnable.value); EXPECT_EQ(expectedDispatchAllWalkerEnable, pCommandList->finalStreamState.frontEndState.computeDispatchAllWalkerEnable.value); EXPECT_EQ(0u, pCommandList->commandsToPatch.size()); pCommandList->reset(); - pCommandList->updateStreamProperties(defaultKernel, false); - pCommandList->updateStreamProperties(cooperativeKernel, true); + pCommandList->updateStreamProperties(defaultKernel, false, &launchKernelArgs); + pCommandList->updateStreamProperties(cooperativeKernel, true, &launchKernelArgs); expectedDispatchAllWalkerEnable = expectedDispatchAllWalkerEnable != -1 ? 0 : expectedDispatchAllWalkerEnable; EXPECT_EQ(expectedDispatchAllWalkerEnable, pCommandList->requiredStreamState.frontEndState.computeDispatchAllWalkerEnable.value); expectedDispatchAllWalkerEnable = expectedDispatchAllWalkerEnable != -1 ? 
1 : expectedDispatchAllWalkerEnable; @@ -242,18 +243,18 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenVariousKernelsWhenUpdateStreamProp EXPECT_EQ(expectedCommandsToPatch, pCommandList->commandsToPatch.size()); pCommandList->reset(); - pCommandList->updateStreamProperties(cooperativeKernel, true); - pCommandList->updateStreamProperties(defaultKernel, false); - pCommandList->updateStreamProperties(cooperativeKernel, true); + pCommandList->updateStreamProperties(cooperativeKernel, true, &launchKernelArgs); + pCommandList->updateStreamProperties(defaultKernel, false, &launchKernelArgs); + pCommandList->updateStreamProperties(cooperativeKernel, true, &launchKernelArgs); EXPECT_EQ(expectedDispatchAllWalkerEnable, pCommandList->requiredStreamState.frontEndState.computeDispatchAllWalkerEnable.value); EXPECT_EQ(expectedDispatchAllWalkerEnable, pCommandList->finalStreamState.frontEndState.computeDispatchAllWalkerEnable.value); expectedCommandsToPatch = expectedCommandsToPatch != 0 ? 2 : 0; EXPECT_EQ(expectedCommandsToPatch, pCommandList->commandsToPatch.size()); pCommandList->reset(); - pCommandList->updateStreamProperties(defaultKernel, false); - pCommandList->updateStreamProperties(defaultKernel, false); - pCommandList->updateStreamProperties(cooperativeKernel, true); + pCommandList->updateStreamProperties(defaultKernel, false, &launchKernelArgs); + pCommandList->updateStreamProperties(defaultKernel, false, &launchKernelArgs); + pCommandList->updateStreamProperties(cooperativeKernel, true, &launchKernelArgs); expectedDispatchAllWalkerEnable = expectedDispatchAllWalkerEnable != -1 ? 0 : expectedDispatchAllWalkerEnable; EXPECT_EQ(expectedDispatchAllWalkerEnable, pCommandList->requiredStreamState.frontEndState.computeDispatchAllWalkerEnable.value); expectedDispatchAllWalkerEnable = expectedDispatchAllWalkerEnable != -1 ? 
1 : expectedDispatchAllWalkerEnable; @@ -283,14 +284,15 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenVariousKernelsAndPatchingDisallowe auto result = pCommandList->initialize(device, NEO::EngineGroupType::Compute, 0u); ASSERT_EQ(ZE_RESULT_SUCCESS, result); - pCommandList->updateStreamProperties(defaultKernel, false); - pCommandList->updateStreamProperties(cooperativeKernel, true); + const ze_group_count_t launchKernelArgs = {}; + pCommandList->updateStreamProperties(defaultKernel, false, &launchKernelArgs); + pCommandList->updateStreamProperties(cooperativeKernel, true, &launchKernelArgs); EXPECT_EQ(0u, pCommandList->commandsToPatch.size()); pCommandList->reset(); DebugManager.flags.AllowPatchingVfeStateInCommandLists.set(1); - pCommandList->updateStreamProperties(defaultKernel, false); - pCommandList->updateStreamProperties(cooperativeKernel, true); + pCommandList->updateStreamProperties(defaultKernel, false, &launchKernelArgs); + pCommandList->updateStreamProperties(cooperativeKernel, true, &launchKernelArgs); const auto &productHelper = device->getProductHelper(); size_t expectedCmdsToPatch = productHelper.isComputeDispatchAllWalkerEnableInCfeStateRequired(device->getHwInfo()) ? 
1 : 0; diff --git a/level_zero/core/test/unit_tests/xe_hpg_core/dg2/test_cmdlist_dg2.cpp b/level_zero/core/test/unit_tests/xe_hpg_core/dg2/test_cmdlist_dg2.cpp index 0d1a5ed3c2..f04d62b1f2 100644 --- a/level_zero/core/test/unit_tests/xe_hpg_core/dg2/test_cmdlist_dg2.cpp +++ b/level_zero/core/test/unit_tests/xe_hpg_core/dg2/test_cmdlist_dg2.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,6 +13,7 @@ #include "level_zero/core/source/xe_hpg_core/cmdlist_xe_hpg_core.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" #include "level_zero/core/test/unit_tests/fixtures/module_fixture.h" +#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" #include "level_zero/core/test/unit_tests/mocks/mock_module.h" namespace L0 { @@ -61,5 +62,40 @@ HWTEST2_F(CommandListTests, givenDG2WithBSteppingWhenCreatingCommandListThenAddi EXPECT_TRUE(cmdSba->getDynamicStateBaseAddressModifyEnable()); EXPECT_TRUE(cmdSba->getDynamicStateBufferSizeModifyEnable()); } +HWTEST2_F(CommandListTests, GivenKernelWithDpasWhenLwsIsOddThenFesedEuIsDisabled, IsDG2) { + Mock<::L0::Kernel> kernel; + auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); + kernel.module = pMockModule.get(); + + auto commandList = std::make_unique>>(); + auto result = commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + const_cast(&kernel.getKernelDescriptor())->kernelAttributes.flags.usesSystolicPipelineSelectMode = true; + const ze_group_count_t launchKernelArgs = {3, 1, 1}; + kernel.groupSize[0] = 7; + kernel.groupSize[1] = 1; + kernel.groupSize[2] = 1; + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); + EXPECT_TRUE(commandList->finalStreamState.frontEndState.disableEUFusion.value); +} +HWTEST2_F(CommandListTests, GivenKernelWithDpasWhenLwsIsNonOddThenFesedEuIsNotDisabled, IsDG2) { + Mock<::L0::Kernel> 
kernel; + auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); + kernel.module = pMockModule.get(); + + auto commandList = std::make_unique>>(); + auto result = commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + const_cast(&kernel.getKernelDescriptor())->kernelAttributes.flags.usesSystolicPipelineSelectMode = true; + const ze_group_count_t launchKernelArgs = {3, 1, 1}; + kernel.groupSize[0] = 8; + kernel.groupSize[1] = 1; + kernel.groupSize[2] = 1; + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); + EXPECT_FALSE(commandList->finalStreamState.frontEndState.disableEUFusion.value); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/xe_hpg_core/dg2/test_kernel_dg2.cpp b/level_zero/core/test/unit_tests/xe_hpg_core/dg2/test_kernel_dg2.cpp index 89a1f7e2a5..b39698edfa 100644 --- a/level_zero/core/test/unit_tests/xe_hpg_core/dg2/test_kernel_dg2.cpp +++ b/level_zero/core/test/unit_tests/xe_hpg_core/dg2/test_kernel_dg2.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2022-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -69,5 +69,115 @@ HWTEST2_F(KernelTestDG2, givenKernelImpWhenSetBufferSurfaceStateCalledThenProgra context->freeMem(devicePtr); } +HWTEST2_F(KernelTestDG2, givenKernelImpWithDpasWhenSuggestingWGSizeThenSizeIsDifferntThanWithoutDpas, IsDG2) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + ze_kernel_handle_t kernelHandle; + + ze_kernel_desc_t kernelDesc = {}; + kernelDesc.pKernelName = kernelName.c_str(); + + ze_result_t res = module->createKernel(&kernelDesc, &kernelHandle); + + ASSERT_EQ(ZE_RESULT_SUCCESS, res); + + auto kernelImp = reinterpret_cast(L0::Kernel::fromHandle(kernelHandle)); + + reinterpret_cast(module->getDevice()->getNEODevice())->deviceInfo.maxNumEUsPerSubSlice = 16; + 
reinterpret_cast(module->getDevice()->getNEODevice())->deviceInfo.numThreadsPerEU = 8; + uint32_t groupSizeXDpas = 79u; + uint32_t groupSizeYDpas = 14; + uint32_t groupSizeZDpas = 1u; + const_cast(kernelImp->getImmutableData()->getDescriptor()).kernelAttributes.flags.usesSystolicPipelineSelectMode = true; + kernelImp->suggestGroupSize(groupSizeXDpas, groupSizeYDpas, groupSizeZDpas, &groupSizeXDpas, &groupSizeYDpas, &groupSizeZDpas); + uint32_t groupSizeXNoDpas = 79u; + uint32_t groupSizeYNoDpas = 14u; + uint32_t groupSizeZNoDpas = 1u; + const_cast(kernelImp->getImmutableData()->getDescriptor()).kernelAttributes.flags.usesSystolicPipelineSelectMode = false; + kernelImp->suggestGroupSize(groupSizeXNoDpas, groupSizeYNoDpas, groupSizeZNoDpas, &groupSizeXNoDpas, &groupSizeYNoDpas, &groupSizeZNoDpas); + EXPECT_TRUE(groupSizeXDpas != groupSizeXNoDpas || groupSizeYDpas != groupSizeYNoDpas); + Kernel::fromHandle(kernelHandle)->destroy(); +} + +HWTEST2_F(KernelTestDG2, givenKernelImpWithFusedEuDisabledWhenSuggestingWGSizeThenSizeIsDifferntThanWithoutDpas, IsDG2) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + ze_kernel_handle_t kernelHandle; + + ze_kernel_desc_t kernelDesc = {}; + kernelDesc.pKernelName = kernelName.c_str(); + + ze_result_t res = module->createKernel(&kernelDesc, &kernelHandle); + + ASSERT_EQ(ZE_RESULT_SUCCESS, res); + + auto kernelImp = reinterpret_cast(L0::Kernel::fromHandle(kernelHandle)); + + reinterpret_cast(module->getDevice()->getNEODevice())->deviceInfo.maxNumEUsPerSubSlice = 16; + reinterpret_cast(module->getDevice()->getNEODevice())->deviceInfo.numThreadsPerEU = 8; + uint32_t groupSizeXEuFusionDisabled = 79u; + uint32_t groupSizeYEuFusionDisabled = 14; + uint32_t groupSizeZEuFusionDisabled = 1u; + const_cast(kernelImp->getImmutableData()->getDescriptor()).kernelAttributes.flags.requiresDisabledEUFusion = true; + kernelImp->suggestGroupSize(groupSizeXEuFusionDisabled, groupSizeYEuFusionDisabled, 
groupSizeZEuFusionDisabled, &groupSizeXEuFusionDisabled, &groupSizeYEuFusionDisabled, &groupSizeZEuFusionDisabled); + uint32_t groupSizeXNoEuFusionDisabled = 79u; + uint32_t groupSizeYNoEuFusionDisabled = 14; + uint32_t groupSizeZNoEuFusionDisabled = 1u; + const_cast(kernelImp->getImmutableData()->getDescriptor()).kernelAttributes.flags.requiresDisabledEUFusion = false; + kernelImp->suggestGroupSize(groupSizeXNoEuFusionDisabled, groupSizeYNoEuFusionDisabled, groupSizeZNoEuFusionDisabled, &groupSizeXNoEuFusionDisabled, &groupSizeYNoEuFusionDisabled, &groupSizeZNoEuFusionDisabled); + EXPECT_TRUE(groupSizeXEuFusionDisabled != groupSizeXNoEuFusionDisabled || groupSizeYEuFusionDisabled != groupSizeYNoEuFusionDisabled); + Kernel::fromHandle(kernelHandle)->destroy(); +} + +HWTEST2_F(KernelTestDG2, givenKernelImpWithDpasWhenGetPreferredWorkGroupSizeThenReturnedSizeIsLowerThanSizeForKernelWithoutDpas, IsDG2) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + ze_kernel_handle_t kernelHandle; + + ze_kernel_desc_t kernelDesc = {}; + kernelDesc.pKernelName = kernelName.c_str(); + + ze_result_t res = module->createKernel(&kernelDesc, &kernelHandle); + + ASSERT_EQ(ZE_RESULT_SUCCESS, res); + + auto kernelImp = reinterpret_cast(L0::Kernel::fromHandle(kernelHandle)); + ze_kernel_properties_t properties = {}; + ze_kernel_preferred_group_size_properties_t extProperties = {}; + extProperties.stype = ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES; + properties.pNext = &extProperties; + const_cast(kernelImp->getImmutableData()->getDescriptor()).kernelAttributes.flags.usesSystolicPipelineSelectMode = true; + kernelImp->getProperties(&properties); + auto sizeWithDpas = extProperties.preferredMultiple; + const_cast(kernelImp->getImmutableData()->getDescriptor()).kernelAttributes.flags.usesSystolicPipelineSelectMode = false; + kernelImp->getProperties(&properties); + auto sizeWithoutDpas = extProperties.preferredMultiple; + EXPECT_NE(sizeWithDpas, 
sizeWithoutDpas); + Kernel::fromHandle(kernelHandle)->destroy(); +} + +HWTEST2_F(KernelTestDG2, givenKernelImpWithFusedEuDisabledWhenGetPreferredWorkGroupSizeThenReturnedSizeIsLowerThanSizeForKernelWithoutFusedEuEnabled, IsDG2) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + ze_kernel_handle_t kernelHandle; + + ze_kernel_desc_t kernelDesc = {}; + kernelDesc.pKernelName = kernelName.c_str(); + + ze_result_t res = module->createKernel(&kernelDesc, &kernelHandle); + + ASSERT_EQ(ZE_RESULT_SUCCESS, res); + + auto kernelImp = reinterpret_cast(L0::Kernel::fromHandle(kernelHandle)); + ze_kernel_properties_t properties = {}; + ze_kernel_preferred_group_size_properties_t extProperties = {}; + extProperties.stype = ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES; + properties.pNext = &extProperties; + const_cast(kernelImp->getImmutableData()->getDescriptor()).kernelAttributes.flags.requiresDisabledEUFusion = true; + kernelImp->getProperties(&properties); + auto sizeWithDpas = extProperties.preferredMultiple; + const_cast(kernelImp->getImmutableData()->getDescriptor()).kernelAttributes.flags.requiresDisabledEUFusion = false; + kernelImp->getProperties(&properties); + auto sizeWithoutDpas = extProperties.preferredMultiple; + EXPECT_NE(sizeWithDpas, sizeWithoutDpas); + Kernel::fromHandle(kernelHandle)->destroy(); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/xe_hpg_core/test_cmdlist_xe_hpg_core.cpp b/level_zero/core/test/unit_tests/xe_hpg_core/test_cmdlist_xe_hpg_core.cpp index d928aad942..9357cdd5b3 100644 --- a/level_zero/core/test/unit_tests/xe_hpg_core/test_cmdlist_xe_hpg_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpg_core/test_cmdlist_xe_hpg_core.cpp @@ -348,7 +348,8 @@ HWTEST2_F(CommandListCreate, GivenComputeModePropertiesWhenUpdateStreamPropertie ASSERT_EQ(ZE_RESULT_SUCCESS, result); const_cast(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x100; - 
commandList->updateStreamProperties(kernel, false); + const ze_group_count_t launchKernelArgs = {}; + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); if (commandList->stateComputeModeTracking) { EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); if (productHelper.isGrfNumReportedWithScm()) { @@ -361,7 +362,7 @@ HWTEST2_F(CommandListCreate, GivenComputeModePropertiesWhenUpdateStreamPropertie EXPECT_TRUE(commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); } const_cast(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x80; - commandList->updateStreamProperties(kernel, false); + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_TRUE(commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); } @@ -385,7 +386,8 @@ HWTEST2_F(CommandListCreate, GivenComputeModePropertiesWhenUpdateStreamPropertie auto result = commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); ASSERT_EQ(ZE_RESULT_SUCCESS, result); const_cast(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x100; - commandList->updateStreamProperties(kernel, false); + const ze_group_count_t launchKernelArgs = {}; + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); if (commandList->stateComputeModeTracking) { EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); @@ -393,9 +395,8 @@ HWTEST2_F(CommandListCreate, GivenComputeModePropertiesWhenUpdateStreamPropertie EXPECT_TRUE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); EXPECT_TRUE(commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); } - const_cast(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x80; - 
commandList->updateStreamProperties(kernel, false); + commandList->updateStreamProperties(kernel, false, &launchKernelArgs); EXPECT_TRUE(commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty); EXPECT_FALSE(commandList->finalStreamState.stateComputeMode.isCoherencyRequired.isDirty); } diff --git a/opencl/source/command_queue/cl_local_work_size.cpp b/opencl/source/command_queue/cl_local_work_size.cpp index c92d6a5764..e6ca56851d 100644 --- a/opencl/source/command_queue/cl_local_work_size.cpp +++ b/opencl/source/command_queue/cl_local_work_size.cpp @@ -97,6 +97,9 @@ WorkSizeInfo createWorkSizeInfoFromDispatchInfo(const DispatchInfo &dispatchInfo const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo(); auto numThreadsPerSubSlice = static_cast(device.getSharedDeviceInfo().maxNumEUsPerSubSlice) * device.getSharedDeviceInfo().numThreadsPerEU; + bool requiresEuFusionDisabled = kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion || + device.getProductHelper().isFusedEuDisabledForDpas(kernelInfo.kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode, nullptr, nullptr); + WorkSizeInfo wsInfo(dispatchInfo.getKernel()->getMaxKernelWorkGroupSize(), kernelInfo.kernelDescriptor.kernelAttributes.usesBarriers(), static_cast(kernelInfo.getMaxSimdSize()), @@ -106,7 +109,8 @@ WorkSizeInfo createWorkSizeInfoFromDispatchInfo(const DispatchInfo &dispatchInfo static_cast(device.getSharedDeviceInfo().localMemSize), false, false, - kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion); + requiresEuFusionDisabled); + wsInfo.setIfUseImg(kernelInfo); return wsInfo; diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index d6bc71c74b..8b2cb0ddbd 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -855,7 +855,8 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( 
dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired; dispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode = systolicPipelineSelectMode; - dispatchFlags.disableEUFusion = kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion; + dispatchFlags.disableEUFusion = kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion || + device->getProductHelper().isFusedEuDisabledForDpas(kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode, *kernel->getLocalWorkSizeValues().data(), *kernel->getNumWorkGroupsValues().data()); const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 3865dca370..66a478bd1c 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -567,6 +567,7 @@ cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName, size_t maxWorkgroupSize; const auto &hwInfo = clDevice.getHardwareInfo(); auto &gfxCoreHelper = clDevice.getGfxCoreHelper(); + auto &productHelper = clDevice.getProductHelper(); auto &clGfxCoreHelper = clDevice.getRootDeviceEnvironment().getHelper(); GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet); @@ -597,7 +598,8 @@ cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName, case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: preferredWorkGroupSizeMultiple = kernelInfo.getMaxSimdSize(); - if (gfxCoreHelper.isFusedEuDispatchEnabled(hwInfo, kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion)) { + if (gfxCoreHelper.isFusedEuDispatchEnabled(hwInfo, kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion) && + !productHelper.isFusedEuDisabledForDpas(kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode, nullptr, nullptr)) { preferredWorkGroupSizeMultiple *= 2; } srcSize = 
sizeof(preferredWorkGroupSizeMultiple); diff --git a/opencl/test/unit_test/xe_hpg_core/CMakeLists.txt b/opencl/test/unit_test/xe_hpg_core/CMakeLists.txt index 451bcaebe1..b9b69a1ff3 100644 --- a/opencl/test/unit_test/xe_hpg_core/CMakeLists.txt +++ b/opencl/test/unit_test/xe_hpg_core/CMakeLists.txt @@ -17,6 +17,7 @@ if(TESTS_XE_HPG_CORE) ${CMAKE_CURRENT_SOURCE_DIR}/command_stream_receiver_hw_tests_xe_hpg_core.cpp ${CMAKE_CURRENT_SOURCE_DIR}/copy_engine_tests_xe_hpg_core.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image_tests_xe_hpg_core.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/local_work_size_tests_dg2.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_cl_device_caps_xe_hpg_core.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_cmds_programming_xe_hpg_core.cpp ) diff --git a/opencl/test/unit_test/xe_hpg_core/dg2/CMakeLists.txt b/opencl/test/unit_test/xe_hpg_core/dg2/CMakeLists.txt index c2bc7b0d61..9ba13c45af 100644 --- a/opencl/test/unit_test/xe_hpg_core/dg2/CMakeLists.txt +++ b/opencl/test/unit_test/xe_hpg_core/dg2/CMakeLists.txt @@ -13,6 +13,7 @@ if(TESTS_DG2) set(IGDRCL_SRCS_tests_xe_hpg_core_dg2 ${IGDRCL_SRCS_tests_xe_hpg_core_dg2_excludes} ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_tests_dg2.cpp ${CMAKE_CURRENT_SOURCE_DIR}/buffer_pool_alloc_tests_dg2.cpp ${CMAKE_CURRENT_SOURCE_DIR}/get_device_info_dg2.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_cmds_programming_dg2.cpp diff --git a/opencl/test/unit_test/xe_hpg_core/dg2/command_queue_tests_dg2.cpp b/opencl/test/unit_test/xe_hpg_core/dg2/command_queue_tests_dg2.cpp new file mode 100644 index 0000000000..c8e4bbe3e2 --- /dev/null +++ b/opencl/test/unit_test/xe_hpg_core/dg2/command_queue_tests_dg2.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_stream/scratch_space_controller_base.h" +#include "shared/source/os_interface/hw_info_config.h" +#include "shared/source/xe_hpg_core/hw_cmds_dg2.h" +#include 
"shared/test/common/helpers/engine_descriptor_helper.h"
+#include "shared/test/common/mocks/mock_command_stream_receiver.h"
+#include "shared/test/common/test_macros/header/per_product_test_definitions.h"
+#include "shared/test/common/test_macros/test.h"
+
+#include "opencl/source/event/event_builder.h"
+#include "opencl/source/helpers/task_information.h"
+#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
+#include "opencl/test/unit_test/mocks/mock_command_queue.h"
+#include "opencl/test/unit_test/mocks/mock_kernel.h"
+#include "opencl/test/unit_test/mocks/mock_mdi.h"
+
+using namespace NEO;
+
+class MyMockCommandStreamReceiver : public MockCommandStreamReceiver { // CSR mock that remembers the disableEUFusion flag handed to flushTask
+  public:
+    using CommandStreamReceiver::scratchSpaceController; // re-exported so the tests can install their own scratch controller
+    MyMockCommandStreamReceiver(ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield)
+        : MockCommandStreamReceiver(executionEnvironment, rootDeviceIndex, deviceBitfield) {}
+    CompletionStamp flushTask(
+        LinearStream &commandStream,
+        size_t commandStreamStart,
+        const IndirectHeap *dsh,
+        const IndirectHeap *ioh,
+        const IndirectHeap *ssh,
+        TaskCountType taskLevel,
+        DispatchFlags &dispatchFlags,
+        Device &device) override {
+        disableEuFusionPassed = dispatchFlags.disableEUFusion; // record the flag, then forward the call unchanged to the base mock
+        return MockCommandStreamReceiver::flushTask(commandStream, commandStreamStart, dsh, ioh, ssh, taskLevel, dispatchFlags, device);
+    }
+    bool disableEuFusionPassed = false; // value of dispatchFlags.disableEUFusion seen by the most recent flushTask call
+};
+template
+class MockCmdQueueOverrideCsr : public MockCommandQueueHw { // command queue mock that always hands out the CSR injected through its constructor
+  public:
+    MockCmdQueueOverrideCsr(Context *context,
+                            ClDevice *device,
+                            MyMockCommandStreamReceiver *csr) : MockCommandQueueHw(context, device, nullptr) {
+        this->csr = csr;
+    }
+    CommandStreamReceiver &getGpgpuCommandStreamReceiver() const override { return *csr; } // non-owning: the test keeps the CSR alive
+    MyMockCommandStreamReceiver *csr = nullptr;
+};
+
+DG2TEST_F(CommandQueueHwTest, GivenKernelWithDpasAndOddWorkGroupWhenenqueueNonBlockedCalledThenDisableEuFusionPassedToFlushTask) { // systolic kernel + odd local size => flushTask must see disableEUFusion == true
+    auto hardwareInfo = *defaultHwInfo;
+    auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&hardwareInfo, 0));
+    std::unique_ptr osContext(OsContext::create(mockDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), mockDevice->getRootDeviceIndex(), 0,
+                                                EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular},
+                                                                                             PreemptionMode::ThreadGroup, mockDevice->getDeviceBitfield())));
+    auto csr = std::make_unique(*mockDevice->getExecutionEnvironment(), mockDevice->getRootDeviceIndex(), mockDevice->getDeviceBitfield());
+    csr->setupContext(*osContext);
+    auto scratchController = new ScratchSpaceControllerBase(pDevice->getRootDeviceIndex(), *pDevice->executionEnvironment, *csr->getInternalAllocationStorage()); // NOTE(review): built from the fixture's pDevice, not from mockDevice — confirm this mix is intended
+    csr->scratchSpaceController.reset(scratchController); // the CSR takes ownership of the raw pointer here
+    MockCmdQueueOverrideCsr cmdQ(pContext, mockDevice.get(), csr.get());
+    MockKernelWithInternals mockKernelWithInternals(*mockDevice.get());
+    auto pKernel = mockKernelWithInternals.mockKernel;
+    MockMultiDispatchInfo multiDispatchInfo(mockDevice.get(), pKernel);
+    BlitPropertiesContainer blitPropertiesContainer;
+    const EnqueueProperties enqueueProperties(false, true, false, false, false, &blitPropertiesContainer);
+    TimestampPacketDependencies timestampPacketDependencies;
+    EventsRequest eventsRequest(0, nullptr, nullptr);
+    EventBuilder eventBuilder;
+    LinearStream commandStream;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.localWorkSize[0] = 0; // presumably payload offsets for the local-size / group-count values — TODO confirm
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.localWorkSize[1] = 4;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.localWorkSize[2] = 8;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.numWorkGroups[0] = 12;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.numWorkGroups[1] = 16;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.numWorkGroups[2] = 20;
+
+    pKernel->setLocalWorkSizeValues(3, 7, 1); // 3 * 7 * 1 = 21 work-items per group: odd
+    pKernel->setNumWorkGroupsValues(5, 1, 1);
+
+    bool blocking = false;
+    const_cast(pKernel->getDescriptor()).kernelAttributes.flags.usesSystolicPipelineSelectMode = true; // this is the flag the enqueue path passes on as "kernel has DPAS instructions"
+    cmdQ.template enqueueNonBlocked(nullptr, 0, commandStream, commandStream.getUsed(), blocking, true, multiDispatchInfo, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, nullptr);
+    EXPECT_TRUE(csr->disableEuFusionPassed);
+}
+
+DG2TEST_F(CommandQueueHwTest, GivenKernelWithDpasAndNotOddWorkGroupWhenenqueueNonBlockedCalledThenDisableEuFusionNotPassedToFlushTask) { // systolic kernel + even local size => flushTask must see disableEUFusion == false
+    auto hardwareInfo = *defaultHwInfo;
+    auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&hardwareInfo, 0));
+    std::unique_ptr osContext(OsContext::create(mockDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), mockDevice->getRootDeviceIndex(), 0,
+                                                EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular},
+                                                                                             PreemptionMode::ThreadGroup, mockDevice->getDeviceBitfield())));
+    auto csr = std::make_unique(*mockDevice->getExecutionEnvironment(), mockDevice->getRootDeviceIndex(), mockDevice->getDeviceBitfield());
+    csr->setupContext(*osContext);
+    auto scratchController = new ScratchSpaceControllerBase(pDevice->getRootDeviceIndex(), *pDevice->executionEnvironment, *csr->getInternalAllocationStorage());
+    csr->scratchSpaceController.reset(scratchController);
+    MockCmdQueueOverrideCsr cmdQ(pContext, mockDevice.get(), csr.get());
+    MockKernelWithInternals mockKernelWithInternals(*mockDevice.get());
+    auto pKernel = mockKernelWithInternals.mockKernel;
+    MockMultiDispatchInfo multiDispatchInfo(mockDevice.get(), pKernel);
+    BlitPropertiesContainer blitPropertiesContainer;
+    const EnqueueProperties enqueueProperties(false, true, false, false, false, &blitPropertiesContainer);
+    TimestampPacketDependencies timestampPacketDependencies;
+    EventsRequest eventsRequest(0, nullptr, nullptr);
+    EventBuilder eventBuilder;
+    LinearStream commandStream;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.localWorkSize[0] = 0;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.localWorkSize[1] = 4;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.localWorkSize[2] = 8;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.numWorkGroups[0] = 12;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.numWorkGroups[1] = 16;
+    const_cast(pKernel->getDescriptor()).payloadMappings.dispatchTraits.numWorkGroups[2] = 20;
+
+    pKernel->setLocalWorkSizeValues(4, 7, 1); // 4 * 7 * 1 = 28 work-items per group: even
+    pKernel->setNumWorkGroupsValues(5, 1, 1);
+
+    bool blocking = false;
+    const_cast(pKernel->getDescriptor()).kernelAttributes.flags.usesSystolicPipelineSelectMode = true;
+    cmdQ.template enqueueNonBlocked(nullptr, 0, commandStream, commandStream.getUsed(), blocking, true, multiDispatchInfo, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, nullptr);
+    EXPECT_FALSE(csr->disableEuFusionPassed);
+}
+DG2TEST_F(CommandQueueHwTest, GivenKernelWithRequiredDisableEuFusionWhenenqueueNonBlockedCalledThenDisableEuFusionPassedToFlushTask) { // requiresDisabledEUFusion alone must be enough to set the dispatch flag
+    auto hardwareInfo = *defaultHwInfo;
+    auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&hardwareInfo, 0));
+    std::unique_ptr osContext(OsContext::create(mockDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), mockDevice->getRootDeviceIndex(), 0,
+                                                EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular},
+                                                                                             PreemptionMode::ThreadGroup, mockDevice->getDeviceBitfield())));
+    auto csr = std::make_unique(*mockDevice->getExecutionEnvironment(), mockDevice->getRootDeviceIndex(), mockDevice->getDeviceBitfield());
+    csr->setupContext(*osContext);
+    auto scratchController = new ScratchSpaceControllerBase(pDevice->getRootDeviceIndex(), *pDevice->executionEnvironment, *csr->getInternalAllocationStorage());
+    csr->scratchSpaceController.reset(scratchController);
+    MockCmdQueueOverrideCsr cmdQ(pContext, mockDevice.get(), csr.get());
+    MockKernelWithInternals mockKernelWithInternals(*mockDevice.get());
+    auto pKernel = mockKernelWithInternals.mockKernel;
+    MockMultiDispatchInfo multiDispatchInfo(mockDevice.get(), pKernel);
+    BlitPropertiesContainer blitPropertiesContainer;
+    const EnqueueProperties enqueueProperties(false, true, false, false, false, &blitPropertiesContainer);
+    TimestampPacketDependencies timestampPacketDependencies;
+    EventsRequest eventsRequest(0, nullptr, nullptr);
+    EventBuilder eventBuilder;
+    LinearStream commandStream;
+
+    bool blocking = false;
+    const_cast(pKernel->getDescriptor()).kernelAttributes.flags.requiresDisabledEUFusion = true;
+    cmdQ.template enqueueNonBlocked(nullptr, 0, commandStream, commandStream.getUsed(), blocking, true, multiDispatchInfo, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, nullptr);
+    EXPECT_TRUE(csr->disableEuFusionPassed);
+}
+DG2TEST_F(CommandQueueHwTest, GivenKernelWithoutRequiredDisableEuFusionWhenenqueueNonBlockedCalledThenDisableEuFusionNotPassedToFlushTask) { // with requiresDisabledEUFusion cleared the dispatch flag is expected to stay false
+    auto hardwareInfo = *defaultHwInfo;
+    auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&hardwareInfo, 0));
+    std::unique_ptr osContext(OsContext::create(mockDevice->getExecutionEnvironment()->rootDeviceEnvironments[0]->osInterface.get(), mockDevice->getRootDeviceIndex(), 0,
+                                                EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS1, EngineUsage::Regular},
+                                                                                             PreemptionMode::ThreadGroup, mockDevice->getDeviceBitfield())));
+    auto csr = std::make_unique(*mockDevice->getExecutionEnvironment(), mockDevice->getRootDeviceIndex(), mockDevice->getDeviceBitfield());
+    csr->setupContext(*osContext);
+    auto scratchController = new ScratchSpaceControllerBase(pDevice->getRootDeviceIndex(), *pDevice->executionEnvironment, *csr->getInternalAllocationStorage());
+    csr->scratchSpaceController.reset(scratchController);
+    MockCmdQueueOverrideCsr cmdQ(pContext, mockDevice.get(), csr.get());
+    MockKernelWithInternals mockKernelWithInternals(*mockDevice.get());
+    auto pKernel = mockKernelWithInternals.mockKernel;
+    MockMultiDispatchInfo multiDispatchInfo(mockDevice.get(), pKernel);
+    BlitPropertiesContainer blitPropertiesContainer;
+    const EnqueueProperties enqueueProperties(false, true, false, false, false, &blitPropertiesContainer);
+    TimestampPacketDependencies timestampPacketDependencies;
+    EventsRequest eventsRequest(0, nullptr, nullptr);
+    EventBuilder eventBuilder;
+    LinearStream commandStream;
+
+    bool blocking = false;
+    const_cast(pKernel->getDescriptor()).kernelAttributes.flags.requiresDisabledEUFusion = false; // assumes the mock kernel's systolic flag is also off by default — TODO confirm
+    cmdQ.template enqueueNonBlocked(nullptr, 0, commandStream, commandStream.getUsed(), blocking, true, multiDispatchInfo, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, nullptr);
+    EXPECT_FALSE(csr->disableEuFusionPassed);
+}
\ No newline at end of file diff --git a/opencl/test/unit_test/xe_hpg_core/dg2/test_cmds_programming_dg2.cpp b/opencl/test/unit_test/xe_hpg_core/dg2/test_cmds_programming_dg2.cpp index 631156fca5..b9bf21d0bb 100644 --- a/opencl/test/unit_test/xe_hpg_core/dg2/test_cmds_programming_dg2.cpp +++ b/opencl/test/unit_test/xe_hpg_core/dg2/test_cmds_programming_dg2.cpp @@ -18,6 +18,7 @@ #include "shared/test/common/test_macros/test.h" #include "opencl/source/mem_obj/buffer.h" +#include "opencl/test/unit_test/command_queue/command_queue_fixture.h" #include "opencl/test/unit_test/fixtures/ult_command_stream_receiver_fixture.h" #include "opencl/test/unit_test/mocks/mock_cl_device.h" #include "opencl/test/unit_test/mocks/mock_context.h" diff --git a/opencl/test/unit_test/xe_hpg_core/local_work_size_tests_dg2.cpp b/opencl/test/unit_test/xe_hpg_core/local_work_size_tests_dg2.cpp new file mode 100644
index 0000000000..dc90358a91 --- /dev/null +++ b/opencl/test/unit_test/xe_hpg_core/local_work_size_tests_dg2.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/gfx_core_helper.h" +#include "shared/source/helpers/local_work_size.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/mocks/mock_device.h" +#include "shared/test/common/mocks/mock_execution_environment.h" +#include "shared/test/common/test_macros/hw_test.h" + +#include "opencl/source/command_queue/cl_local_work_size.h" +#include "opencl/source/helpers/dispatch_info.h" +#include "opencl/test/unit_test/mocks/mock_cl_device.h" +#include "opencl/test/unit_test/mocks/mock_kernel.h" + +using namespace NEO; + +using LocalWorkSizeTestDG2 = ::testing::Test; + +DG2TEST_F(LocalWorkSizeTestDG2, givenKernelWithDpasAndSlmWhenWorkSizeInfoCalculatedThenMinWGSizeIsLessThanForKernelWithoutDpas) { + MockClDevice device{new MockDevice}; + MockKernelWithInternals kernel(device); + DispatchInfo dispatchInfo; + dispatchInfo.setClDevice(&device); + dispatchInfo.setKernel(kernel.mockKernel); + + auto threadsPerEu = defaultHwInfo->gtSystemInfo.ThreadCount / defaultHwInfo->gtSystemInfo.EUCount; + auto euPerSubSlice = defaultHwInfo->gtSystemInfo.ThreadCount / defaultHwInfo->gtSystemInfo.MaxEuPerSubSlice; + + auto &deviceInfo = device.sharedDeviceInfo; + deviceInfo.maxNumEUsPerSubSlice = euPerSubSlice; + deviceInfo.numThreadsPerEU = threadsPerEu; + kernel.mockKernel->slmTotalSize = 0x100; + + const_cast(kernel.mockKernel->getDescriptor()).kernelAttributes.flags.usesSystolicPipelineSelectMode = true; + WorkSizeInfo workSizeInfoWithDpas = createWorkSizeInfoFromDispatchInfo(dispatchInfo); + + const_cast(kernel.mockKernel->getDescriptor()).kernelAttributes.flags.usesSystolicPipelineSelectMode = false; + WorkSizeInfo workSizeInfoWithoutDpas = createWorkSizeInfoFromDispatchInfo(dispatchInfo); + 
EXPECT_NE(workSizeInfoWithDpas.minWorkGroupSize, workSizeInfoWithoutDpas.minWorkGroupSize); +} + +DG2TEST_F(LocalWorkSizeTestDG2, givenKernelWithFusedEuDisabledAndSlmWhenWorkSizeInfoCalculatedThenMinWGSizeIsLessThanForKernelWithoutDpas) { + MockClDevice device{new MockDevice}; + MockKernelWithInternals kernel(device); + DispatchInfo dispatchInfo; + dispatchInfo.setClDevice(&device); + dispatchInfo.setKernel(kernel.mockKernel); + + auto threadsPerEu = defaultHwInfo->gtSystemInfo.ThreadCount / defaultHwInfo->gtSystemInfo.EUCount; + auto euPerSubSlice = defaultHwInfo->gtSystemInfo.ThreadCount / defaultHwInfo->gtSystemInfo.MaxEuPerSubSlice; + + auto &deviceInfo = device.sharedDeviceInfo; + deviceInfo.maxNumEUsPerSubSlice = euPerSubSlice; + deviceInfo.numThreadsPerEU = threadsPerEu; + kernel.mockKernel->slmTotalSize = 0x100; + + const_cast(kernel.mockKernel->getDescriptor()).kernelAttributes.flags.requiresDisabledEUFusion = true; + WorkSizeInfo workSizeInfoWithDpas = createWorkSizeInfoFromDispatchInfo(dispatchInfo); + + const_cast(kernel.mockKernel->getDescriptor()).kernelAttributes.flags.requiresDisabledEUFusion = false; + WorkSizeInfo workSizeInfoWithoutDpas = createWorkSizeInfoFromDispatchInfo(dispatchInfo); + EXPECT_NE(workSizeInfoWithDpas.minWorkGroupSize, workSizeInfoWithoutDpas.minWorkGroupSize); +} \ No newline at end of file diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index 6a6610f743..cb4d7ac24f 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -723,5 +723,4 @@ template bool GfxCoreHelperHw::isRelaxedOrderingSupported() const { return false; } - } // namespace NEO diff --git a/shared/source/os_interface/hw_info_config.h b/shared/source/os_interface/hw_info_config.h index 040bb83069..e6c989d86c 100644 --- a/shared/source/os_interface/hw_info_config.h +++ b/shared/source/os_interface/hw_info_config.h @@ -195,6 +195,7 @@ class 
ProductHelper { virtual uint32_t getDefaultRevisionId() const = 0; virtual bool isMultiContextResourceDeferDeletionSupported() const = 0; + virtual bool isFusedEuDisabledForDpas(bool kernelHasDpasInstructions, const uint32_t *lws, const uint32_t *groupCount) const = 0; virtual ~ProductHelper() = default; @@ -351,6 +352,7 @@ class ProductHelperHw : public ProductHelper { uint32_t getDefaultRevisionId() const override; bool isMultiContextResourceDeferDeletionSupported() const override; + bool isFusedEuDisabledForDpas(bool kernelHasDpasInstructions, const uint32_t *lws, const uint32_t *groupCount) const override; ~ProductHelperHw() override = default; diff --git a/shared/source/os_interface/hw_info_config.inl b/shared/source/os_interface/hw_info_config.inl index 486ce61baa..878c84676e 100644 --- a/shared/source/os_interface/hw_info_config.inl +++ b/shared/source/os_interface/hw_info_config.inl @@ -727,4 +727,8 @@ bool ProductHelperHw::isMultiContextResourceDeferDeletionSupported() return false; } +template +bool ProductHelperHw::isFusedEuDisabledForDpas(bool kernelHasDpasInstructions, const uint32_t *lws, const uint32_t *groupCount) const { + return false; +} } // namespace NEO diff --git a/shared/source/xe_hpg_core/dg2/os_agnostic_hw_info_config_dg2.inl b/shared/source/xe_hpg_core/dg2/os_agnostic_hw_info_config_dg2.inl index 133a25bfba..f9edc4976f 100644 --- a/shared/source/xe_hpg_core/dg2/os_agnostic_hw_info_config_dg2.inl +++ b/shared/source/xe_hpg_core/dg2/os_agnostic_hw_info_config_dg2.inl @@ -235,5 +235,25 @@ template <> std::optional ProductHelperHw::getAubStreamProductFamily() const { return aub_stream::ProductFamily::Dg2; };
+// Decides whether EU fusion has to be disabled for a DG2 kernel dispatch because of DPAS usage.
+// - kernels without DPAS instructions never need it (returns false);
+// - when the local work size or the group count is not known (nullptr) the answer is conservatively true;
+// - when the work-group holds more than one work-item, fusion is disabled iff that item count is odd;
+// - otherwise (total local size of 0 or 1) fusion is disabled iff the group count in X is odd.
+template <>
+bool ProductHelperHw::isFusedEuDisabledForDpas(bool kernelHasDpasInstructions, const uint32_t *lws, const uint32_t *groupCount) const {
+    if (!kernelHasDpasInstructions) {
+        return false;
+    }
+    if (lws == nullptr || groupCount == nullptr) {
+        return true;
+    }
+    // Widen before multiplying: a product formed in 32 bits can wrap to 0 or 1 and wrongly fall through to the group-count rule.
+    const uint64_t lwsCount = static_cast<uint64_t>(lws[0]) * lws[1] * lws[2];
+    if (lwsCount > 1) {
+        return (lwsCount & 1) != 0;
+    }
+    return (groupCount[0] & 1) != 0;
+}
} // namespace NEO diff --git a/shared/source/xe_hpg_core/windows/hw_info_config_dg2.cpp b/shared/source/xe_hpg_core/windows/hw_info_config_dg2.cpp index 017c215241..3e689e0043 100644 --- a/shared/source/xe_hpg_core/windows/hw_info_config_dg2.cpp +++ b/shared/source/xe_hpg_core/windows/hw_info_config_dg2.cpp @@ -43,6 +43,5 @@ template <> bool ProductHelperHw::isMultiContextResourceDeferDeletionSupported() const { return true; } - template class ProductHelperHw; } // namespace NEO diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index f74885ba7f..5f1775902b 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -1459,3 +1459,7 @@ HWTEST_F(ProductHelperCommonTest, givenPatIndexAndAllocationTypeWhenCallOverride patIndex = 3u; EXPECT_EQ(patIndex, gfxCoreHelper.overridePatIndex(allocationType, patIndex)); } +HWTEST_F(ProductHelperCommonTest, givenHwHelperWhenIsFusedEuDisabledForDpasCalledThenFalseReturned) { + auto &gfxCoreHelper = getHelper(); + EXPECT_FALSE(gfxCoreHelper.isFusedEuDisabledForDpas(true, nullptr, nullptr)); +} \ No newline at end of file diff --git a/shared/test/unit_test/xe_hpg_core/dg2/excludes_xe_hpg_core_dg2.cpp b/shared/test/unit_test/xe_hpg_core/dg2/excludes_xe_hpg_core_dg2.cpp index 0b25650a85..cbf82e94b2 100644 --- a/shared/test/unit_test/xe_hpg_core/dg2/excludes_xe_hpg_core_dg2.cpp +++ b/shared/test/unit_test/xe_hpg_core/dg2/excludes_xe_hpg_core_dg2.cpp @@ -33,3 +33,4 @@ HWTEST_EXCLUDE_PRODUCT(XeHpgSbaTest, givenSpecificProductFamilyWhenAppendingSbaT HWTEST_EXCLUDE_PRODUCT(GfxCoreHelperTest, GivenZeroSlmSizeWhenComputeSlmSizeIsCalledThenCorrectValueIsReturned, IGFX_DG2);
HWTEST_EXCLUDE_PRODUCT(ProductHelperTestXeHpgCore, givenProductHelperWhenCheckTimestampWaitSupportForEventsThenReturnFalse, IGFX_DG2); HWTEST_EXCLUDE_PRODUCT(CommandEncodeStatesTestDg2AndLater, GivenVariousSlmTotalSizesAndSettingRevIDToDifferentValuesWhenSetAdditionalInfoIsCalledThenCorrectValuesAreSet_IsXeHpgCore, IGFX_DG2); +HWTEST_EXCLUDE_PRODUCT(ProductHelperCommonTest, givenHwHelperWhenIsFusedEuDisabledForDpasCalledThenFalseReturned, IGFX_DG2); diff --git a/shared/test/unit_test/xe_hpg_core/dg2/product_config_helper_tests_dg2.cpp b/shared/test/unit_test/xe_hpg_core/dg2/product_config_helper_tests_dg2.cpp index 2ff34b3482..77db8a812f 100644 --- a/shared/test/unit_test/xe_hpg_core/dg2/product_config_helper_tests_dg2.cpp +++ b/shared/test/unit_test/xe_hpg_core/dg2/product_config_helper_tests_dg2.cpp @@ -6,7 +6,9 @@ */ #include "shared/source/helpers/product_config_helper.h" +#include "shared/source/os_interface/hw_info_config.h" #include "shared/source/xe_hpg_core/hw_cmds_dg2.h" +#include "shared/test/common/fixtures/device_fixture.h" #include "shared/test/common/test_macros/header/per_product_test_definitions.h" #include "shared/test/common/test_macros/test.h" @@ -14,6 +16,7 @@ using namespace NEO; using ProductConfigHelperDg2Tests = ::testing::Test; +using ProductHelperTests = Test; DG2TEST_F(ProductConfigHelperDg2Tests, givenVariousVariantsOfXeHpgAcronymsWhenGetReleaseThenCorrectValueIsReturned) { std::vector acronymsVariants = {"xe_hpg_core", "xe_hpg", "xehpg", "XeHpg"}; @@ -29,3 +32,55 @@ DG2TEST_F(ProductConfigHelperDg2Tests, givenXeHpgReleaseWhenSearchForDeviceAcron auto aotInfos = productConfigHelper->getDeviceAotInfo(); EXPECT_TRUE(std::any_of(aotInfos.begin(), aotInfos.end(), ProductConfigHelper::findDeviceAcronymForRelease(AOT::XE_HPG_RELEASE))); }
+DG2TEST_F(ProductHelperTests, givenNoDpasInstructionInKernelHelperWhenCheckingIfEuFusionShouldBeDisabledThenFalseReturned) { // without DPAS the dispatch geometry must not matter
+    auto &gfxCoreHelper = getHelper(); // NOTE(review): named gfxCoreHelper but used to call isFusedEuDisabledForDpas, a ProductHelper method — presumably a ProductHelper; confirm
+    const uint32_t lws[3] = {1, 1, 1};
+    const uint32_t groupCount[3] = {5, 3, 1};
+    bool dpasInstruction = false;
+    EXPECT_FALSE(gfxCoreHelper.isFusedEuDisabledForDpas(dpasInstruction, lws, groupCount));
+}
+DG2TEST_F(ProductHelperTests, givenDpasInstructionLwsAndGroupCountIsNullPtrInKernelHelperWhenCheckingIfEuFusionShouldBeDisabledThenTrueReturned) { // DPAS with no geometry at all => true
+    auto &gfxCoreHelper = getHelper();
+    bool dpasInstruction = true;
+    EXPECT_TRUE(gfxCoreHelper.isFusedEuDisabledForDpas(dpasInstruction, nullptr, nullptr));
+}
+DG2TEST_F(ProductHelperTests, givenDpasInstructionLwsIsNullPtrInKernelHelperWhenCheckingIfEuFusionShouldBeDisabledThenTrueReturned) { // DPAS with only the group count known => true
+    auto &gfxCoreHelper = getHelper();
+    bool dpasInstruction = true;
+    const uint32_t groupCount[3] = {5, 3, 1};
+    EXPECT_TRUE(gfxCoreHelper.isFusedEuDisabledForDpas(dpasInstruction, nullptr, groupCount));
+}
+DG2TEST_F(ProductHelperTests, givenDpasInstructionGroupCountIsNullPtrInKernelHelperWhenCheckingIfEuFusionShouldBeDisabledThenTrueReturned) { // DPAS with only the local work size known => true
+    auto &gfxCoreHelper = getHelper();
+    bool dpasInstruction = true;
+    const uint32_t lws[3] = {1, 1, 1};
+    EXPECT_TRUE(gfxCoreHelper.isFusedEuDisabledForDpas(dpasInstruction, lws, nullptr));
+}
+DG2TEST_F(ProductHelperTests, givenDpasInstructionLwsAndLwsIsOddWhenCheckingIfEuFusionShouldBeDisabledThenTrueReturned) {
+    auto &gfxCoreHelper = getHelper();
+    const uint32_t lws[3] = {7, 3, 1}; // 7 * 3 * 1 = 21 work-items: odd
+    const uint32_t groupCount[3] = {2, 1, 1};
+    bool dpasInstruction = true;
+    EXPECT_TRUE(gfxCoreHelper.isFusedEuDisabledForDpas(dpasInstruction, lws, groupCount));
+}
+DG2TEST_F(ProductHelperTests, givenDpasInstructionLwsAndLwsIsNoOddWhenCheckingIfEuFusionShouldBeDisabledThenFalseReturned) {
+    auto &gfxCoreHelper = getHelper();
+    const uint32_t lws[3] = {8, 3, 1}; // 8 * 3 * 1 = 24 work-items: even
+    const uint32_t groupCount[3] = {2, 1, 1};
+    bool dpasInstruction = true;
+    EXPECT_FALSE(gfxCoreHelper.isFusedEuDisabledForDpas(dpasInstruction, lws, groupCount));
+}
+DG2TEST_F(ProductHelperTests, givenDpasInstructionLwsAndLwsIsOneAndXGroupCountIsOddWhenCheckingIfEuFusionShouldBeDisabledThenFalseReturned) { // NOTE(review): name ends in "ThenFalseReturned" but the body asserts EXPECT_TRUE — the name looks wrong
+    auto &gfxCoreHelper = getHelper();
+    const uint32_t lws[3] = {1, 1, 1}; // a single work-item per group, so the X group count is what gets checked
+    const uint32_t groupCount[3] = {5, 1, 1}; // odd group count in X
+    bool dpasInstruction = true;
+    EXPECT_TRUE(gfxCoreHelper.isFusedEuDisabledForDpas(dpasInstruction, lws, groupCount));
+}
+DG2TEST_F(ProductHelperTests, givenDpasInstructionLwsAndLwsIsOneAndXGroupCountIsNoOddWhenCheckingIfEuFusionShouldBeDisabledThenFalseReturned) {
+    auto &gfxCoreHelper = getHelper();
+    const uint32_t lws[3] = {1, 1, 1};
+    const uint32_t groupCount[3] = {4, 1, 1}; // even group count in X
+    bool dpasInstruction = true;
+    EXPECT_FALSE(gfxCoreHelper.isFusedEuDisabledForDpas(dpasInstruction, lws, groupCount));
+}