From bb62343aba9eb03dc29967a00fe3272817d8c1f4 Mon Sep 17 00:00:00 2001 From: "Zdanowicz, Zbigniew" Date: Wed, 3 Oct 2018 15:13:54 +0200 Subject: [PATCH] Add new parameter to thread data dispatching Change-Id: I86710b0cc764156f4c2db9d24ccd1c96b32d7660 --- runtime/command_queue/CMakeLists.txt | 2 + runtime/command_queue/gpgpu_walker.h | 4 +- runtime/command_queue/gpgpu_walker_base.inl | 15 ++- runtime/command_queue/hardware_interface.inl | 11 +- runtime/helpers/kernel_commands.h | 6 +- runtime/helpers/kernel_commands.inl | 21 +++- runtime/os_interface/DebugVariables_base.inl | 6 +- runtime/program/process_gen_binary.cpp | 3 +- .../get_size_required_buffer_tests.cpp | 4 +- .../command_queue/work_group_size_tests.cpp | 2 +- unit_tests/helpers/kernel_commands_tests.cpp | 108 ++++++++++++++++-- unit_tests/program/kernel_data.cpp | 1 + unit_tests/test_files/igdrcl.config | 1 + 13 files changed, 160 insertions(+), 24 deletions(-) diff --git a/runtime/command_queue/CMakeLists.txt b/runtime/command_queue/CMakeLists.txt index 71424d67fc..88f1079132 100644 --- a/runtime/command_queue/CMakeLists.txt +++ b/runtime/command_queue/CMakeLists.txt @@ -34,6 +34,7 @@ set(RUNTIME_SRCS_COMMAND_QUEUE ${CMAKE_CURRENT_SOURCE_DIR}/flush.h ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.h ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl + ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker_base.inl ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.h ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.inl ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_base.inl @@ -46,3 +47,4 @@ set(RUNTIME_SRCS_COMMAND_QUEUE ) target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_COMMAND_QUEUE}) set_property(GLOBAL PROPERTY RUNTIME_SRCS_COMMAND_QUEUE ${RUNTIME_SRCS_COMMAND_QUEUE}) +add_subdirectories() \ No newline at end of file diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h index 67e49e5e3b..85772d4b9b 100644 --- a/runtime/command_queue/gpgpu_walker.h +++ 
b/runtime/command_queue/gpgpu_walker.h @@ -134,7 +134,9 @@ class GpgpuWalkerHelper { const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, - bool localIdsGeneration); + bool localIdsGenerationByRuntime, + bool kernelUsesLocalIds, + bool inlineDataProgrammingRequired); static void dispatchProfilingCommandsStart( HwTimeStamps &hwTimeStamps, diff --git a/runtime/command_queue/gpgpu_walker_base.inl b/runtime/command_queue/gpgpu_walker_base.inl index 0b93bd33e9..e4161f14f5 100644 --- a/runtime/command_queue/gpgpu_walker_base.inl +++ b/runtime/command_queue/gpgpu_walker_base.inl @@ -19,7 +19,9 @@ inline size_t GpgpuWalkerHelper::setGpgpuWalkerThreadData( const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, - bool localIdsGeneration) { + bool localIdsGenerationByRuntime, + bool kernelUsesLocalIds, + bool inlineDataProgrammingRequired) { auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2]; auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize); @@ -127,7 +129,9 @@ void GpgpuWalkerHelper::dispatchScheduler( auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER)); *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker; - bool localIdsGeneration = KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes); + bool localIdsGenerationByRuntime = KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes); + bool inlineDataProgrammingRequired = KernelCommandsHelper::inlineDataProgrammingRequired(scheduler); + bool kernelUsesLocalIds = KernelCommandsHelper::kernelUsesLocalIds(scheduler); KernelCommandsHelper::sendIndirectState( *commandStream, *dsh, @@ -141,14 +145,17 @@ void GpgpuWalkerHelper::dispatchScheduler( preemptionMode, pGpGpuWalkerCmd, nullptr, - localIdsGeneration); + localIdsGenerationByRuntime, + kernelUsesLocalIds, + inlineDataProgrammingRequired); // Implement enabling special WA DisableLSQCROPERFforOCL if 
needed GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true); size_t globalOffsets[3] = {0, 0, 0}; size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1}; - GpgpuWalkerHelper::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration); + GpgpuWalkerHelper::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, + simd, 1, localIdsGenerationByRuntime, kernelUsesLocalIds, inlineDataProgrammingRequired); // Implement disabling special WA DisableLSQCROPERFforOCL if needed GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false); diff --git a/runtime/command_queue/hardware_interface.inl b/runtime/command_queue/hardware_interface.inl index 5da5dd478a..5d08184d7b 100644 --- a/runtime/command_queue/hardware_interface.inl +++ b/runtime/command_queue/hardware_interface.inl @@ -198,7 +198,9 @@ void HardwareInterface::dispatchWalker( auto idd = obtainInterfaceDescriptorData(walkerCmd); - bool localIdsGeneration = KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes); + bool localIdsGenerationByRuntime = KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes); + bool inlineDataProgrammingRequired = KernelCommandsHelper::inlineDataProgrammingRequired(kernel); + bool kernelUsesLocalIds = KernelCommandsHelper::kernelUsesLocalIds(kernel); KernelCommandsHelper::sendIndirectState( *commandStream, *dsh, @@ -212,13 +214,16 @@ void HardwareInterface::dispatchWalker( preemptionMode, walkerCmd, idd, - localIdsGeneration); + localIdsGenerationByRuntime, + kernelUsesLocalIds, + inlineDataProgrammingRequired); size_t globalOffsets[3] = {offset.x, offset.y, offset.z}; size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z}; size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z}; GpgpuWalkerHelper::setGpgpuWalkerThreadData(walkerCmd, 
globalOffsets, startWorkGroups, - numWorkGroups, localWorkSizes, simd, dim, localIdsGeneration); + numWorkGroups, localWorkSizes, simd, dim, + localIdsGenerationByRuntime, kernelUsesLocalIds, inlineDataProgrammingRequired); dispatchWorkarounds(commandStream, commandQueue, kernel, false); currentDispatchIndex++; diff --git a/runtime/helpers/kernel_commands.h b/runtime/helpers/kernel_commands.h index 8b24629352..2238cc97cc 100644 --- a/runtime/helpers/kernel_commands.h +++ b/runtime/helpers/kernel_commands.h @@ -95,7 +95,9 @@ struct KernelCommandsHelper : public PerThreadDataHelper { PreemptionMode preemptionMode, WALKER_TYPE *walkerCmd, INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor, - bool localIdsGeneration); + bool localIdsGenerationByRuntime, + bool kernelUsesLocalIds, + bool inlineDataProgrammingRequired); static size_t getSizeRequiredCS(); static bool isPipeControlWArequired(); @@ -160,5 +162,7 @@ struct KernelCommandsHelper : public PerThreadDataHelper { static bool doBindingTablePrefetch(); static bool isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws); + static bool inlineDataProgrammingRequired(const Kernel &kernel); + static bool kernelUsesLocalIds(const Kernel &kernel); }; } // namespace OCLRT diff --git a/runtime/helpers/kernel_commands.inl b/runtime/helpers/kernel_commands.inl index be3dabb30b..b15761eb40 100644 --- a/runtime/helpers/kernel_commands.inl +++ b/runtime/helpers/kernel_commands.inl @@ -296,7 +296,9 @@ size_t KernelCommandsHelper::sendIndirectState( PreemptionMode preemptionMode, WALKER_TYPE *walkerCmd, INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor, - bool localIdsGeneration) { + bool localIdsGenerationByRuntime, + bool kernelUsesLocalIds, + bool inlineDataProgrammingRequired) { using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE; DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32); @@ -460,4 +462,21 @@ template bool KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(uint32_t workDim, 
size_t *gws, size_t *lws) { return true; } + +template +bool KernelCommandsHelper::inlineDataProgrammingRequired(const Kernel &kernel) { + if (DebugManager.flags.EnablePassInlineData.get()) { + return kernel.getKernelInfo().patchInfo.threadPayload->PassInlineData && + kernel.getCrossThreadDataSize() <= sizeof(GRF); + } + return false; +} + +template +bool KernelCommandsHelper::kernelUsesLocalIds(const Kernel &kernel) { + return (kernel.getKernelInfo().patchInfo.threadPayload->LocalIDXPresent || + kernel.getKernelInfo().patchInfo.threadPayload->LocalIDYPresent || + kernel.getKernelInfo().patchInfo.threadPayload->LocalIDZPresent); +} + } // namespace OCLRT diff --git a/runtime/os_interface/DebugVariables_base.inl b/runtime/os_interface/DebugVariables_base.inl index 6b777f450f..e0a9f59e1f 100644 --- a/runtime/os_interface/DebugVariables_base.inl +++ b/runtime/os_interface/DebugVariables_base.inl @@ -12,9 +12,9 @@ DECLARE_DEBUG_VARIABLE(std::string, ProductFamilyOverride, std::string("unk"), " DECLARE_DEBUG_VARIABLE(std::string, ForceCompilerUsePlatform, std::string("unk"), "Specify product for use in compiler interface") DECLARE_DEBUG_VARIABLE(std::string, AUBDumpCaptureFileName, std::string("unk"), "Name of file to save AUB capture into") DECLARE_DEBUG_VARIABLE(std::string, AUBDumpFilterKernelName, std::string("unk"), "Name of kernel to AUB capture") +DECLARE_DEBUG_VARIABLE(std::string, AUBDumpToggleFileName, std::string("unk"), "Name of file to save AUB in toggle mode") DECLARE_DEBUG_VARIABLE(int32_t, AUBDumpFilterNamedKernelStartIdx, 0, "Start index of named kernel to AUB capture") DECLARE_DEBUG_VARIABLE(int32_t, AUBDumpFilterNamedKernelEndIdx, -1, "End index of named kernel to AUB capture") -DECLARE_DEBUG_VARIABLE(std::string, AUBDumpToggleFileName, std::string("unk"), "Name of file to save AUB in toggle mode") DECLARE_DEBUG_VARIABLE(int32_t, AUBDumpSubCaptureMode, 0, "AUB dump subcapture mode (off, toggle, filter)") DECLARE_DEBUG_VARIABLE(int32_t, 
AUBDumpFilterKernelStartIdx, 0, "Start index of kernel to AUB capture") DECLARE_DEBUG_VARIABLE(int32_t, AUBDumpFilterKernelEndIdx, -1, "End index of kernel to AUB capture") @@ -81,6 +81,8 @@ DECLARE_DEBUG_VARIABLE(bool, EnableForcePin, true, "Enables early pinning for me DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeND, true, "Enables diffrent algorithm to compute local work size") DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, false, "Enables algorithm to compute the most squared work group as possible") DECLARE_DEBUG_VARIABLE(bool, EnableVaLibCalls, true, "Enable cl-va sharing lib calls") +DECLARE_DEBUG_VARIABLE(bool, AddClGlSharing, false, "Add cl-gl extension") +DECLARE_DEBUG_VARIABLE(bool, EnablePassInlineData, false, "Enable passing of inline data") DECLARE_DEBUG_VARIABLE(int32_t, EnableStatelessToStatefulBufferOffsetOpt, -1, "-1: dont override, 0: disable, 1: enable, Enables buffer-offset improvement of the stateless to stateful optimization") DECLARE_DEBUG_VARIABLE(int32_t, CreateMultipleDevices, 0, "0: default - disable, 1+: Driver will create multiple (N) devices during initialization.") DECLARE_DEBUG_VARIABLE(int32_t, Enable64kbpages, -1, "-1: default behaviour, 0 Disables, 1 Enables support for 64KB pages for driver allocated fine grain svm buffers") @@ -92,7 +94,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideEnableQuickKmdSleepForSporadicWaits, -1, DECLARE_DEBUG_VARIABLE(int32_t, OverrideDelayQuickKmdSleepForSporadicWaitsMicroseconds, -1, "-1: dont override, >0: timeout in microseconds") DECLARE_DEBUG_VARIABLE(int32_t, CsrDispatchMode, 0, "Chooses DispatchMode for Csr") DECLARE_DEBUG_VARIABLE(int32_t, OverrideDefaultFP64Settings, -1, "-1: dont override, 0: disable, 1: enable.") -DECLARE_DEBUG_VARIABLE(bool, AddClGlSharing, false, "Add cl-gl extension") + /*DRIVER TOGGLES*/ DECLARE_DEBUG_VARIABLE(int32_t, ForceOCLVersion, 0, "Force specific OpenCL API version") DECLARE_DEBUG_VARIABLE(int32_t, ForcePreemptionMode, -1, "Keep this variable in 
sync with PreemptionMode enum. -1 - devices default mode, 1 - disable, 2 - midBatch, 3 " "- threadGroup, 4 - midThread") diff --git a/runtime/program/process_gen_binary.cpp b/runtime/program/process_gen_binary.cpp index 484429acf6..a27a29cde0 100644 --- a/runtime/program/process_gen_binary.cpp +++ b/runtime/program/process_gen_binary.cpp @@ -489,7 +489,8 @@ cl_int Program::parsePatchList(KernelInfo &kernelInfo) { "\n .GetLocalIDPresent", kernelInfo.patchInfo.threadPayload->GetLocalIDPresent, "\n .GetGroupIDPresent", kernelInfo.patchInfo.threadPayload->GetGroupIDPresent, "\n .GetGlobalOffsetPresent", kernelInfo.patchInfo.threadPayload->GetGlobalOffsetPresent, - "\n .OffsetToSkipPerThreadDataLoad", kernelInfo.patchInfo.threadPayload->OffsetToSkipPerThreadDataLoad); + "\n .OffsetToSkipPerThreadDataLoad", kernelInfo.patchInfo.threadPayload->OffsetToSkipPerThreadDataLoad, + "\n .PassInlineData", kernelInfo.patchInfo.threadPayload->PassInlineData); break; case PATCH_TOKEN_EXECUTION_ENVIRONMENT: diff --git a/unit_tests/command_queue/get_size_required_buffer_tests.cpp b/unit_tests/command_queue/get_size_required_buffer_tests.cpp index e98cd9462f..74d9562f54 100644 --- a/unit_tests/command_queue/get_size_required_buffer_tests.cpp +++ b/unit_tests/command_queue/get_size_required_buffer_tests.cpp @@ -392,7 +392,7 @@ HWTEST_F(GetSizeRequiredBufferTest, enqueueKernelHelloWorld) { auto iohAfter = pIOH->getUsed(); auto sshAfter = pSSH->getUsed(); - auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, nullptr); + auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel); auto expectedSizeDSH = KernelCommandsHelper::getSizeRequiredDSH(*KernelFixture::pKernel); auto expectedSizeIOH = KernelCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, workSize[0]); auto expectedSizeSSH = KernelCommandsHelper::getSizeRequiredSSH(*KernelFixture::pKernel); @@ 
-431,7 +431,7 @@ HWTEST_F(GetSizeRequiredBufferTest, enqueueKernelSimpleArg) { auto iohAfter = pIOH->getUsed(); auto sshAfter = pSSH->getUsed(); - auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, nullptr); + auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel); auto expectedSizeDSH = KernelCommandsHelper::getSizeRequiredDSH(*KernelFixture::pKernel); auto expectedSizeIOH = KernelCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, workSize[0]); auto expectedSizeSSH = KernelCommandsHelper::getSizeRequiredSSH(*KernelFixture::pKernel); diff --git a/unit_tests/command_queue/work_group_size_tests.cpp b/unit_tests/command_queue/work_group_size_tests.cpp index 883ff7e5b6..9d9e42f101 100644 --- a/unit_tests/command_queue/work_group_size_tests.cpp +++ b/unit_tests/command_queue/work_group_size_tests.cpp @@ -85,7 +85,7 @@ struct WorkGroupSizeBase { (workItems[1] + workGroupSize[1] - 1) / workGroupSize[1], (workItems[2] + workGroupSize[2] - 1) / workGroupSize[2]}; GpgpuWalkerHelper::setGpgpuWalkerThreadData(&pCmd, globalOffsets, workGroupsStart, workGroupsNum, - workGroupSize, simdSize, dims, true); + workGroupSize, simdSize, dims, true, false, false); //And check if it is programmed correctly auto numWorkItems = computeWalkerWorkItems(pCmd); diff --git a/unit_tests/helpers/kernel_commands_tests.cpp b/unit_tests/helpers/kernel_commands_tests.cpp index 1110be5b65..3594c55cb0 100644 --- a/unit_tests/helpers/kernel_commands_tests.cpp +++ b/unit_tests/helpers/kernel_commands_tests.cpp @@ -323,7 +323,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage) pDevice->getPreemptionMode(), pWalkerCmd, nullptr, - true); + true, + true, + false); // It's okay these are EXPECT_GE as they're only going to be used for // estimation purposes to avoid OOM. 
@@ -375,7 +377,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableE pDevice->getPreemptionMode(), pWalkerCmd, nullptr, - true); + true, + true, + false); auto interfaceDescriptor = reinterpret_cast(dsh.getCpuBase()); if (KernelCommandsHelper::doBindingTablePrefetch()) { @@ -420,7 +424,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelThatIsSchedulerWhenIn pDevice->getPreemptionMode(), pWalkerCmd, nullptr, - true); + true, + true, + false); auto interfaceDescriptor = reinterpret_cast(dsh.getCpuBase()); EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount()); @@ -459,7 +465,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWith100BindingTableEn pDevice->getPreemptionMode(), pWalkerCmd, nullptr, - true); + true, + true, + false); auto interfaceDescriptor = reinterpret_cast(dsh.getCpuBase()); if (KernelCommandsHelper::doBindingTablePrefetch()) { @@ -531,7 +539,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKern pDevice->getPreemptionMode(), pWalkerCmd, nullptr, - true); + true, + true, + false); + size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ; numThreads = (numThreads + modifiedKernelInfo.getMaxSimdSize() - 1) / modifiedKernelInfo.getMaxSimdSize(); size_t expectedIohSize = ((modifiedKernelInfo.getMaxSimdSize() == 32) ? 
32 : 16) * 3 * numThreads * sizeof(uint16_t); @@ -609,7 +620,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) { pDevice->getPreemptionMode(), pWalkerCmd, nullptr, - true); + true, + true, + false); EXPECT_EQ(0x00000000u, *(&bindingTableStatesPointers[0])); EXPECT_EQ(0x00000040u, *(&bindingTableStatesPointers[1])); @@ -769,7 +782,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointersFor pDevice->getPreemptionMode(), pWalkerCmd, nullptr, - true); + true, + true, + false); bti = reinterpret_cast(reinterpret_cast(ssh.getCpuBase()) + localSshOffset + btiOffset); for (uint32_t i = 0; i < numSurfaces; ++i) { @@ -1009,7 +1024,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndir pDevice->getPreemptionMode(), pWalkerCmd, nullptr, - true); + true, + true, + false); bool isMemorySame = memcmp(borderColorPointer, mockDsh, borderColorSize) == 0; EXPECT_TRUE(isMemorySame); @@ -1166,3 +1183,78 @@ INSTANTIATE_TEST_CASE_P(ParentKernelCommandsFromBinaryTest, ::testing::Combine( ::testing::Values(binaryFile), ::testing::ValuesIn(KernelNames))); + +HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineAndCrossThreadSizeLesserEqualThanGrfThenReturnTrue) { + DebugManagerStateRestore restore; + DebugManager.flags.EnablePassInlineData.set(true); + + uint32_t crossThreadData[8]; + + MockKernelWithInternals mockKernelWithInternal(*pDevice); + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->PassInlineData = 1; + mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData)); + + EXPECT_TRUE(KernelCommandsHelper::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel)); +} + +HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInlineAndCrossThreadSizeLesserEqualThanGrfThenReturnFalse) { + DebugManagerStateRestore restore; + DebugManager.flags.EnablePassInlineData.set(true); + + uint32_t 
crossThreadData[8]; + + MockKernelWithInternals mockKernelWithInternal(*pDevice); + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->PassInlineData = 0; + mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData)); + + EXPECT_FALSE(KernelCommandsHelper::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel)); +} + +HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineAndCrossThreadSizeGreaterThanGrfThenReturnFalse) { + DebugManagerStateRestore restore; + DebugManager.flags.EnablePassInlineData.set(true); + + uint32_t crossThreadData[16]; + + MockKernelWithInternals mockKernelWithInternal(*pDevice); + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->PassInlineData = 1; + mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData)); + + EXPECT_FALSE(KernelCommandsHelper::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel)); +} + +HWTEST_F(KernelCommandsTest, whenLocalIdxInXDimPresentThenExpectLocalIdsInUseIsTrue) { + MockKernelWithInternals mockKernelWithInternal(*pDevice); + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 1; + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0; + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0; + + EXPECT_TRUE(KernelCommandsHelper::kernelUsesLocalIds(*mockKernelWithInternal.mockKernel)); +} + +HWTEST_F(KernelCommandsTest, whenLocalIdxInYDimPresentThenExpectLocalIdsInUseIsTrue) { + MockKernelWithInternals mockKernelWithInternal(*pDevice); + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0; + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 1; + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0; + + 
EXPECT_TRUE(KernelCommandsHelper::kernelUsesLocalIds(*mockKernelWithInternal.mockKernel)); +} + +HWTEST_F(KernelCommandsTest, whenLocalIdxInZDimPresentThenExpectLocalIdsInUseIsTrue) { + MockKernelWithInternals mockKernelWithInternal(*pDevice); + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0; + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0; + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 1; + + EXPECT_TRUE(KernelCommandsHelper::kernelUsesLocalIds(*mockKernelWithInternal.mockKernel)); +} + +HWTEST_F(KernelCommandsTest, whenLocalIdxAreNotPresentThenExpectLocalIdsInUseIsFalse) { + MockKernelWithInternals mockKernelWithInternal(*pDevice); + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0; + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0; + const_cast(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0; + + EXPECT_FALSE(KernelCommandsHelper::kernelUsesLocalIds(*mockKernelWithInternal.mockKernel)); +} diff --git a/unit_tests/program/kernel_data.cpp b/unit_tests/program/kernel_data.cpp index d6ee4c2751..985575a66b 100644 --- a/unit_tests/program/kernel_data.cpp +++ b/unit_tests/program/kernel_data.cpp @@ -239,6 +239,7 @@ TEST_F(KernelDataTest, ThreadPayload) { threadPayload.LocalIDYPresent = true; threadPayload.LocalIDZPresent = true; threadPayload.OffsetToSkipPerThreadDataLoad = true; + threadPayload.PassInlineData = true; pPatchList = &threadPayload; patchListSize = threadPayload.Size; diff --git a/unit_tests/test_files/igdrcl.config b/unit_tests/test_files/igdrcl.config index 44f6e6e54f..4c24a7b0ab 100644 --- a/unit_tests/test_files/igdrcl.config +++ b/unit_tests/test_files/igdrcl.config @@ -86,3 +86,4 @@ EnableTimestampPacket = false ReturnRawGpuTimestamps = 0 DoNotRegisterTrimCallback = false AddClGlSharing = 0 
+EnablePassInlineData = false