diff --git a/Jenkinsfile b/Jenkinsfile index 610f05ef0d..6f8ef7c170 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,5 +1,5 @@ #!groovy neoDependenciesRev='803657-1097' strategy='EQUAL' -allowedCD=274 +allowedCD=273 allowedF=4 diff --git a/runtime/command_queue/CMakeLists.txt b/runtime/command_queue/CMakeLists.txt index 7eb353454f..71424d67fc 100644 --- a/runtime/command_queue/CMakeLists.txt +++ b/runtime/command_queue/CMakeLists.txt @@ -36,6 +36,7 @@ set(RUNTIME_SRCS_COMMAND_QUEUE ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.h ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.inl + ${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_base.inl ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h index 75a812f810..67e49e5e3b 100644 --- a/runtime/command_queue/gpgpu_walker.h +++ b/runtime/command_queue/gpgpu_walker.h @@ -26,8 +26,6 @@ namespace OCLRT { -using WALKER_HANDLE = void *; - template using WALKER_TYPE = typename GfxFamily::WALKER_TYPE; @@ -129,7 +127,7 @@ class GpgpuWalkerHelper { static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel); static size_t setGpgpuWalkerThreadData( - WALKER_HANDLE pCmdData, + WALKER_TYPE *walkerCmd, const size_t globalOffsets[3], const size_t startWorkGroups[3], const size_t numWorkGroups[3], @@ -187,7 +185,7 @@ class GpgpuWalkerHelper { static void setupTimestampPacket( LinearStream *cmdStream, - WALKER_HANDLE walkerHandle, + WALKER_TYPE *walkerCmd, TimestampPacket *timestampPacket, TimestampPacket::WriteOperationType writeOperationType); diff --git a/runtime/command_queue/gpgpu_walker.inl b/runtime/command_queue/gpgpu_walker.inl index dd46a38b04..af0946f663 100644 --- a/runtime/command_queue/gpgpu_walker.inl +++ b/runtime/command_queue/gpgpu_walker.inl @@ -101,7 +101,7 @@ void GpgpuWalkerHelper::addAluReadModifyWriteRegister( template inline size_t GpgpuWalkerHelper::setGpgpuWalkerThreadData( - WALKER_HANDLE pCmdData, + WALKER_TYPE *walkerCmd, const size_t globalOffsets[3], const size_t startWorkGroups[3], const size_t numWorkGroups[3], @@ -109,16 +109,14 @@ inline size_t GpgpuWalkerHelper::setGpgpuWalkerThreadData( uint32_t simd, uint32_t workDim, bool localIdsGeneration) { - WALKER_TYPE *pCmd = static_cast *>(pCmdData); - auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2]; auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize); - pCmd->setThreadWidthCounterMaximum(static_cast(threadsPerWorkGroup)); + walkerCmd->setThreadWidthCounterMaximum(static_cast(threadsPerWorkGroup)); - pCmd->setThreadGroupIdXDimension(static_cast(numWorkGroups[0])); - pCmd->setThreadGroupIdYDimension(static_cast(numWorkGroups[1])); - pCmd->setThreadGroupIdZDimension(static_cast(numWorkGroups[2])); + walkerCmd->setThreadGroupIdXDimension(static_cast(numWorkGroups[0])); + walkerCmd->setThreadGroupIdYDimension(static_cast(numWorkGroups[1])); + walkerCmd->setThreadGroupIdZDimension(static_cast(numWorkGroups[2])); // compute executionMask - to tell which SIMD lines are active within thread auto remainderSimdLanes = localWorkSize & (simd - 1); @@ -128,13 +126,13 @@ inline size_t GpgpuWalkerHelper::setGpgpuWalkerThreadData( using SIMD_SIZE = typename WALKER_TYPE::SIMD_SIZE; - pCmd->setRightExecutionMask(static_cast(executionMask)); - pCmd->setBottomExecutionMask(static_cast(0xffffffff)); - pCmd->setSimdSize(static_cast(simd >> 4)); + walkerCmd->setRightExecutionMask(static_cast(executionMask)); + walkerCmd->setBottomExecutionMask(static_cast(0xffffffff)); + walkerCmd->setSimdSize(static_cast(simd >> 4)); - pCmd->setThreadGroupIdStartingX(static_cast(startWorkGroups[0])); - pCmd->setThreadGroupIdStartingY(static_cast(startWorkGroups[1])); - pCmd->setThreadGroupIdStartingResumeZ(static_cast(startWorkGroups[2])); + walkerCmd->setThreadGroupIdStartingX(static_cast(startWorkGroups[0])); + walkerCmd->setThreadGroupIdStartingY(static_cast(startWorkGroups[1])); + walkerCmd->setThreadGroupIdStartingResumeZ(static_cast(startWorkGroups[2])); return localWorkSize; } @@ -432,7 +430,7 @@ inline void GpgpuWalkerHelper::dispatchOnDeviceWaitlistSemaphores(Lin template void GpgpuWalkerHelper::setupTimestampPacket( LinearStream *cmdStream, - WALKER_HANDLE walkerHandle, + WALKER_TYPE *walkerCmd, TimestampPacket *timestampPacket, TimestampPacket::WriteOperationType writeOperationType) { @@ -523,8 +521,12 @@ void GpgpuWalkerHelper::dispatchScheduler( indirectObjectHeap.getSpace(curbeOffset); ioh = &indirectObjectHeap; - bool localIdsGeneration = KernelCommandsHelper::isDispatchForLocalIdsGeneration(1, globalWorkSizes, localWorkSizes); - auto offsetCrossThreadData = KernelCommandsHelper::sendIndirectState( + // Program the walker. Invokes execution so all state should already be programmed + auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER)); + *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker; + + bool localIdsGeneration = KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes); + KernelCommandsHelper::sendIndirectState( *commandStream, *dsh, *ioh, @@ -535,37 +537,16 @@ void GpgpuWalkerHelper::dispatchScheduler( offsetInterfaceDescriptorTable, interfaceDescriptorIndex, preemptionMode, + pGpGpuWalkerCmd, nullptr, localIdsGeneration); // Implement enabling special WA DisableLSQCROPERFforOCL if needed GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true); - // Program the walker. Invokes execution so all state should already be programmed - auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER)); - *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker; - size_t globalOffsets[3] = {0, 0, 0}; size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1}; - auto localWorkSize = GpgpuWalkerHelper::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration); - - pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData); - DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0); - pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex); - - auto threadPayload = scheduler.getKernelInfo().patchInfo.threadPayload; - DEBUG_BREAK_IF(nullptr == threadPayload); - - auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); - auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels); - localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF)); - - auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread; - DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group - - auto sizeCrossThreadData = scheduler.getCrossThreadDataSize(); - auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); - pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength); + GpgpuWalkerHelper::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration); // Implement disabling special WA DisableLSQCROPERFforOCL if needed GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false); diff --git a/runtime/command_queue/hardware_interface.h b/runtime/command_queue/hardware_interface.h index dcbb707359..7e01db0a9c 100644 --- a/runtime/command_queue/hardware_interface.h +++ b/runtime/command_queue/hardware_interface.h @@ -26,7 +26,8 @@ struct MultiDispatchInfo; template struct TagNode; -using WALKER_HANDLE = void *; +template +using WALKER_TYPE = typename GfxFamily::WALKER_TYPE; template class HardwareInterface { @@ -57,12 +58,7 @@ class HardwareInterface { LinearStream *commandStream); static INTERFACE_DESCRIPTOR_DATA *obtainInterfaceDescriptorData( - WALKER_HANDLE pCmdData); - - static void setOffsetCrossThreadData( - WALKER_HANDLE pCmdData, - size_t &offsetCrossThreadData, - uint32_t &interfaceDescriptorIndex); + WALKER_TYPE *walkerCmd); static void dispatchWorkarounds( LinearStream *commandStream, @@ -83,6 +79,9 @@ class HardwareInterface { HwPerfCounter *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue); + + static WALKER_TYPE *allocateWalkerSpace(LinearStream &commandStream, + const Kernel &kernel); }; } // namespace OCLRT diff --git a/runtime/command_queue/hardware_interface.inl b/runtime/command_queue/hardware_interface.inl index 45f22d1aaa..801af33ad3 100644 --- a/runtime/command_queue/hardware_interface.inl +++ b/runtime/command_queue/hardware_interface.inl @@ -188,18 +188,17 @@ void HardwareInterface::dispatchWalker( } // Program the walker. Invokes execution so all state should already be programmed - auto pWalkerCmd = static_cast *>(commandStream->getSpace(sizeof(WALKER_TYPE))); - *pWalkerCmd = GfxFamily::cmdInitGpgpuWalker; + auto walkerCmd = allocateWalkerSpace(*commandStream, kernel); if (setupTimestampPacket) { - GpgpuWalkerHelper::setupTimestampPacket(commandStream, pWalkerCmd, currentTimestampPacket, + GpgpuWalkerHelper::setupTimestampPacket(commandStream, walkerCmd, currentTimestampPacket, TimestampPacket::WriteOperationType::AfterWalker); } - auto idd = obtainInterfaceDescriptorData(pWalkerCmd); + auto idd = obtainInterfaceDescriptorData(walkerCmd); - bool localIdsGeneration = KernelCommandsHelper::isDispatchForLocalIdsGeneration(dim, globalWorkSizes, localWorkSizes); - auto offsetCrossThreadData = KernelCommandsHelper::sendIndirectState( + bool localIdsGeneration = KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes); + KernelCommandsHelper::sendIndirectState( *commandStream, *dsh, *ioh, @@ -210,35 +209,15 @@ void HardwareInterface::dispatchWalker( offsetInterfaceDescriptorTable, interfaceDescriptorIndex, preemptionMode, + walkerCmd, idd, localIdsGeneration); size_t globalOffsets[3] = {offset.x, offset.y, offset.z}; size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z}; size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z}; - auto localWorkSize = GpgpuWalkerHelper::setGpgpuWalkerThreadData(pWalkerCmd, globalOffsets, startWorkGroups, - numWorkGroups, localWorkSizes, simd, dim, localIdsGeneration); - - DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0); - setOffsetCrossThreadData(pWalkerCmd, offsetCrossThreadData, interfaceDescriptorIndex); - auto sizeCrossThreadData = kernel.getCrossThreadDataSize(); - - size_t sizePerThreadDataTotal = 0; - if (localIdsGeneration) { - auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload; - DEBUG_BREAK_IF(nullptr == threadPayload); - - auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); - auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels); - localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF)); - - sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread; - DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group - } - - auto indirectDataLength = alignUp(static_cast(sizeCrossThreadData + sizePerThreadDataTotal), - WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); - pWalkerCmd->setIndirectDataLength(indirectDataLength); + GpgpuWalkerHelper::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups, + numWorkGroups, localWorkSizes, simd, dim, localIdsGeneration); dispatchWorkarounds(commandStream, commandQueue, kernel, false); currentDispatchIndex++; diff --git a/runtime/command_queue/hardware_interface_base.inl b/runtime/command_queue/hardware_interface_base.inl index e03f39199a..528a5d36a6 100644 --- a/runtime/command_queue/hardware_interface_base.inl +++ b/runtime/command_queue/hardware_interface_base.inl @@ -33,22 +33,10 @@ inline void HardwareInterface::getDefaultDshSpace( template inline typename HardwareInterface::INTERFACE_DESCRIPTOR_DATA * HardwareInterface::obtainInterfaceDescriptorData( - WALKER_HANDLE pCmdData) { - + WALKER_TYPE *walkerCmd) { return nullptr; } -template -inline void HardwareInterface::setOffsetCrossThreadData( - WALKER_HANDLE pCmdData, - size_t &offsetCrossThreadData, - uint32_t &interfaceDescriptorIndex) { - - WALKER_TYPE *pCmd = static_cast *>(pCmdData); - pCmd->setIndirectDataStartAddress(static_cast(offsetCrossThreadData)); - pCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++); -} - template inline void HardwareInterface::dispatchWorkarounds( LinearStream *commandStream, @@ -103,4 +91,12 @@ inline void HardwareInterface::dispatchProfilingPerfEndCommands( } } +template +inline WALKER_TYPE *HardwareInterface::allocateWalkerSpace(LinearStream &commandStream, + const Kernel &kernel) { + auto walkerCmd = static_cast *>(commandStream.getSpace(sizeof(WALKER_TYPE))); + *walkerCmd = GfxFamily::cmdInitGpgpuWalker; + return walkerCmd; +} + } // namespace OCLRT diff --git a/runtime/helpers/basic_math.h b/runtime/helpers/basic_math.h index 40a189bf5d..b3ae5d26c0 100644 --- a/runtime/helpers/basic_math.h +++ b/runtime/helpers/basic_math.h @@ -1,23 +1,8 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (C) 2017-2018 Intel Corporation * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: + * SPDX-License-Identifier: MIT * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. */ #pragma once @@ -164,5 +149,15 @@ inline size_t computeTotalElementsCount(const Vec3 &inputVector) { return xDim * yDim * zDim; } +template +bool isPow2(T val) { + if (val != 0) { + if ((val & (val - 1)) == 0) { + return true; + } + } + return false; +} + } // namespace Math } // namespace OCLRT diff --git a/runtime/helpers/kernel_commands.h b/runtime/helpers/kernel_commands.h index 6b0f8edd4d..8b24629352 100644 --- a/runtime/helpers/kernel_commands.h +++ b/runtime/helpers/kernel_commands.h @@ -24,6 +24,9 @@ class IndirectHeap; struct CrossThreadInfo; struct MultiDispatchInfo; +template +using WALKER_TYPE = typename GfxFamily::WALKER_TYPE; + template struct KernelCommandsHelper : public PerThreadDataHelper { using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE; @@ -88,8 +91,9 @@ struct KernelCommandsHelper : public PerThreadDataHelper { uint32_t simd, const size_t localWorkSize[3], const uint64_t offsetInterfaceDescriptorTable, - const uint32_t interfaceDescriptorIndex, + uint32_t &interfaceDescriptorIndex, PreemptionMode preemptionMode, + WALKER_TYPE *walkerCmd, INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor, bool localIdsGeneration); @@ -155,6 +159,6 @@ struct KernelCommandsHelper : public PerThreadDataHelper { static bool doBindingTablePrefetch(); - static bool isDispatchForLocalIdsGeneration(uint32_t workDim, size_t *gws, size_t *lws); + static bool isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws); }; } // namespace OCLRT diff --git a/runtime/helpers/kernel_commands.inl b/runtime/helpers/kernel_commands.inl index 12f29d0f43..be3dabb30b 100644 --- a/runtime/helpers/kernel_commands.inl +++ b/runtime/helpers/kernel_commands.inl @@ -292,8 +292,9 @@ size_t KernelCommandsHelper::sendIndirectState( uint32_t simd, const size_t localWorkSize[3], const uint64_t offsetInterfaceDescriptorTable, - const uint32_t interfaceDescriptorIndex, + uint32_t &interfaceDescriptorIndex, PreemptionMode preemptionMode, + WALKER_TYPE *walkerCmd, INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor, bool localIdsGeneration) { using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE; @@ -340,22 +341,29 @@ size_t KernelCommandsHelper::sendIndirectState( ptrOffset(kernel.getDynamicStateHeap(), patchInfo.samplerStateArray->Offset), sizeSamplerState); - auto pSmplr = (SAMPLER_STATE *)(samplerState); + auto pSmplr = reinterpret_cast(samplerState); for (uint32_t i = 0; i < samplerCount; i++) { pSmplr->setIndirectStatePointer((uint32_t)borderColorOffset); pSmplr++; } } + auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload; + DEBUG_BREAK_IF(nullptr == threadPayload); + + auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2]; + auto threadsPerThreadGroup = static_cast(getThreadsPerWG(simd, localWorkItems)); + auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); + // Send thread data + auto sizeCrossThreadData = kernel.getCrossThreadDataSize(); auto offsetCrossThreadData = sendCrossThreadData( ioh, kernel); - auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload; - DEBUG_BREAK_IF(nullptr == threadPayload); + size_t sizePerThreadDataTotal = 0; + size_t sizePerThreadData = 0; - auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); sendPerThreadData( ioh, simd, @@ -364,10 +372,13 @@ size_t KernelCommandsHelper::sendIndirectState( kernel.getKernelInfo().workgroupDimensionsOrder, kernel.usesOnlyImages()); - // send interface descriptor data - auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2]; - auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels); - auto threadsPerThreadGroup = static_cast(getThreadsPerWG(simd, localWorkItems)); + sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels); + + auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels); + localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF)); + + sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * localIdSizePerThread; + DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA); @@ -382,7 +393,7 @@ size_t KernelCommandsHelper::sendIndirectState( dsh, offsetInterfaceDescriptor, kernelStartOffset, - kernel.getCrossThreadDataSize(), + sizeCrossThreadData, sizePerThreadData, dstBindingTablePointer, samplerStateOffset, @@ -404,6 +415,14 @@ size_t KernelCommandsHelper::sendIndirectState( commandStream, interfaceDescriptorIndex); + DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0); + walkerCmd->setIndirectDataStartAddress(static_cast(offsetCrossThreadData)); + walkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++); + + auto indirectDataLength = alignUp(static_cast(sizeCrossThreadData + sizePerThreadDataTotal), + WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); + walkerCmd->setIndirectDataLength(indirectDataLength); + return offsetCrossThreadData; } @@ -438,7 +457,7 @@ bool KernelCommandsHelper::doBindingTablePrefetch() { } template -bool KernelCommandsHelper::isDispatchForLocalIdsGeneration(uint32_t workDim, size_t *gws, size_t *lws) { +bool KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws) { return true; } } // namespace OCLRT diff --git a/unit_tests/helpers/basic_math_tests.cpp b/unit_tests/helpers/basic_math_tests.cpp index eef5c1fff0..ee7413f053 100644 --- a/unit_tests/helpers/basic_math_tests.cpp +++ b/unit_tests/helpers/basic_math_tests.cpp @@ -1,23 +1,8 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (C) 2017-2018 Intel Corporation * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: + * SPDX-License-Identifier: MIT * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. */ #include "runtime/helpers/basic_math.h" @@ -188,3 +173,26 @@ TEST_P(ComputeTotalElementsCount, givenVariousInputVectorsWhenComputeTotalElemen Vec3 inputData(GetParam().x, GetParam().y, GetParam().z); EXPECT_EQ(GetParam().result, computeTotalElementsCount(inputData)); } + +TEST(isPow2Test, WhenArgZeroThenReturnFalse) { + bool ret = isPow2(0u); + EXPECT_FALSE(ret); +} + +TEST(isPow2Test, WhenArgNonPow2ThenReturnFalse) { + bool ret = true; + uint32_t args[5] = {3, 5, 6, 7, 10}; + for (uint32_t i = 0; i < 5; i++) { + ret = isPow2(args[i]); + EXPECT_FALSE(ret); + } +} + +TEST(isPow2Test, WhenArgPow2ThenReturnTrue) { + bool ret = false; + size_t args[5] = {1, 4, 8, 128, 4096}; + for (uint32_t i = 0; i < 5; i++) { + ret = isPow2(args[i]); + EXPECT_TRUE(ret); + } +} diff --git a/unit_tests/helpers/kernel_commands_tests.cpp b/unit_tests/helpers/kernel_commands_tests.cpp index a048265f8b..1110be5b65 100644 --- a/unit_tests/helpers/kernel_commands_tests.cpp +++ b/unit_tests/helpers/kernel_commands_tests.cpp @@ -259,7 +259,8 @@ HWTEST_F(KernelCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComment } HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage) { - typedef typename FamilyType::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA; + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; CommandQueueHw cmdQ(pContext, pDevice, 0); @@ -289,6 +290,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage) const size_t localWorkSizes[3]{localWorkSize, 1, 1}; auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); + *pWalkerCmd = FamilyType::cmdInitGpgpuWalker; + auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192); auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192); auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); @@ -305,7 +309,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage) commandStream, IDToffset, sizeof(INTERFACE_DESCRIPTOR_DATA)); - + uint32_t interfaceDescriptorIndex = 0; KernelCommandsHelper::sendIndirectState( commandStream, dsh, @@ -315,8 +319,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage) kernel->getKernelInfo().getMaxSimdSize(), localWorkSizes, IDToffset, - 0, + interfaceDescriptorIndex, pDevice->getPreemptionMode(), + pWalkerCmd, nullptr, true); @@ -339,9 +344,12 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage) HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableEntriesWhenIndirectStateIsEmittedThenInterfaceDescriptorContainsCorrectBindingTableEntryCount) { using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; CommandQueueHw cmdQ(pContext, pDevice, 0); auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); + *pWalkerCmd = FamilyType::cmdInitGpgpuWalker; MockKernelWithInternals mockKernel(*pDevice, pContext); @@ -353,7 +361,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableE auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); const size_t localWorkSize = 256; const size_t localWorkSizes[3]{localWorkSize, 1, 1}; - + uint32_t interfaceDescriptorIndex = 0; KernelCommandsHelper::sendIndirectState( commandStream, dsh, @@ -363,8 +371,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableE mockKernel.mockKernel->getKernelInfo().getMaxSimdSize(), localWorkSizes, 0, - 0, + interfaceDescriptorIndex, pDevice->getPreemptionMode(), + pWalkerCmd, nullptr, true); @@ -378,9 +387,12 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableE HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelThatIsSchedulerWhenIndirectStateIsEmittedThenInterfaceDescriptorContainsZeroBindingTableEntryCount) { using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; CommandQueueHw cmdQ(pContext, pDevice, 0); auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); + *pWalkerCmd = FamilyType::cmdInitGpgpuWalker; MockKernelWithInternals mockKernel(*pDevice, pContext); @@ -394,7 +406,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelThatIsSchedulerWhenIn auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); const size_t localWorkSize = 256; const size_t localWorkSizes[3]{localWorkSize, 1, 1}; - + uint32_t interfaceDescriptorIndex = 0; KernelCommandsHelper::sendIndirectState( commandStream, dsh, @@ -404,8 +416,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelThatIsSchedulerWhenIn mockKernel.mockKernel->getKernelInfo().getMaxSimdSize(), localWorkSizes, 0, - 0, + interfaceDescriptorIndex, pDevice->getPreemptionMode(), + pWalkerCmd, nullptr, true); @@ -415,9 +428,12 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelThatIsSchedulerWhenIn HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWith100BindingTableEntriesWhenIndirectStateIsEmittedThenInterfaceDescriptorHas31BindingTableEntriesSet) { using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; CommandQueueHw cmdQ(pContext, pDevice, 0); auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); + *pWalkerCmd = FamilyType::cmdInitGpgpuWalker; MockKernelWithInternals mockKernel(*pDevice, pContext); @@ -429,7 +445,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWith100BindingTableEn auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); const size_t localWorkSize = 256; const size_t localWorkSizes[3]{localWorkSize, 1, 1}; - + uint32_t interfaceDescriptorIndex = 0; KernelCommandsHelper::sendIndirectState( commandStream, dsh, @@ -439,8 +455,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWith100BindingTableEn mockKernel.mockKernel->getKernelInfo().getMaxSimdSize(), localWorkSizes, 0, - 0, + interfaceDescriptorIndex, pDevice->getPreemptionMode(), + pWalkerCmd, nullptr, true); @@ -454,6 +471,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWith100BindingTableEn HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKernelsWalkOrderIsTakenIntoAccount) { using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; CommandQueueHw cmdQ(pContext, pDevice, 0); @@ -479,6 +497,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKern const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ}; auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); + *pWalkerCmd = FamilyType::cmdInitGpgpuWalker; + auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192); auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192); auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); @@ -496,6 +517,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKern modifiedKernelInfo.workgroupDimensionsOrder[1] = 1; modifiedKernelInfo.workgroupDimensionsOrder[2] = 0; MockKernel mockKernel{kernel->getProgram(), modifiedKernelInfo, kernel->getDevice(), false}; + uint32_t interfaceDescriptorIndex = 0; KernelCommandsHelper::sendIndirectState( commandStream, dsh, @@ -505,8 +527,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKern modifiedKernelInfo.getMaxSimdSize(), localWorkSizes, IDToffset, - 0, + interfaceDescriptorIndex, pDevice->getPreemptionMode(), + pWalkerCmd, nullptr, true); size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ; @@ -524,6 +547,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKern HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) { typedef typename FamilyType::BINDING_TABLE_STATE BINDING_TABLE_STATE; typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; CommandQueueHw cmdQ(pContext, pDevice, 0); std::unique_ptr dstImage(Image2dHelper<>::create(pContext)); @@ -550,6 +574,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) { const size_t localWorkSizes[3]{256, 1, 1}; auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); + *pWalkerCmd = FamilyType::cmdInitGpgpuWalker; + auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192); auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192); auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); @@ -568,7 +595,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) { // force statefull path for buffers const_cast(kernelInfo).requiresSshForBuffers = true; - + uint32_t interfaceDescriptorIndex = 0; KernelCommandsHelper::sendIndirectState( commandStream, dsh, @@ -578,8 +605,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) { kernel->getKernelInfo().getMaxSimdSize(), localWorkSizes, 0, - 0, + interfaceDescriptorIndex, pDevice->getPreemptionMode(), + pWalkerCmd, nullptr, true); @@ -589,6 +617,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) { HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointersForGlobalAndConstantAndPrivateAndEventPoolAndDefaultCommandQueueSurfaces) { using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; // define kernel info auto pKernelInfo = std::make_unique(); @@ -707,6 +736,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointersFor CommandQueueHw cmdQ(nullptr, pDevice, 0); auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); + *pWalkerCmd = FamilyType::cmdInitGpgpuWalker; + auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192); auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192); auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); @@ -723,6 +755,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointersFor uint32_t localSshOffset = static_cast(ssh.getUsed()); // push surfaces states and binding table to given ssh heap + uint32_t interfaceDescriptorIndex = 0; KernelCommandsHelper::sendIndirectState( commandStream, dsh, @@ -732,8 +765,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointersFor pKernel->getKernelInfo().getMaxSimdSize(), localWorkSizes, 0, - 0, + interfaceDescriptorIndex, pDevice->getPreemptionMode(), + pWalkerCmd, nullptr, true); @@ -912,12 +946,16 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndir typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; typedef typename FamilyType::SAMPLER_STATE SAMPLER_STATE; using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; CommandQueueHw cmdQ(nullptr, pDevice, 0); MockKernelWithInternals kernelInternals(*pDevice); const size_t localWorkSizes[3]{1, 1, 1}; auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); + *pWalkerCmd = FamilyType::cmdInitGpgpuWalker; + auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192); auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192); auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); @@ -957,7 +995,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndir MockKernel *kernel = new MockKernel(kernelInternals.mockProgram, kernelInternals.kernelInfo, *pDevice); kernel->setCrossThreadData(kernelInternals.crossThreadData, sizeof(kernelInternals.crossThreadData)); kernel->setSshLocal(kernelInternals.sshLocal, sizeof(kernelInternals.sshLocal)); - + uint32_t interfaceDescriptorIndex = 0; KernelCommandsHelper::sendIndirectState( commandStream, dsh, @@ -967,8 +1005,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndir 8, localWorkSizes, interfaceDescriptorTableOffset, - 0, + interfaceDescriptorIndex, pDevice->getPreemptionMode(), + pWalkerCmd, nullptr, true);