diff --git a/runtime/command_queue/hardware_interface.h b/runtime/command_queue/hardware_interface.h index bc46e4ff82..956fdf5bf6 100644 --- a/runtime/command_queue/hardware_interface.h +++ b/runtime/command_queue/hardware_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -80,6 +80,21 @@ class HardwareInterface { LinearStream *commandStream, CommandQueue &commandQueue); + static void programWalker( + LinearStream &commandStream, + Kernel &kernel, + CommandQueue &commandQueue, + TimestampPacketContainer *currentTimestampPacketNodes, + IndirectHeap &dsh, + IndirectHeap &ioh, + IndirectHeap &ssh, + size_t localWorkSizes[3], + PreemptionMode preemptionMode, + size_t currentDispatchIndex, + uint32_t &interfaceDescriptorIndex, + const DispatchInfo &dispatchInfo, + size_t offsetInterfaceDescriptorTable); + static WALKER_TYPE *allocateWalkerSpace(LinearStream &commandStream, const Kernel &kernel); }; diff --git a/runtime/command_queue/hardware_interface.inl b/runtime/command_queue/hardware_interface.inl index 021bcb8dec..963e551c11 100644 --- a/runtime/command_queue/hardware_interface.inl +++ b/runtime/command_queue/hardware_interface.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -12,6 +12,14 @@ namespace OCLRT { +template +inline WALKER_TYPE *HardwareInterface::allocateWalkerSpace(LinearStream &commandStream, + const Kernel &kernel) { + auto walkerCmd = static_cast *>(commandStream.getSpace(sizeof(WALKER_TYPE))); + *walkerCmd = GfxFamily::cmdInitGpgpuWalker; + return walkerCmd; +} + template void HardwareInterface::dispatchWalker( CommandQueue &commandQueue, @@ -126,9 +134,6 @@ void HardwareInterface::dispatchWalker( DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3)); DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2)); - // Determine SIMD size - uint32_t simd = kernel.getKernelInfo().getMaxSimdSize(); - // If we don't have a required WGS, compute one opportunistically auto maxWorkGroupSize = static_cast(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize); if (commandType == CL_COMMAND_NDRANGE_KERNEL) { @@ -148,7 +153,6 @@ void HardwareInterface::dispatchWalker( // Compute number of work groups Vec3 twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws); - Vec3 nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs; // Patch our kernel constants *kernel.globalWorkOffsetX = static_cast(offset.x); @@ -183,7 +187,6 @@ void HardwareInterface::dispatchWalker( // Send our indirect object data size_t localWorkSizes[3] = {lws.x, lws.y, lws.z}; - size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z}; dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps, hwPerfCounter, commandStream, commandQueue); @@ -195,47 +198,8 @@ void HardwareInterface::dispatchWalker( GpgpuWalkerHelper::setupTimestampPacket(commandStream, nullptr, timestampPacket, TimestampPacket::WriteOperationType::BeforeWalker); } - // Program the walker. Invokes execution so all state should already be programmed - auto walkerCmd = allocateWalkerSpace(*commandStream, kernel); - - KernelCommandsHelper::programCacheFlushAfterWalkerCommand(commandStream, &kernel); - - if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { - auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex)->tag; - GpgpuWalkerHelper::setupTimestampPacket(commandStream, walkerCmd, timestampPacket, TimestampPacket::WriteOperationType::AfterWalker); - } - - auto idd = obtainInterfaceDescriptorData(walkerCmd); - - bool localIdsGenerationByRuntime = KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes); - bool inlineDataProgrammingRequired = KernelCommandsHelper::inlineDataProgrammingRequired(kernel); - bool kernelUsesLocalIds = KernelCommandsHelper::kernelUsesLocalIds(kernel); - KernelCommandsHelper::sendIndirectState( - *commandStream, - *dsh, - *ioh, - *ssh, - kernel, - simd, - localWorkSizes, - offsetInterfaceDescriptorTable, - interfaceDescriptorIndex, - preemptionMode, - walkerCmd, - idd, - localIdsGenerationByRuntime, - kernelUsesLocalIds, - inlineDataProgrammingRequired); - - size_t globalOffsets[3] = {offset.x, offset.y, offset.z}; - size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z}; - size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z}; - GpgpuWalkerHelper::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups, - numWorkGroups, localWorkSizes, simd, dim, - localIdsGenerationByRuntime, inlineDataProgrammingRequired, - *kernel.getKernelInfo().patchInfo.threadPayload); - - GpgpuWalkerHelper::adjustWalkerData(commandStream, walkerCmd, kernel, dispatchInfo); + programWalker(*commandStream, kernel, commandQueue, currentTimestampPacketNodes, *dsh, *ioh, *ssh, + localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo, offsetInterfaceDescriptorTable); dispatchWorkarounds(commandStream, commandQueue, kernel, false); if (dispatchInfo.isPipeControlRequired()) { @@ -244,6 +208,8 @@ void HardwareInterface::dispatchWalker( *pPipeControlCmd = GfxFamily::cmdInitPipeControl; pPipeControlCmd->setCommandStreamerStallEnable(true); } + KernelCommandsHelper::programCacheFlushAfterWalkerCommand(commandStream, &kernel); + currentDispatchIndex++; } dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); diff --git a/runtime/command_queue/hardware_interface_base.inl b/runtime/command_queue/hardware_interface_base.inl index 33c5645794..c18f215626 100644 --- a/runtime/command_queue/hardware_interface_base.inl +++ b/runtime/command_queue/hardware_interface_base.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -92,11 +92,70 @@ inline void HardwareInterface::dispatchProfilingPerfEndCommands( } template -inline WALKER_TYPE *HardwareInterface::allocateWalkerSpace(LinearStream &commandStream, - const Kernel &kernel) { - auto walkerCmd = static_cast *>(commandStream.getSpace(sizeof(WALKER_TYPE))); - *walkerCmd = GfxFamily::cmdInitGpgpuWalker; - return walkerCmd; +inline void HardwareInterface::programWalker( + LinearStream &commandStream, + Kernel &kernel, + CommandQueue &commandQueue, + TimestampPacketContainer *currentTimestampPacketNodes, + IndirectHeap &dsh, + IndirectHeap &ioh, + IndirectHeap &ssh, + size_t localWorkSizes[3], + PreemptionMode preemptionMode, + size_t currentDispatchIndex, + uint32_t &interfaceDescriptorIndex, + const DispatchInfo &dispatchInfo, + size_t offsetInterfaceDescriptorTable) { + + auto walkerCmd = allocateWalkerSpace(commandStream, kernel); + uint32_t dim = dispatchInfo.getDim(); + Vec3 lws = dispatchInfo.getLocalWorkgroupSize(); + Vec3 gws = dispatchInfo.getGWS(); + Vec3 swgs = dispatchInfo.getStartOfWorkgroups(); + Vec3 twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws); + Vec3 nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs; + size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z}; + + if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { + auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex)->tag; + GpgpuWalkerHelper::setupTimestampPacket(&commandStream, walkerCmd, timestampPacket, TimestampPacket::WriteOperationType::AfterWalker); + } + + auto idd = obtainInterfaceDescriptorData(walkerCmd); + + bool localIdsGenerationByRuntime = KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes); + bool inlineDataProgrammingRequired = KernelCommandsHelper::inlineDataProgrammingRequired(kernel); + bool kernelUsesLocalIds = KernelCommandsHelper::kernelUsesLocalIds(kernel); + uint32_t simd = kernel.getKernelInfo().getMaxSimdSize(); + + Vec3 offset = dispatchInfo.getOffset(); + + KernelCommandsHelper::sendIndirectState( + commandStream, + dsh, + ioh, + ssh, + kernel, + simd, + localWorkSizes, + offsetInterfaceDescriptorTable, + interfaceDescriptorIndex, + preemptionMode, + walkerCmd, + idd, + localIdsGenerationByRuntime, + kernelUsesLocalIds, + inlineDataProgrammingRequired); + + size_t globalOffsets[3] = {offset.x, offset.y, offset.z}; + size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z}; + size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z}; + GpgpuWalkerHelper::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups, + numWorkGroups, localWorkSizes, simd, dim, + localIdsGenerationByRuntime, inlineDataProgrammingRequired, + *kernel.getKernelInfo().patchInfo.threadPayload); + + GpgpuWalkerHelper::adjustWalkerData(&commandStream, walkerCmd, kernel, dispatchInfo); } } // namespace OCLRT diff --git a/unit_tests/command_queue/enqueue_kernel_2_tests.cpp b/unit_tests/command_queue/enqueue_kernel_2_tests.cpp index 17aa9d0dc4..9fc7be9407 100644 --- a/unit_tests/command_queue/enqueue_kernel_2_tests.cpp +++ b/unit_tests/command_queue/enqueue_kernel_2_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -878,7 +878,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueKernelTest, givenCacheFlushAfterWalkerEnabled hwParse.parseCommands(cmdQ.getCS(0), 0); auto itorCmd = find(hwParse.cmdList.begin(), hwParse.cmdList.end()); ASSERT_NE(hwParse.cmdList.end(), itorCmd); - ++itorCmd; + itorCmd = find(itorCmd, hwParse.cmdList.end()); auto pipeControl = genCmdCast(*itorCmd); ASSERT_NE(nullptr, pipeControl); EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());