From 4b1871bf0ef7c1ebcff6b8f11195f8c9bf247fc7 Mon Sep 17 00:00:00 2001 From: Kamil Diedrich Date: Mon, 17 Dec 2018 15:23:35 +0100 Subject: [PATCH] Add pipe control before and after buffer translation Change-Id: I4ee32c410e1ac2bcdb3ceae203cd461de79146a5 --- runtime/command_queue/command_queue.cpp | 5 +++ runtime/command_queue/command_queue.h | 2 ++ runtime/command_queue/enqueue_common.h | 8 +++-- runtime/command_queue/gpgpu_walker.inl | 6 ++++ runtime/command_queue/hardware_interface.inl | 19 ++++++++-- runtime/helpers/dispatch_info.h | 27 ++++++++++++++ runtime/kernel/kernel.h | 2 +- runtime/utilities/stackvec.h | 18 ++++++++++ .../command_queue/dispatch_walker_tests.cpp | 35 +++++++++++++++++++ .../command_queue/enqueue_kernel_2_tests.cpp | 35 +++++++++++-------- unit_tests/helpers/dispatch_info_tests.cpp | 17 +++++++++ unit_tests/utilities/containers_tests.cpp | 11 ++++++ 12 files changed, 165 insertions(+), 20 deletions(-) diff --git a/runtime/command_queue/command_queue.cpp b/runtime/command_queue/command_queue.cpp index c122715140..ce7eab9e82 100644 --- a/runtime/command_queue/command_queue.cpp +++ b/runtime/command_queue/command_queue.cpp @@ -557,6 +557,9 @@ void CommandQueue::releaseIndirectHeap(IndirectHeap::Type heapType) { void CommandQueue::dispatchAuxTranslation(MultiDispatchInfo &multiDispatchInfo, MemObjsForAuxTranslation &memObjsForAuxTranslation, AuxTranslationDirection auxTranslationDirection) { + if (!multiDispatchInfo.empty()) { + multiDispatchInfo.rbegin()->setPipeControlRequired(true); + } auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getContext(), getDevice()); BuiltinDispatchInfoBuilder::BuiltinOpParams dispatchParams; @@ -564,6 +567,8 @@ void CommandQueue::dispatchAuxTranslation(MultiDispatchInfo &multiDispatchInfo, dispatchParams.auxTranslationDirection = auxTranslationDirection; builder.buildDispatchInfos(multiDispatchInfo, dispatchParams); + + 
multiDispatchInfo.rbegin()->setPipeControlRequired(true); } void CommandQueue::obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes) { diff --git a/runtime/command_queue/command_queue.h b/runtime/command_queue/command_queue.h index 9e3dc5cf28..9359e555a6 100644 --- a/runtime/command_queue/command_queue.h +++ b/runtime/command_queue/command_queue.h @@ -9,6 +9,7 @@ #include "runtime/helpers/base_object.h" #include "runtime/helpers/engine_control.h" #include "runtime/helpers/task_information.h" +#include "runtime/helpers/dispatch_info.h" #include "instrumentation.h" #include #include @@ -27,6 +28,7 @@ class Kernel; class MemObj; class PerformanceCounters; struct CompletionStamp; +struct MultiDispatchInfo; enum class QueuePriority { LOW, diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 3424e65346..e211340414 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -63,7 +63,9 @@ void CommandQueueHw::enqueueHandler(Surface *(&surfaces)[surfaceCount auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getContext(), getDevice()); builtInLock.takeOwnership(builder, this->context); kernel->fillWithBuffersForAuxTranslation(memObjsForAuxTranslation); - dispatchAuxTranslation(multiDispatchInfo, memObjsForAuxTranslation, AuxTranslationDirection::AuxToNonAux); + if (!memObjsForAuxTranslation.empty()) { + dispatchAuxTranslation(multiDispatchInfo, memObjsForAuxTranslation, AuxTranslationDirection::AuxToNonAux); + } } if (kernel->getKernelInfo().builtinDispatchBuilder == nullptr) { @@ -85,7 +87,9 @@ void CommandQueueHw::enqueueHandler(Surface *(&surfaces)[surfaceCount buffer->getGraphicsAllocation()->setAllocationType(GraphicsAllocation::AllocationType::BUFFER); } } else { - dispatchAuxTranslation(multiDispatchInfo, memObjsForAuxTranslation, 
AuxTranslationDirection::NonAuxToAux); + if (!memObjsForAuxTranslation.empty()) { + dispatchAuxTranslation(multiDispatchInfo, memObjsForAuxTranslation, AuxTranslationDirection::NonAuxToAux); // post-kernel pass: NonAuxToAux, same direction as the line this replaces + } } } } diff --git a/runtime/command_queue/gpgpu_walker.inl b/runtime/command_queue/gpgpu_walker.inl index 0289a22a62..c16deec6e2 100644 --- a/runtime/command_queue/gpgpu_walker.inl +++ b/runtime/command_queue/gpgpu_walker.inl @@ -387,8 +387,14 @@ template size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, cl_uint numEventsInWaitList, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) { size_t expectedSizeCS = 0; Kernel *parentKernel = multiDispatchInfo.peekParentKernel(); + if (multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->isAuxTranslationRequired()) { + expectedSizeCS += sizeof(PIPE_CONTROL); + } for (auto &dispatchInfo : multiDispatchInfo) { expectedSizeCS += EnqueueOperation::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel()); + if (dispatchInfo.isPipeControlRequired()) { + expectedSizeCS += sizeof(PIPE_CONTROL); + } } if (parentKernel) { SchedulerKernel &scheduler = commandQueue.getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(parentKernel->getContext()); diff --git a/runtime/command_queue/hardware_interface.inl b/runtime/command_queue/hardware_interface.inl index 196b9b15ee..021bcb8dec 100644 --- a/runtime/command_queue/hardware_interface.inl +++ b/runtime/command_queue/hardware_interface.inl @@ -30,6 +30,7 @@ void HardwareInterface::dispatchWalker( LinearStream *commandStream = nullptr; IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr; auto parentKernel = multiDispatchInfo.peekParentKernel(); + auto mainKernel = multiDispatchInfo.peekMainKernel(); for (auto &dispatchInfo : multiDispatchInfo) { // Compute local workgroup sizes @@ -109,10 +110,16 @@ 
void HardwareInterface::dispatchWalker( DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0); + if (mainKernel && mainKernel->isAuxTranslationRequired()) { // mainKernel may be null; same guard as getTotalSizeRequiredCS + using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; + auto pPipeControlCmd = static_cast(commandStream->getSpace(sizeof(PIPE_CONTROL))); + *pPipeControlCmd = GfxFamily::cmdInitPipeControl; + pPipeControlCmd->setCommandStreamerStallEnable(true); + } + size_t currentDispatchIndex = 0; for (auto &dispatchInfo : multiDispatchInfo) { auto &kernel = *dispatchInfo.getKernel(); - DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3)); DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3)); DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2)); @@ -152,7 +159,7 @@ void HardwareInterface::dispatchWalker( *kernel.globalWorkSizeY = static_cast(gws.y); *kernel.globalWorkSizeZ = static_cast(gws.z); - if ((&kernel == multiDispatchInfo.peekMainKernel()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) { + if ((&kernel == mainKernel) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) { *kernel.localWorkSizeX = static_cast(lws.x); *kernel.localWorkSizeY = static_cast(lws.y); *kernel.localWorkSizeZ = static_cast(lws.z); @@ -166,7 +173,7 @@ void HardwareInterface::dispatchWalker( *kernel.enqueuedLocalWorkSizeY = static_cast(elws.y); *kernel.enqueuedLocalWorkSizeZ = static_cast(elws.z); - if (&kernel == multiDispatchInfo.peekMainKernel()) { + if (&kernel == mainKernel) { *kernel.numWorkGroupsX = static_cast(twgs.x); *kernel.numWorkGroupsY = static_cast(twgs.y); *kernel.numWorkGroupsZ = static_cast(twgs.z); @@ -231,6 +238,12 @@ void HardwareInterface::dispatchWalker( GpgpuWalkerHelper::adjustWalkerData(commandStream, walkerCmd, kernel, dispatchInfo); dispatchWorkarounds(commandStream, commandQueue, kernel, false); + if (dispatchInfo.isPipeControlRequired()) { + using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; + auto pPipeControlCmd = 
static_cast(commandStream->getSpace(sizeof(PIPE_CONTROL))); + *pPipeControlCmd = GfxFamily::cmdInitPipeControl; + pPipeControlCmd->setCommandStreamerStallEnable(true); + } currentDispatchIndex++; } dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); diff --git a/runtime/helpers/dispatch_info.h b/runtime/helpers/dispatch_info.h index 15e5f14225..9bc6abdbf6 100644 --- a/runtime/helpers/dispatch_info.h +++ b/runtime/helpers/dispatch_info.h @@ -25,6 +25,8 @@ class DispatchInfo { : kernel(k), dim(d), gws(gws), elws(elws), offset(offset), agws(0, 0, 0), lws(0, 0, 0), twgs(0, 0, 0), nwgs(0, 0, 0), swgs(0, 0, 0) {} DispatchInfo(Kernel *k, uint32_t d, Vec3 gws, Vec3 elws, Vec3 offset, Vec3 agws, Vec3 lws, Vec3 twgs, Vec3 nwgs, Vec3 swgs) : kernel(k), dim(d), gws(gws), elws(elws), offset(offset), agws(agws), lws(lws), twgs(twgs), nwgs(nwgs), swgs(swgs) {} + bool isPipeControlRequired() const { return pipeControlRequired; } + void setPipeControlRequired(bool blocking) { this->pipeControlRequired = blocking; } bool usesSlm() const; bool usesStatelessPrintfSurface() const; uint32_t getRequiredScratchSize() const; @@ -50,6 +52,7 @@ class DispatchInfo { void setStartOfWorkgroups(const Vec3 &swgs) { this->swgs = swgs; } protected: + bool pipeControlRequired = false; Kernel *kernel = nullptr; uint32_t dim = 0; @@ -106,14 +109,38 @@ struct MultiDispatchInfo { return ret; } + DispatchInfo *begin() { + return dispatchInfos.begin(); + } + const DispatchInfo *begin() const { return dispatchInfos.begin(); } + std::reverse_iterator rbegin() { + return dispatchInfos.rbegin(); + } + + std::reverse_iterator crbegin() const { + return dispatchInfos.crbegin(); + } + + DispatchInfo *end() { + return dispatchInfos.end(); + } + const DispatchInfo *end() const { return dispatchInfos.end(); } + std::reverse_iterator rend() { + return dispatchInfos.rend(); + } + + std::reverse_iterator crend() const { + return dispatchInfos.crend(); + } + void push(const 
DispatchInfo &dispatchInfo) { dispatchInfos.push_back(dispatchInfo); } diff --git a/runtime/kernel/kernel.h b/runtime/kernel/kernel.h index 11c22653ae..ab7144ad0f 100644 --- a/runtime/kernel/kernel.h +++ b/runtime/kernel/kernel.h @@ -372,7 +372,7 @@ class Kernel : public BaseObject<_cl_kernel> { return usingImagesOnly; } - void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &buffersForAuxTranslation); + void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation); bool requiresCacheFlushCommand() const; diff --git a/runtime/utilities/stackvec.h b/runtime/utilities/stackvec.h index 8a60dc39b5..c778357180 100644 --- a/runtime/utilities/stackvec.h +++ b/runtime/utilities/stackvec.h @@ -18,6 +18,8 @@ class StackVec { public: using iterator = DataType *; using const_iterator = const DataType *; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; static const size_t onStackCaps = OnStackCapacity; @@ -181,6 +183,14 @@ class StackVec { return onStackMem; } + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + + const_reverse_iterator crbegin() const { + return const_reverse_iterator(end()); + } + const_iterator begin() const { if (dynamicMem) { return dynamicMem->data(); @@ -197,6 +207,14 @@ class StackVec { return onStackMem + onStackSize; } + reverse_iterator rend() { + return reverse_iterator(begin()); + } + + const_reverse_iterator crend() const { + return const_reverse_iterator(begin()); + } + const_iterator end() const { if (dynamicMem) { return dynamicMem->data() + dynamicMem->size(); diff --git a/unit_tests/command_queue/dispatch_walker_tests.cpp b/unit_tests/command_queue/dispatch_walker_tests.cpp index bfbcb58493..2ba816ac1a 100644 --- a/unit_tests/command_queue/dispatch_walker_tests.cpp +++ b/unit_tests/command_queue/dispatch_walker_tests.cpp @@ -1108,3 +1108,38 @@ HWTEST_F(DispatchWalkerTest, WhenCallingDefaultWaMethodsThenExpectNothing) { size_t 
actualSize = GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(&kernel); EXPECT_EQ(expectedSize, actualSize); } + +HWTEST_F(DispatchWalkerTest, givenKernelWhenAuxTranslationWithoutParentKernelThenPipeControlAdded) { + MockKernel kernel(program.get(), kernelInfo, *pDevice); + kernelInfo.workloadInfo.workDimOffset = 0; + ASSERT_EQ(CL_SUCCESS, kernel.initialize()); + + auto &cmdStream = pCmdQ->getCS(0); + void *buffer = cmdStream.getCpuBase(); + kernel.auxTranslationRequired = true; + + MockMultiDispatchInfo multiDispatchInfo(&kernel); + DispatchInfo di1(&kernel, 1, Vec3(1, 1, 1), Vec3(1, 1, 1), Vec3(0, 0, 0)); + di1.setPipeControlRequired(true); + multiDispatchInfo.push(di1); + + HardwareInterface::dispatchWalker( + *pCmdQ, + multiDispatchInfo, + 0, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + pDevice->getPreemptionMode(), + false); + + auto sizeUsed = cmdStream.getUsed(); + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, buffer, sizeUsed)); + + auto itorCmd = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorCmd); +} diff --git a/unit_tests/command_queue/enqueue_kernel_2_tests.cpp b/unit_tests/command_queue/enqueue_kernel_2_tests.cpp index f91171e7a5..72a5cdc049 100644 --- a/unit_tests/command_queue/enqueue_kernel_2_tests.cpp +++ b/unit_tests/command_queue/enqueue_kernel_2_tests.cpp @@ -665,6 +665,7 @@ struct EnqueueAuxKernelTests : public EnqueueKernelTest { Kernel *lastKernel = nullptr; for (const auto &dispatchInfo : multiDispatchInfo) { lastKernel = dispatchInfo.getKernel(); + dispatchInfos.emplace_back(dispatchInfo); } dispatchAuxTranslationInputs.emplace_back(lastKernel, multiDispatchInfo.size(), memObjsForAuxTranslation, auxTranslationDirection); } @@ -674,31 +675,20 @@ struct EnqueueAuxKernelTests : public EnqueueKernelTest { CommandQueueHw::waitUntilComplete(taskCountToWait, flushStampToWait, useQuickKmdSleep); } + std::vector dispatchInfos; std::vector> 
dispatchAuxTranslationInputs; uint32_t waitCalled = 0; }; }; -HWTEST_F(EnqueueAuxKernelTests, givenKernelWithRequiredAuxTranslationWhenEnqueuedThenGuardKernelWithAuxTranslations) { +HWTEST_F(EnqueueAuxKernelTests, givenKernelWithRequiredAuxTranslationAndWithoutArgumentsWhenEnqueuedThenNoGuardKernelWithAuxTranslations) { MockKernelWithInternals mockKernel(*pDevice, context); MyCmdQ cmdQ(context, pDevice); size_t gws[3] = {1, 0, 0}; mockKernel.mockKernel->auxTranslationRequired = true; cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); - EXPECT_EQ(2u, cmdQ.dispatchAuxTranslationInputs.size()); - - // before kernel - EXPECT_EQ(0u, std::get(cmdQ.dispatchAuxTranslationInputs.at(0))); - EXPECT_EQ(AuxTranslationDirection::AuxToNonAux, std::get(cmdQ.dispatchAuxTranslationInputs.at(0))); - - // after kernel - EXPECT_EQ(1u, std::get(cmdQ.dispatchAuxTranslationInputs.at(1))); - EXPECT_EQ(AuxTranslationDirection::NonAuxToAux, std::get(cmdQ.dispatchAuxTranslationInputs.at(1))); - - mockKernel.mockKernel->auxTranslationRequired = false; - cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); - EXPECT_EQ(2u, cmdQ.dispatchAuxTranslationInputs.size()); // not changed + EXPECT_EQ(0u, cmdQ.dispatchAuxTranslationInputs.size()); } HWTEST_F(EnqueueAuxKernelTests, givenMultipleArgsWhenAuxTranslationIsRequiredThenPickOnlyApplicableBuffers) { @@ -738,11 +728,20 @@ HWTEST_F(EnqueueAuxKernelTests, givenMultipleArgsWhenAuxTranslationIsRequiredThe cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); EXPECT_EQ(2u, cmdQ.dispatchAuxTranslationInputs.size()); + EXPECT_EQ(1u, std::get(cmdQ.dispatchAuxTranslationInputs.at(0)).size()); // before kernel EXPECT_EQ(1u, std::get(cmdQ.dispatchAuxTranslationInputs.at(1)).size()); // after kernel EXPECT_EQ(&buffer2, *std::get(cmdQ.dispatchAuxTranslationInputs.at(0)).begin()); EXPECT_EQ(&buffer2, 
*std::get(cmdQ.dispatchAuxTranslationInputs.at(1)).begin()); + uint32_t pipeControlCount = 0; + for (auto dispatchInfo : cmdQ.dispatchInfos) { + if (dispatchInfo.isPipeControlRequired()) { + ++pipeControlCount; + } + } + + EXPECT_EQ(4u, pipeControlCount); } HWTEST_F(EnqueueAuxKernelTests, givenKernelWithRequiredAuxTranslationWhenEnqueuedThenDispatchAuxTranslationBuiltin) { @@ -821,6 +820,14 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueAuxKernelTests, givenParentKernelWhenAuxTrans EXPECT_EQ(GraphicsAllocation::AllocationType::BUFFER, buffer0.getGraphicsAllocation()->getAllocationType()); EXPECT_EQ(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED, buffer1.getGraphicsAllocation()->getAllocationType()); EXPECT_EQ(GraphicsAllocation::AllocationType::BUFFER, buffer2.getGraphicsAllocation()->getAllocationType()); + uint32_t pipeControlCount = 0; + for (auto dispatchInfo : cmdQ.dispatchInfos) { + if (dispatchInfo.isPipeControlRequired()) { + ++pipeControlCount; + } + } + + EXPECT_EQ(1u, pipeControlCount); } } diff --git a/unit_tests/helpers/dispatch_info_tests.cpp b/unit_tests/helpers/dispatch_info_tests.cpp index 127b9ea49e..0cdc02426f 100644 --- a/unit_tests/helpers/dispatch_info_tests.cpp +++ b/unit_tests/helpers/dispatch_info_tests.cpp @@ -307,4 +307,21 @@ TEST_F(DispatchInfoTest, givenKernelWhenMultiDispatchInfoIsCreatedThenQueryParen EXPECT_EQ(nullptr, multiDispatchInfo.peekParentKernel()); EXPECT_EQ(builtInKernel.get(), multiDispatchInfo.peekMainKernel()); } + + { + MultiDispatchInfo multiDispatchInfo; + multiDispatchInfo.push(parentKernelDispatchInfo); + multiDispatchInfo.push(baseDispatchInfo); + multiDispatchInfo.push(builtInDispatchInfo); + + std::reverse_iterator rend = multiDispatchInfo.rend(); + std::reverse_iterator crend = multiDispatchInfo.crend(); + std::reverse_iterator rbegin = multiDispatchInfo.rbegin(); + std::reverse_iterator crbegin = multiDispatchInfo.crbegin(); + + EXPECT_EQ(rbegin.base(), multiDispatchInfo.end()); + EXPECT_EQ(crbegin.base(), 
multiDispatchInfo.end()); + EXPECT_EQ(rend.base(), multiDispatchInfo.begin()); + EXPECT_EQ(crend.base(), multiDispatchInfo.begin()); + } } diff --git a/unit_tests/utilities/containers_tests.cpp b/unit_tests/utilities/containers_tests.cpp index 7bb8e9ef58..d2fbee8182 100644 --- a/unit_tests/utilities/containers_tests.cpp +++ b/unit_tests/utilities/containers_tests.cpp @@ -1429,6 +1429,17 @@ TEST(StackVec, Clear) { ASSERT_EQ(0U, v2.size()); } +TEST(StackVec, ReverseBeginningFunctions) { + using VecType = StackVec; + VecType v; + v.push_back(5); + + ASSERT_EQ(v.begin(), v.rend().base()); + ASSERT_EQ(v.end(), v.rbegin().base()); + ASSERT_EQ(v.begin(), v.crend().base()); + ASSERT_EQ(v.end(), v.crbegin().base()); +} + TEST(StackVec, ConstMemberFunctions) { using VecType = StackVec; VecType v;