From d04614dce306c4f8f8397d819484fa18e30d3000 Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Fri, 7 Sep 2018 09:09:24 +0200 Subject: [PATCH] Use Semaphore to wait for dependencies on the same device Change-Id: Ib04c960c50183c080d02753815ece80b58d1980e Signed-off-by: Dunajski, Bartosz --- runtime/command_queue/enqueue_common.h | 2 +- runtime/command_queue/gpgpu_walker.h | 6 +- runtime/command_queue/gpgpu_walker.inl | 27 +++++ runtime/event/event.cpp | 1 + runtime/event/event.h | 1 + .../parent_kernel_dispatch_tests.cpp | 2 +- unit_tests/helpers/timestamp_packet_tests.cpp | 109 +++++++++++++++++- unit_tests/libult/mock_gfx_family.h | 15 +++ 8 files changed, 158 insertions(+), 5 deletions(-) diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 8be9f0a98a..5e96fcc789 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -206,7 +206,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, auto taskLevel = 0u; obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitList, eventWaitList, blockQueue, commandType); - auto &commandStream = getCommandStream(*this, profilingRequired, perfCountersRequired, multiDispatchInfo); + auto &commandStream = getCommandStream(*this, numEventsInWaitList, profilingRequired, perfCountersRequired, multiDispatchInfo); auto commandStreamStart = commandStream.getUsed(); DBG_LOG(EventsDebugEnable, "blockQueue", blockQueue, "virtualEvent", virtualEvent, "taskLevel", taskLevel); diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h index e203adc7b8..4381a9e050 100644 --- a/runtime/command_queue/gpgpu_walker.h +++ b/runtime/command_queue/gpgpu_walker.h @@ -261,6 +261,9 @@ class GpgpuWalkerHelper { SchedulerKernel &scheduler, IndirectHeap *ssh, IndirectHeap *dsh); + + static void dispatchOnDeviceWaitlistSemaphores(LinearStream *commandStream, Device &currentDevice, + cl_uint numEventsInWaitList, const
cl_event *eventWaitList); }; template @@ -282,7 +285,7 @@ LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfiling } template -LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) { +LinearStream &getCommandStream(CommandQueue &commandQueue, cl_uint numEventsInWaitList, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) { size_t expectedSizeCS = 0; Kernel *parentKernel = multiDispatchInfo.peekParentKernel(); for (auto &dispatchInfo : multiDispatchInfo) { @@ -294,6 +297,7 @@ LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfiling } if (commandQueue.getDevice().peekCommandStreamReceiver()->peekTimestampPacketWriteEnabled()) { expectedSizeCS += EnqueueOperation::getSizeRequiredForTimestampPacketWrite(); + expectedSizeCS += numEventsInWaitList * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); } return commandQueue.getCS(expectedSizeCS); } diff --git a/runtime/command_queue/gpgpu_walker.inl b/runtime/command_queue/gpgpu_walker.inl index a6590f1cb4..03d897d437 100644 --- a/runtime/command_queue/gpgpu_walker.inl +++ b/runtime/command_queue/gpgpu_walker.inl @@ -494,6 +494,11 @@ void GpgpuWalkerHelper::dispatchWalker( ssh = &getIndirectHeap(commandQueue, multiDispatchInfo); } + if (commandQueue.getDevice().peekCommandStreamReceiver()->peekTimestampPacketWriteEnabled()) { + GpgpuWalkerHelper::dispatchOnDeviceWaitlistSemaphores(commandStream, commandQueue.getDevice(), + numEventsInWaitList, eventWaitList); + } + dsh->align(KernelCommandsHelper::alignInterfaceDescriptorData); uint32_t interfaceDescriptorIndex = 0; @@ -645,6 +650,28 @@ void GpgpuWalkerHelper::dispatchWalker( dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); } +template +inline void GpgpuWalkerHelper::dispatchOnDeviceWaitlistSemaphores(LinearStream 
*commandStream, Device &currentDevice, + cl_uint numEventsInWaitList, const cl_event *eventWaitList) { + using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; + + for (cl_uint i = 0; i < numEventsInWaitList; i++) { + auto event = castToObjectOrAbort(eventWaitList[i]); + if (event->isUserEvent() || (&event->getCommandQueue()->getDevice() != &currentDevice)) { + continue; + } + auto timestampPacket = event->getTimestampPacket(); + + auto compareAddress = timestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd); + + auto miSemaphoreCmd = commandStream->getSpaceForCmd(); + *miSemaphoreCmd = MI_SEMAPHORE_WAIT::sInit(); + miSemaphoreCmd->setCompareOperation(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); + miSemaphoreCmd->setSemaphoreDataDword(1); + miSemaphoreCmd->setSemaphoreGraphicsAddress(compareAddress); + } +} + template inline void GpgpuWalkerHelper::getDefaultDshSpace( const size_t &offsetInterfaceDescriptorTable, diff --git a/runtime/event/event.cpp b/runtime/event/event.cpp index a14318b68a..ae6b8a080a 100644 --- a/runtime/event/event.cpp +++ b/runtime/event/event.cpp @@ -719,4 +719,5 @@ void Event::setTimestampPacketNode(TagNode *node) { timestampPacketNode = node; } +TimestampPacket *Event::getTimestampPacket() const { return timestampPacketNode->tag; } } // namespace OCLRT diff --git a/runtime/event/event.h b/runtime/event/event.h index fed5de90c1..04f308db8c 100644 --- a/runtime/event/event.h +++ b/runtime/event/event.h @@ -127,6 +127,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { GraphicsAllocation *getHwTimeStampAllocation(); void setTimestampPacketNode(TagNode *node); + TimestampPacket *getTimestampPacket() const; bool isPerfCountersEnabled() { return perfCountersEnabled; diff --git a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp index e3a1ea1146..e65e1ce3cb 100644 --- 
a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp +++ b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp @@ -241,7 +241,7 @@ HWTEST_F(ParentKernelCommandStreamFixture, GivenDispatchInfoWithParentKernelWhen size_t totalKernelSize = alignUp(numOfKernels * size, MemoryConstants::pageSize); - LinearStream &commandStream = getCommandStream(*pCmdQ, false, false, multiDispatchInfo); + LinearStream &commandStream = getCommandStream(*pCmdQ, 0, false, false, multiDispatchInfo); EXPECT_LT(totalKernelSize, commandStream.getMaxAvailableSpace()); diff --git a/unit_tests/helpers/timestamp_packet_tests.cpp b/unit_tests/helpers/timestamp_packet_tests.cpp index 6e85b822c4..00a90ec970 100644 --- a/unit_tests/helpers/timestamp_packet_tests.cpp +++ b/unit_tests/helpers/timestamp_packet_tests.cpp @@ -134,16 +134,39 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl MockMultiDispatchInfo multiDispatchInfo(std::vector({kernel1.mockKernel, kernel2.mockKernel})); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(cmdQ, false, false, multiDispatchInfo); + getCommandStream(cmdQ, 0, false, false, multiDispatchInfo); auto sizeWithDisabled = cmdQ.requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - getCommandStream(cmdQ, false, false, multiDispatchInfo); + getCommandStream(cmdQ, 0, false, false, multiDispatchInfo); auto sizeWithEnabled = cmdQ.requestedCmdStreamSize; EXPECT_EQ(sizeWithEnabled, sizeWithDisabled + (2 * sizeof(typename FamilyType::PIPE_CONTROL))); } +HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStreamSizeWithWaitlistThenAddSizeForSemaphores) { + auto device = std::unique_ptr(MockDevice::createWithNewExecutionEnvironment(platformDevices[0])); + MockCommandQueue cmdQ(nullptr, device.get(), nullptr); + MockKernelWithInternals kernel1(*device); + MockKernelWithInternals kernel2(*device); + MockMultiDispatchInfo 
multiDispatchInfo(std::vector({kernel1.mockKernel, kernel2.mockKernel})); + + cl_uint numEventsOnWaitlist = 5; + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; + getCommandStream(cmdQ, numEventsOnWaitlist, false, false, multiDispatchInfo); + auto sizeWithDisabled = cmdQ.requestedCmdStreamSize; + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + getCommandStream(cmdQ, numEventsOnWaitlist, false, false, multiDispatchInfo); + auto sizeWithEnabled = cmdQ.requestedCmdStreamSize; + + size_t extendedSize = sizeWithDisabled + (2 * sizeof(typename FamilyType::PIPE_CONTROL)) + + (numEventsOnWaitlist * sizeof(typename FamilyType::MI_SEMAPHORE_WAIT)); + + EXPECT_EQ(sizeWithEnabled, extendedSize); +} + HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWhenDispatchingGpuWalkerThenAddTwoPcForLastWalker) { using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; @@ -295,3 +318,85 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl } EXPECT_TRUE(walkerFound); } + +HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenDispatchingThenProgramSemaphoresForWaitlist) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using WALKER = WALKER_TYPE; + ExecutionEnvironment executionEnvironment; + executionEnvironment.incRefInternal(); + auto device1 = std::unique_ptr(Device::create(nullptr, &executionEnvironment)); + auto device2 = std::unique_ptr(Device::create(nullptr, &executionEnvironment)); + MockKernelWithInternals kernel1(*device1); + device1->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + MockContext context1(device1.get()); + MockContext context2(device2.get()); + + MockMultiDispatchInfo + multiDispatchInfo(std::vector({kernel1.mockKernel})); + + MockCommandQueue cmdQ1(&context1, device1.get(), nullptr); + MockCommandQueue cmdQ2(&context2, device2.get(), nullptr); + 
auto &cmdStream = cmdQ1.getCS(0); + + const cl_uint eventsOnWaitlist = 6; + TagNode *tagNodes[eventsOnWaitlist]; + for (size_t i = 0; i < eventsOnWaitlist; i++) { + tagNodes[i] = executionEnvironment.memoryManager->getTimestampPacketAllocator()->getTag(); + } + + UserEvent event1; + UserEvent event2; + Event event3(&cmdQ1, 0, 0, 0); + event3.setTimestampPacketNode(tagNodes[2]); + Event event4(&cmdQ2, 0, 0, 0); + event4.setTimestampPacketNode(tagNodes[3]); + Event event5(&cmdQ1, 0, 0, 0); + event5.setTimestampPacketNode(tagNodes[4]); + Event event6(&cmdQ2, 0, 0, 0); + event6.setTimestampPacketNode(tagNodes[5]); + + cl_event waitlist[] = {&event1, &event2, &event3, &event4, &event5, &event6}; + + GpgpuWalkerHelper::dispatchWalker( + cmdQ1, + multiDispatchInfo, + eventsOnWaitlist, + waitlist, + nullptr, + nullptr, + nullptr, + nullptr, + device1->getPreemptionMode(), + false); + + HardwareParse hwParser; + hwParser.parseCommands(cmdStream, 0); + + auto verifySemaphore = [](MI_SEMAPHORE_WAIT *semaphoreCmd, Event *compareEvent) { + EXPECT_EQ(semaphoreCmd->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(compareEvent->getTimestampPacket()->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd), + semaphoreCmd->getSemaphoreGraphicsAddress()); + }; + + uint32_t semaphoresFound = 0; + uint32_t walkersFound = 0; + + for (auto it = hwParser.cmdList.begin(); it != hwParser.cmdList.end(); it++) { + auto semaphoreCmd = genCmdCast(*it); + if (semaphoreCmd) { + semaphoresFound++; + if (semaphoresFound == 1) { + verifySemaphore(semaphoreCmd, &event3); + } else if (semaphoresFound == 2) { + verifySemaphore(semaphoreCmd, &event5); + } + } + if (genCmdCast(*it)) { + walkersFound++; + EXPECT_EQ(2u, semaphoresFound); // semaphores from events programmed before walker + } + } + EXPECT_EQ(1u, walkersFound); + EXPECT_EQ(2u, semaphoresFound); // total number of 
semaphores found in cmdList +} diff --git a/unit_tests/libult/mock_gfx_family.h b/unit_tests/libult/mock_gfx_family.h index 898fa613f5..5e3bf90782 100644 --- a/unit_tests/libult/mock_gfx_family.h +++ b/unit_tests/libult/mock_gfx_family.h @@ -343,6 +343,21 @@ struct GENX { } } STATE_SIP; + typedef struct tagMI_SEMAPHORE_WAIT { + typedef enum tagCOMPARE_OPERATION { + COMPARE_OPERATION_SAD_NOT_EQUAL_SDD = 0x5, + } COMPARE_OPERATION; + + static tagMI_SEMAPHORE_WAIT sInit(void) { + MI_SEMAPHORE_WAIT state; + return state; + } + + inline void setSemaphoreDataDword(uint32_t value) {} + inline void setSemaphoreGraphicsAddress(uint64_t value) {} + inline void setCompareOperation(COMPARE_OPERATION value) {} + } MI_SEMAPHORE_WAIT; + typedef GPGPU_WALKER WALKER_TYPE; static GPGPU_WALKER cmdInitGpgpuWalker; static INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;