diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index a62d6385ce..e2c87b6075 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -241,10 +241,17 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } } - TimestampPacket *timestampPacket = nullptr; + TimestampPacket *currentTimestampPacket = nullptr; + TimestampPacket *previousTimestampPacket = nullptr; if (device->getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { + auto previousTimestampPacketNode = timestampPacketNode; obtainNewTimestampPacketNode(); - timestampPacket = timestampPacketNode->tag; + currentTimestampPacket = timestampPacketNode->tag; + + if (previousTimestampPacketNode && !previousTimestampPacketNode->tag->canBeReleased()) { + // keep dependency on previous enqueue + previousTimestampPacket = previousTimestampPacketNode->tag; + } } if (eventBuilder.getEvent()) { @@ -281,7 +288,8 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, &blockedCommandsData, hwTimeStamps, hwPerfCounter, - timestampPacket, + previousTimestampPacket, + currentTimestampPacket, preemption, blockQueue, commandType); diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h index ac6a97e2f8..c712c2c73f 100644 --- a/runtime/command_queue/gpgpu_walker.h +++ b/runtime/command_queue/gpgpu_walker.h @@ -206,7 +206,8 @@ class GpgpuWalkerHelper { KernelOperation **blockedCommandsData, HwTimeStamps *hwTimeStamps, OCLRT::HwPerfCounter *hwPerfCounter, - TimestampPacket *timestampPacket, + TimestampPacket *previousTimestampPacket, + TimestampPacket *currentTimestampPacket, PreemptionMode preemptionMode, bool blockQueue, uint32_t commandType = 0); @@ -297,7 +298,7 @@ LinearStream &getCommandStream(CommandQueue &commandQueue, cl_uint numEventsInWa } if (commandQueue.getDevice().getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { expectedSizeCS += 
EnqueueOperation::getSizeRequiredForTimestampPacketWrite(); - expectedSizeCS += numEventsInWaitList * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); + expectedSizeCS += (numEventsInWaitList + 1) * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT); } return commandQueue.getCS(expectedSizeCS); } diff --git a/runtime/command_queue/gpgpu_walker.inl b/runtime/command_queue/gpgpu_walker.inl index 842400adc2..3543613454 100644 --- a/runtime/command_queue/gpgpu_walker.inl +++ b/runtime/command_queue/gpgpu_walker.inl @@ -435,7 +435,8 @@ void GpgpuWalkerHelper::dispatchWalker( KernelOperation **blockedCommandsData, HwTimeStamps *hwTimeStamps, OCLRT::HwPerfCounter *hwPerfCounter, - TimestampPacket *timestampPacket, + TimestampPacket *previousTimestampPacket, + TimestampPacket *currentTimestampPacket, PreemptionMode preemptionMode, bool blockQueue, uint32_t commandType) { @@ -497,6 +498,10 @@ void GpgpuWalkerHelper::dispatchWalker( if (commandQueue.getDevice().getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { GpgpuWalkerHelper::dispatchOnDeviceWaitlistSemaphores(commandStream, commandQueue.getDevice(), numEventsInWaitList, eventWaitList); + if (previousTimestampPacket) { + auto compareAddress = previousTimestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd); + KernelCommandsHelper::programMiSemaphoreWait(*commandStream, compareAddress, 1); + } } dsh->align(KernelCommandsHelper::alignInterfaceDescriptorData); @@ -590,9 +595,9 @@ void GpgpuWalkerHelper::dispatchWalker( dispatchWorkarounds(commandStream, commandQueue, kernel, true); - bool setupTimestampPacket = timestampPacket && (currentDispatchIndex == multiDispatchInfo.size() - 1); + bool setupTimestampPacket = currentTimestampPacket && (currentDispatchIndex == multiDispatchInfo.size() - 1); if (setupTimestampPacket) { - GpgpuWalkerHelper::setupTimestampPacket(commandStream, nullptr, timestampPacket, + GpgpuWalkerHelper::setupTimestampPacket(commandStream, nullptr, currentTimestampPacket, 
TimestampPacket::WriteOperationType::BeforeWalker); } @@ -601,7 +606,7 @@ void GpgpuWalkerHelper::dispatchWalker( *pWalkerCmd = GfxFamily::cmdInitGpgpuWalker; if (setupTimestampPacket) { - GpgpuWalkerHelper::setupTimestampPacket(commandStream, pWalkerCmd, timestampPacket, + GpgpuWalkerHelper::setupTimestampPacket(commandStream, pWalkerCmd, currentTimestampPacket, TimestampPacket::WriteOperationType::AfterWalker); } diff --git a/unit_tests/command_queue/dispatch_walker_tests.cpp b/unit_tests/command_queue/dispatch_walker_tests.cpp index 0a389ddf9d..c39c1d0c75 100644 --- a/unit_tests/command_queue/dispatch_walker_tests.cpp +++ b/unit_tests/command_queue/dispatch_walker_tests.cpp @@ -152,6 +152,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, shouldntChangeCommandStreamMemor nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -200,6 +201,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, noLocalIdsShouldntCrash) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -230,6 +232,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithDefaultLwsAlgorithm) nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -261,6 +264,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithSquaredLwsAlgorithm) nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); EXPECT_EQ(dimension, *kernel.workDim); @@ -290,6 +294,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithNDLwsAlgorithm) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); EXPECT_EQ(dimension, *kernel.workDim); @@ -320,6 +325,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithOldLwsAlgorithm) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); EXPECT_EQ(dimension, *kernel.workDim); @@ -350,6 +356,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNumWorkGroups) { nullptr, nullptr, nullptr, + nullptr, 
pDevice->getPreemptionMode(), false); @@ -382,6 +389,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeND) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); EXPECT_EQ(2u, *kernel.localWorkSizeX); @@ -413,6 +421,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeND) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); EXPECT_EQ(2u, *kernel.localWorkSizeX); @@ -445,6 +454,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeSquared) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); EXPECT_EQ(2u, *kernel.localWorkSizeX); @@ -477,6 +487,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeSquaredAn nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); EXPECT_EQ(2u, *kernel.localWorkSizeX); @@ -507,6 +518,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSize) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); EXPECT_EQ(1u, *kernel.localWorkSizeX); @@ -540,6 +552,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizes) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); EXPECT_EQ(1u, *kernel.localWorkSizeX); @@ -577,6 +590,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizeForSplitKernel) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -628,6 +642,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizesForSplitWalker) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -683,6 +698,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerDoesntConsumeCommandStreamWhenQueueIs nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), blockQueue); @@ -723,6 +739,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromKernelW nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), blockQueue); @@ -761,6 +778,7 
@@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromMdiWhen nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), blockQueue); @@ -794,6 +812,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfo) { nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -836,6 +855,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, dispatchWalkerWithMultipleDispat nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -921,6 +941,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, dispatchWalkerWithMultipleDispat nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -967,6 +988,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, dispatchWalkerWithMultipleDispat nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -1018,6 +1040,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, dispatchWalkerWithMultipleDispat nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -1061,7 +1084,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, givenMultiDispatchWhenWhiteliste DispatchInfo di2(&kernel, 1, Vec3(1, 1, 1), Vec3(1, 1, 1), Vec3(0, 0, 0)); MockMultiDispatchInfo multiDispatchInfo(std::vector({&di1, &di2})); - GpgpuWalkerHelper::dispatchWalker(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false); + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false); hwParser.parseCommands(cmdStream, 0); diff --git a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp index e65e1ce3cb..2298b83f32 100644 --- a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp +++ b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp @@ -65,6 +65,7 @@ HWTEST_P(ParentKernelDispatchTest, 
givenParentKernelWhenQueueIsNotBlockedThenDev nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -121,6 +122,7 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDef nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -147,6 +149,7 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenSSH nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -183,6 +186,7 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsBlockedThenSSHSiz nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), true); ASSERT_NE(nullptr, blockedCommandsData); @@ -281,6 +285,7 @@ HWTEST_F(MockParentKernelDispatch, GivenBlockedQueueWhenParentKernelIsDispatched nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), true); @@ -315,6 +320,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, MockParentKernelDispatch, GivenParentKernelWhenDispa nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -372,6 +378,7 @@ HWTEST_F(MockParentKernelDispatch, GivenUsedSSHHeapWhenParentKernelIsDispatchedT nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); @@ -408,6 +415,7 @@ HWTEST_F(MockParentKernelDispatch, GivenNotUsedSSHHeapWhenParentKernelIsDispatch nullptr, nullptr, nullptr, + nullptr, pDevice->getPreemptionMode(), false); diff --git a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp index 11cbd5291e..bdfbfe7597 100644 --- a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp +++ b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp @@ -450,6 +450,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenBlockedCommand nullptr, nullptr, nullptr, + nullptr, device->getPreemptionMode(), true); diff --git a/unit_tests/helpers/timestamp_packet_tests.cpp 
b/unit_tests/helpers/timestamp_packet_tests.cpp index be03083b4a..478436f218 100644 --- a/unit_tests/helpers/timestamp_packet_tests.cpp +++ b/unit_tests/helpers/timestamp_packet_tests.cpp @@ -65,6 +65,12 @@ struct TimestampPacketTests : public ::testing::Test { std::vector releaseReferenceNodes; std::vector returnedToFreePoolNodes; }; + + void setTagToReadyState(TimestampPacket *tag) { + memset(reinterpret_cast(tag->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextStart)), 0, timestampDataSize); + } + + const size_t timestampDataSize = sizeof(uint32_t) * static_cast(TimestampPacket::DataIndex::Max); }; TEST_F(TimestampPacketTests, whenEndTagIsNotOneThenCanBeReleased) { @@ -142,7 +148,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl getCommandStream(cmdQ, 0, false, false, multiDispatchInfo); auto sizeWithEnabled = cmdQ.requestedCmdStreamSize; - EXPECT_EQ(sizeWithEnabled, sizeWithDisabled + (2 * sizeof(typename FamilyType::PIPE_CONTROL))); + auto extendedSize = sizeWithDisabled + (2 * sizeof(typename FamilyType::PIPE_CONTROL)) + sizeof(typename FamilyType::MI_SEMAPHORE_WAIT); + + EXPECT_EQ(sizeWithEnabled, extendedSize); } HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStreamSizeWithWaitlistThenAddSizeForSemaphores) { @@ -163,7 +171,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr auto sizeWithEnabled = cmdQ.requestedCmdStreamSize; size_t extendedSize = sizeWithDisabled + EnqueueOperation::getSizeRequiredForTimestampPacketWrite() + - (numEventsOnWaitlist * sizeof(typename FamilyType::MI_SEMAPHORE_WAIT)); + ((numEventsOnWaitlist + 1) * sizeof(typename FamilyType::MI_SEMAPHORE_WAIT)); EXPECT_EQ(sizeWithEnabled, extendedSize); } @@ -190,6 +198,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWhenDispat nullptr, nullptr, nullptr, + nullptr, &timestampPacket, device->getPreemptionMode(), false); @@ -267,10 +276,8 @@ 
HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEnqueueingThe EXPECT_EQ(node1, mockTagAllocator->releaseReferenceNodes.at(0)); EXPECT_NE(node1, node2); - size_t dataSize = sizeof(uint32_t) * static_cast(TimestampPacket::DataIndex::Max); - // mark nodes as ready - memset(reinterpret_cast(node1->tag->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextStart)), 0, dataSize); - memset(reinterpret_cast(node2->tag->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextStart)), 0, dataSize); + setTagToReadyState(node1->tag); + setTagToReadyState(node2->tag); clReleaseEvent(event2); EXPECT_EQ(0u, mockTagAllocator->returnedToFreePoolNodes.size()); // nothing returned. cmdQ owns node2 @@ -446,6 +453,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenDispatchingTh nullptr, nullptr, nullptr, + nullptr, device1->getPreemptionMode(), false); @@ -493,12 +501,69 @@ TEST_F(TimestampPacketTests, givenAlreadyAssignedNodeWhenObtainingThenGetNewBefo auto firstNode = cmdQ.timestampPacketNode; EXPECT_TRUE(mockTagAllocator->freeTags.peekIsEmpty()); - // mark as ready to release - size_t dataSize = sizeof(uint32_t) * static_cast(TimestampPacket::DataIndex::Max); - memset(reinterpret_cast(firstNode->tag->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextStart)), 0, dataSize); + setTagToReadyState(firstNode->tag); cmdQ.obtainNewTimestampPacketNode(); auto secondNode = cmdQ.timestampPacketNode; EXPECT_FALSE(mockTagAllocator->freeTags.peekIsEmpty()); // new pool allocated for secondNode EXPECT_NE(firstNode, secondNode); } + +HWTEST_F(TimestampPacketTests, givenAlreadyAssignedNodeWhenEnqueueingThenDontKeepDependencyOnPreviousNodeIfItsReady) { + auto device = std::unique_ptr(MockDevice::createWithNewExecutionEnvironment(platformDevices[0])); + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + MockContext context(device.get()); + MockKernelWithInternals kernel(*device); + + MockCommandQueueHw cmdQ(&context, 
device.get(), nullptr); + cmdQ.obtainNewTimestampPacketNode(); + auto firstNode = cmdQ.timestampPacketNode; + setTagToReadyState(firstNode->tag); + + size_t gws[] = {1, 1, 1}; + cmdQ.enqueueKernel(kernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ.commandStream, 0); + + uint32_t semaphoresFound = 0; + for (auto it = hwParser.cmdList.begin(); it != hwParser.cmdList.end(); it++) { + if (genCmdCast(*it)) { + semaphoresFound++; + } + } + EXPECT_EQ(0u, semaphoresFound); +} + +HWTEST_F(TimestampPacketTests, givenAlreadyAssignedNodeWhenEnqueueingThenKeepDependencyOnPreviousNodeIfItsNotReady) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + auto device = std::unique_ptr(MockDevice::createWithNewExecutionEnvironment(platformDevices[0])); + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + MockContext context(device.get()); + MockKernelWithInternals kernel(*device); + + MockCommandQueueHw cmdQ(&context, device.get(), nullptr); + cmdQ.obtainNewTimestampPacketNode(); + auto firstNode = cmdQ.timestampPacketNode; + + size_t gws[] = {1, 1, 1}; + cmdQ.enqueueKernel(kernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ.commandStream, 0); + + auto semaphoreCmd = genCmdCast(*hwParser.cmdList.begin()); + EXPECT_NE(nullptr, semaphoreCmd); + EXPECT_EQ(firstNode->tag->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd), semaphoreCmd->getSemaphoreGraphicsAddress()); + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreCmd->getCompareOperation()); + + uint32_t semaphoresFound = 0; + auto it = hwParser.cmdList.begin(); + for (++it; it != hwParser.cmdList.end(); it++) { + if (genCmdCast(*it)) { + semaphoresFound++; + } + } + EXPECT_EQ(0u, semaphoresFound); +} diff --git 
a/unit_tests/mocks/mock_command_queue.h b/unit_tests/mocks/mock_command_queue.h index 0e1bdb66c5..ce94e47f43 100644 --- a/unit_tests/mocks/mock_command_queue.h +++ b/unit_tests/mocks/mock_command_queue.h @@ -78,7 +78,9 @@ class MockCommandQueueHw : public CommandQueueHw { typedef CommandQueueHw BaseClass; public: + using BaseClass::commandStream; using BaseClass::createAllocationForHostSurface; + using BaseClass::obtainNewTimestampPacketNode; using BaseClass::timestampPacketNode; MockCommandQueueHw(Context *context,