From 7eb70775ea85cbc2b32a870357972507980c6427 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Wed, 27 Apr 2022 12:31:11 +0000 Subject: [PATCH] Flush caches for cross CSR dependencies Signed-off-by: Lukasz Jobczyk --- opencl/source/helpers/properties_helper.cpp | 18 +- .../helpers/timestamp_packet_1_tests.cpp | 344 +++--------------- .../helpers/timestamp_packet_2_tests.cpp | 303 +++++++++++++++ shared/source/helpers/timestamp_packet.h | 2 +- 4 files changed, 361 insertions(+), 306 deletions(-) diff --git a/opencl/source/helpers/properties_helper.cpp b/opencl/source/helpers/properties_helper.cpp index 517335890f..806857b715 100644 --- a/opencl/source/helpers/properties_helper.cpp +++ b/opencl/source/helpers/properties_helper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -34,13 +34,27 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci continue; } - auto sameCsr = (&event->getCommandQueue()->getGpgpuCommandStreamReceiver() == &currentCsr); + auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver(); + auto sameCsr = (&dependentCsr == &currentCsr); bool pushDependency = (CsrDependencies::DependenciesType::OnCsr == depsType && sameCsr) || (CsrDependencies::DependenciesType::OutOfCsr == depsType && !sameCsr) || (CsrDependencies::DependenciesType::All == depsType); if (pushDependency) { csrDeps.timestampPacketContainer.push_back(timestampPacketContainer); + + if (!sameCsr) { + const auto &hwInfoConfig = *NEO::HwInfoConfig::get(event->getCommandQueue()->getDevice().getHardwareInfo().platform.eProductFamily); + if (hwInfoConfig.isDcFlushAllowed()) { + if (!dependentCsr.isLatestTaskCountFlushed()) { + auto csrOwnership = dependentCsr.obtainUniqueOwnership(); + dependentCsr.flushBatchedSubmissions(); + dependentCsr.updateTagFromWait(); + csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), 
reinterpret_cast(dependentCsr.getTagAddress())}); + currentCsr.makeResident(*dependentCsr.getTagAllocation()); + } + } + } } } } diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index 35a85130fd..b0791bebb8 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -310,6 +310,47 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat EXPECT_EQ(sizeWithEnabled, extendedSize); } +HWTEST_F(TimestampPacketTests, givenCrossCsrDependenciesWhenFillCsrDepsThenFlushCacheIfNeeded) { + auto mockCmdQHw = std::make_unique>(context, device.get(), nullptr); + mockCmdQHw->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + mockCmdQHw->getUltCommandStreamReceiver().taskCount = 1; + mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 0; + + cl_queue_properties props[] = {CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR, 0}; + auto mockCmdQ2 = std::make_unique>(context, device.get(), props); + mockCmdQ2->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + mockCmdQ2->getUltCommandStreamReceiver().taskCount = 1; + mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 0; + + const cl_uint eventsOnWaitlist = 2; + MockTimestampPacketContainer timestamp(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + MockTimestampPacketContainer timestamp2(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + + Event event(mockCmdQ, 0, 0, 0); + event.addTimestampPacketNodes(timestamp); + Event event2(mockCmdQ2.get(), 0, 0, 0); + event2.addTimestampPacketNodes(timestamp2); + + cl_event waitlist[] = {&event, &event2}; + EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr); + CsrDependencies csrDeps; + + eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, 
mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::All); + + const auto &hwInfoConfig = *NEO::HwInfoConfig::get(device->getHardwareInfo().platform.eProductFamily); + if (hwInfoConfig.isDcFlushAllowed()) { + EXPECT_TRUE(mockCmdQ2->getUltCommandStreamReceiver().flushBatchedSubmissionsCalled); + } else { + EXPECT_FALSE(mockCmdQ2->getUltCommandStreamReceiver().flushBatchedSubmissionsCalled); + } + EXPECT_FALSE(mockCmdQHw->getUltCommandStreamReceiver().flushBatchedSubmissionsCalled); + + mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1; + *mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1; + mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1; + *mockCmdQ2->getUltCommandStreamReceiver().tagAddress = 1; +} + HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStreamSizeWithWaitlistThenAddSizeForSemaphores) { MockKernelWithInternals kernel2(*device); MockMultiDispatchInfo multiDispatchInfo(device.get(), std::vector({kernel->mockKernel, kernel2.mockKernel})); @@ -1657,306 +1698,3 @@ HWTEST_F(TimestampPacketTests, givenEventsWaitlistFromDifferentDevicesWhenEnqueu EXPECT_TRUE(ultCsr.isMadeResident(tagNode1->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), ultCsr.taskCount)); EXPECT_TRUE(ultCsr.isMadeResident(tagNode2->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), ultCsr.taskCount)); } - -HWTEST_F(TimestampPacketTests, givenEventsWaitlistFromDifferentCSRsWhenEnqueueingThenMakeAllTimestampsResident) { - MockTagAllocator> tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1, - sizeof(TimestampPackets), false, device->getDeviceBitfield()); - - auto &ultCsr = device->getUltCommandStreamReceiver(); - ultCsr.timestampPacketWriteEnabled = true; - ultCsr.storeMakeResidentAllocations = true; - - auto cmdQ1 = std::make_unique>(context, device.get(), nullptr); - - // Create second (LOW_PRIORITY) queue on the same device - 
cl_queue_properties props[] = {CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR, 0}; - auto cmdQ2 = std::make_unique>(context, device.get(), props); - cmdQ2->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - - MockTimestampPacketContainer node1(*ultCsr.getTimestampPacketAllocator(), 0); - MockTimestampPacketContainer node2(*ultCsr.getTimestampPacketAllocator(), 0); - - auto tagNode1 = tagAllocator.getTag(); - node1.add(tagNode1); - auto tagNode2 = tagAllocator.getTag(); - node2.add(tagNode2); - - Event event0(cmdQ1.get(), 0, 0, 0); - event0.addTimestampPacketNodes(node1); - Event event1(cmdQ2.get(), 0, 0, 0); - event1.addTimestampPacketNodes(node2); - - cl_event waitlist[] = {&event0, &event1}; - - cmdQ1->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 2, waitlist, nullptr); - - EXPECT_NE(tagNode1->getBaseGraphicsAllocation(), tagNode2->getBaseGraphicsAllocation()); - EXPECT_TRUE(ultCsr.isMadeResident(tagNode1->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), ultCsr.taskCount)); - EXPECT_TRUE(ultCsr.isMadeResident(tagNode2->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), ultCsr.taskCount)); -} - -HWTEST_F(TimestampPacketTests, givenTimestampPacketWhenEnqueueingNonBlockedThenMakeItResident) { - auto &csr = device->getUltCommandStreamReceiver(); - csr.timestampPacketWriteEnabled = true; - csr.storeMakeResidentAllocations = true; - - MockKernelWithInternals mockKernel(*device, context); - MockCommandQueueHw cmdQ(context, device.get(), nullptr); - - cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); - auto timestampPacketNode = cmdQ.timestampPacketContainer->peekNodes().at(0); - - EXPECT_TRUE(csr.isMadeResident(timestampPacketNode->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), csr.taskCount)); -} - -HWTEST_F(TimestampPacketTests, givenTimestampPacketWhenEnqueueingBlockedThenMakeItResidentOnSubmit) { - auto &csr = device->getUltCommandStreamReceiver(); - 
csr.timestampPacketWriteEnabled = true; - - MockKernelWithInternals mockKernel(*device, context); - - auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); - - csr.storeMakeResidentAllocations = true; - - UserEvent userEvent; - cl_event clEvent = &userEvent; - - cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 1, &clEvent, nullptr); - auto timestampPacketNode = cmdQ->timestampPacketContainer->peekNodes().at(0); - - EXPECT_FALSE(csr.isMadeResident(timestampPacketNode->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), csr.taskCount)); - userEvent.setStatus(CL_COMPLETE); - EXPECT_TRUE(csr.isMadeResident(timestampPacketNode->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), csr.taskCount)); - cmdQ->isQueueBlocked(); -} - -HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEnqueueingBlockedThenVirtualEventIncrementsRefInternalAndDecrementsAfterCompleteEvent) { - auto &csr = device->getUltCommandStreamReceiver(); - csr.timestampPacketWriteEnabled = true; - MockKernelWithInternals mockKernelWithInternals(*device, context); - auto mockKernel = mockKernelWithInternals.mockKernel; - auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); - - UserEvent userEvent; - cl_event waitlist = &userEvent; - - auto internalCount = userEvent.getRefInternalCount(); - cmdQ->enqueueKernel(mockKernel, 1, nullptr, gws, nullptr, 1, &waitlist, nullptr); - EXPECT_EQ(internalCount + 1, userEvent.getRefInternalCount()); - userEvent.setStatus(CL_COMPLETE); - cmdQ->isQueueBlocked(); - EXPECT_EQ(internalCount, mockKernel->getRefInternalCount()); -} - -TEST_F(TimestampPacketTests, givenDispatchSizeWhenAskingForNewTimestampsThenObtainEnoughTags) { - size_t dispatchSize = 3; - - mockCmdQ->timestampPacketContainer = std::make_unique(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 0); - EXPECT_EQ(0u, mockCmdQ->timestampPacketContainer->peekNodes().size()); - - 
TimestampPacketContainer previousNodes; - mockCmdQ->obtainNewTimestampPacketNodes(dispatchSize, previousNodes, false, mockCmdQ->getGpgpuCommandStreamReceiver()); - EXPECT_EQ(dispatchSize, mockCmdQ->timestampPacketContainer->peekNodes().size()); -} - -HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingWithoutKernelThenInheritTimestampPacketsWithoutSubmitting) { - device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - - auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); - - MockKernelWithInternals mockKernel(*device, context); - cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first TimestampPackets - - TimestampPacketContainer cmdQNodes; - cmdQNodes.assignAndIncrementNodesRefCounts(*cmdQ->timestampPacketContainer); - - MockTimestampPacketContainer node1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); - MockTimestampPacketContainer node2(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); - - Event event0(cmdQ.get(), 0, 0, 0); - event0.addTimestampPacketNodes(node1); - Event event1(cmdQ.get(), 0, 0, 0); - event1.addTimestampPacketNodes(node2); - UserEvent userEvent; - Event eventWithoutContainer(nullptr, 0, 0, 0); - - uint32_t numEventsWithContainer = 2; - uint32_t numEventsOnWaitlist = numEventsWithContainer + 2; // UserEvent + eventWithoutContainer - - cl_event waitlist[] = {&event0, &event1, &userEvent, &eventWithoutContainer}; - - cl_event clOutEvent; - cmdQ->enqueueMarkerWithWaitList(numEventsOnWaitlist, waitlist, &clOutEvent); - - auto outEvent = castToObject(clOutEvent); - - EXPECT_EQ(cmdQ->timestampPacketContainer->peekNodes().at(0), cmdQNodes.peekNodes().at(0)); // no new nodes obtained - EXPECT_EQ(1u, cmdQ->timestampPacketContainer->peekNodes().size()); - - auto &eventsNodes = outEvent->getTimestampPacketNodes()->peekNodes(); - EXPECT_EQ(numEventsWithContainer + 1, 
eventsNodes.size()); // numEventsWithContainer + command queue - EXPECT_EQ(cmdQNodes.peekNodes().at(0), eventsNodes.at(0)); - EXPECT_EQ(event0.getTimestampPacketNodes()->peekNodes().at(0), eventsNodes.at(1)); - EXPECT_EQ(event1.getTimestampPacketNodes()->peekNodes().at(0), eventsNodes.at(2)); - - clReleaseEvent(clOutEvent); - userEvent.setStatus(CL_COMPLETE); - cmdQ->isQueueBlocked(); -} - -HWTEST_F(TimestampPacketTests, givenBlockedEnqueueWithoutKernelWhenSubmittingThenDispatchBlockedCommands) { - using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; - - auto mockCsr = new MockCsrHw2(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield()); - device->resetCommandStreamReceiver(mockCsr); - mockCsr->timestampPacketWriteEnabled = true; - mockCsr->storeFlushedTaskStream = true; - - auto cmdQ0 = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); - - auto &secondEngine = device->getEngine(getChosenEngineType(device->getHardwareInfo()), EngineUsage::LowPriority); - static_cast *>(secondEngine.commandStreamReceiver)->timestampPacketWriteEnabled = true; - - auto cmdQ1 = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); - cmdQ1->gpgpuEngine = &secondEngine; - cmdQ1->timestampPacketContainer = std::make_unique(); - EXPECT_NE(&cmdQ0->getGpgpuCommandStreamReceiver(), &cmdQ1->getGpgpuCommandStreamReceiver()); - - MockTimestampPacketContainer node0(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); - MockTimestampPacketContainer node1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); - - Event event0(cmdQ0.get(), 0, 0, 0); // on the same CSR - event0.addTimestampPacketNodes(node0); - Event event1(cmdQ1.get(), 0, 0, 0); // on different CSR - event1.addTimestampPacketNodes(node1); - - uint32_t numEventsOnWaitlist = 3; - - uint32_t commands[] = {CL_COMMAND_MARKER, CL_COMMAND_BARRIER}; - for (int i = 0; i < 2; i++) { - UserEvent userEvent; - 
cl_event waitlist[] = {&event0, &event1, &userEvent}; - if (commands[i] == CL_COMMAND_MARKER) { - cmdQ0->enqueueMarkerWithWaitList(numEventsOnWaitlist, waitlist, nullptr); - } else if (commands[i] == CL_COMMAND_BARRIER) { - cmdQ0->enqueueBarrierWithWaitList(numEventsOnWaitlist, waitlist, nullptr); - } else { - EXPECT_TRUE(false); - } - - auto initialCsrStreamOffset = mockCsr->commandStream.getUsed(); - userEvent.setStatus(CL_COMPLETE); - - HardwareParse hwParserCsr; - HardwareParse hwParserCmdQ; - LinearStream taskStream(mockCsr->storedTaskStream.get(), mockCsr->storedTaskStreamSize); - taskStream.getSpace(mockCsr->storedTaskStreamSize); - hwParserCsr.parseCommands(mockCsr->commandStream, initialCsrStreamOffset); - hwParserCmdQ.parseCommands(taskStream, 0); - - auto queueSemaphores = findAll(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end()); - auto expectedQueueSemaphoresCount = 1u; - if (UnitTestHelper::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) { - expectedQueueSemaphoresCount += 1; - } - EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size()); - verifySemaphore(genCmdCast(*(queueSemaphores[0])), node0.getNode(0), 0); - - auto csrSemaphores = findAll(hwParserCsr.cmdList.begin(), hwParserCsr.cmdList.end()); - EXPECT_EQ(1u, csrSemaphores.size()); - verifySemaphore(genCmdCast(*(csrSemaphores[0])), node1.getNode(0), 0); - - EXPECT_TRUE(mockCsr->passedDispatchFlags.blocking); - EXPECT_TRUE(mockCsr->passedDispatchFlags.guardCommandBufferWithPipeControl); - EXPECT_EQ(device->getPreemptionMode(), mockCsr->passedDispatchFlags.preemptionMode); - - cmdQ0->isQueueBlocked(); - } -} - -HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingMarkerWithoutKernelThenInheritTimestampPacketsAndProgramSemaphores) { - using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; - auto device2 = std::make_unique(Device::create(executionEnvironment, 0u)); - - device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = 
true; - device2->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - MockContext context2(device2.get()); - - auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); - auto cmdQ2 = std::make_unique>(&context2, device2.get(), nullptr); - - MockTimestampPacketContainer node1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); - MockTimestampPacketContainer node2(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); - - Event event0(cmdQ.get(), 0, 0, 0); - event0.addTimestampPacketNodes(node1); - Event event1(cmdQ2.get(), 0, 0, 0); - event1.addTimestampPacketNodes(node2); - - uint32_t numEventsOnWaitlist = 2; - - cl_event waitlist[] = {&event0, &event1}; - - cmdQ->enqueueMarkerWithWaitList(numEventsOnWaitlist, waitlist, nullptr); - - HardwareParse hwParserCsr; - HardwareParse hwParserCmdQ; - hwParserCsr.parseCommands(device->getUltCommandStreamReceiver().commandStream, 0); - hwParserCmdQ.parseCommands(*cmdQ->commandStream, 0); - - auto csrSemaphores = findAll(hwParserCsr.cmdList.begin(), hwParserCsr.cmdList.end()); - EXPECT_EQ(1u, csrSemaphores.size()); - verifySemaphore(genCmdCast(*(csrSemaphores[0])), node2.getNode(0), 0); - - auto queueSemaphores = findAll(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end()); - auto expectedQueueSemaphoresCount = 1u; - if (UnitTestHelper::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) { - expectedQueueSemaphoresCount += 1; - } - EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size()); - verifySemaphore(genCmdCast(*(queueSemaphores[0])), node1.getNode(0), 0); -} - -HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingBarrierWithoutKernelThenInheritTimestampPacketsAndProgramSemaphores) { - using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; - auto device2 = std::make_unique(Device::create(executionEnvironment, 0u)); - - device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = 
true; - device2->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - MockContext context2(device2.get()); - - auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); - auto cmdQ2 = std::make_unique>(&context2, device2.get(), nullptr); - - MockTimestampPacketContainer node1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); - MockTimestampPacketContainer node2(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); - - Event event0(cmdQ.get(), 0, 0, 0); - event0.addTimestampPacketNodes(node1); - Event event1(cmdQ2.get(), 0, 0, 0); - event1.addTimestampPacketNodes(node2); - - uint32_t numEventsOnWaitlist = 2; - - cl_event waitlist[] = {&event0, &event1}; - - cmdQ->enqueueBarrierWithWaitList(numEventsOnWaitlist, waitlist, nullptr); - - HardwareParse hwParserCsr; - HardwareParse hwParserCmdQ; - hwParserCsr.parseCommands(device->getUltCommandStreamReceiver().commandStream, 0); - hwParserCmdQ.parseCommands(*cmdQ->commandStream, 0); - - auto csrSemaphores = findAll(hwParserCsr.cmdList.begin(), hwParserCsr.cmdList.end()); - EXPECT_EQ(1u, csrSemaphores.size()); - verifySemaphore(genCmdCast(*(csrSemaphores[0])), node2.getNode(0), 0); - - auto queueSemaphores = findAll(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end()); - auto expectedQueueSemaphoresCount = 1u; - if (UnitTestHelper::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) { - expectedQueueSemaphoresCount += 1; - } - EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size()); - verifySemaphore(genCmdCast(*(queueSemaphores[0])), node1.getNode(0), 0); -} diff --git a/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp index 9fb750f482..bc8d7a8a9e 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp @@ -341,3 +341,306 @@ HWTEST_F(TimestampPacketTests, 
givenKernelWhichRequiresFlushWhenEnqueueingKernel EXPECT_NE(nullptr, node2); EXPECT_NE(node1, node2); } + +HWTEST_F(TimestampPacketTests, givenEventsWaitlistFromDifferentCSRsWhenEnqueueingThenMakeAllTimestampsResident) { + MockTagAllocator> tagAllocator(device->getRootDeviceIndex(), executionEnvironment->memoryManager.get(), 1, 1, + sizeof(TimestampPackets), false, device->getDeviceBitfield()); + + auto &ultCsr = device->getUltCommandStreamReceiver(); + ultCsr.timestampPacketWriteEnabled = true; + ultCsr.storeMakeResidentAllocations = true; + + auto cmdQ1 = std::make_unique>(context, device.get(), nullptr); + + // Create second (LOW_PRIORITY) queue on the same device + cl_queue_properties props[] = {CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR, 0}; + auto cmdQ2 = std::make_unique>(context, device.get(), props); + cmdQ2->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + + MockTimestampPacketContainer node1(*ultCsr.getTimestampPacketAllocator(), 0); + MockTimestampPacketContainer node2(*ultCsr.getTimestampPacketAllocator(), 0); + + auto tagNode1 = tagAllocator.getTag(); + node1.add(tagNode1); + auto tagNode2 = tagAllocator.getTag(); + node2.add(tagNode2); + + Event event0(cmdQ1.get(), 0, 0, 0); + event0.addTimestampPacketNodes(node1); + Event event1(cmdQ2.get(), 0, 0, 0); + event1.addTimestampPacketNodes(node2); + + cl_event waitlist[] = {&event0, &event1}; + + cmdQ1->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 2, waitlist, nullptr); + + EXPECT_NE(tagNode1->getBaseGraphicsAllocation(), tagNode2->getBaseGraphicsAllocation()); + EXPECT_TRUE(ultCsr.isMadeResident(tagNode1->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), ultCsr.taskCount)); + EXPECT_TRUE(ultCsr.isMadeResident(tagNode2->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), ultCsr.taskCount)); +} + +HWTEST_F(TimestampPacketTests, givenTimestampPacketWhenEnqueueingNonBlockedThenMakeItResident) { + auto &csr = device->getUltCommandStreamReceiver(); + 
csr.timestampPacketWriteEnabled = true; + csr.storeMakeResidentAllocations = true; + + MockKernelWithInternals mockKernel(*device, context); + MockCommandQueueHw cmdQ(context, device.get(), nullptr); + + cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + auto timestampPacketNode = cmdQ.timestampPacketContainer->peekNodes().at(0); + + EXPECT_TRUE(csr.isMadeResident(timestampPacketNode->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), csr.taskCount)); +} + +HWTEST_F(TimestampPacketTests, givenTimestampPacketWhenEnqueueingBlockedThenMakeItResidentOnSubmit) { + auto &csr = device->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = true; + + MockKernelWithInternals mockKernel(*device, context); + + auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); + + csr.storeMakeResidentAllocations = true; + + UserEvent userEvent; + cl_event clEvent = &userEvent; + + cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 1, &clEvent, nullptr); + auto timestampPacketNode = cmdQ->timestampPacketContainer->peekNodes().at(0); + + EXPECT_FALSE(csr.isMadeResident(timestampPacketNode->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), csr.taskCount)); + userEvent.setStatus(CL_COMPLETE); + EXPECT_TRUE(csr.isMadeResident(timestampPacketNode->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), csr.taskCount)); + cmdQ->isQueueBlocked(); +} + +HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEnqueueingBlockedThenVirtualEventIncrementsRefInternalAndDecrementsAfterCompleteEvent) { + auto &csr = device->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = true; + MockKernelWithInternals mockKernelWithInternals(*device, context); + auto mockKernel = mockKernelWithInternals.mockKernel; + auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); + + UserEvent userEvent; + cl_event waitlist = &userEvent; + + auto 
internalCount = userEvent.getRefInternalCount(); + cmdQ->enqueueKernel(mockKernel, 1, nullptr, gws, nullptr, 1, &waitlist, nullptr); + EXPECT_EQ(internalCount + 1, userEvent.getRefInternalCount()); + userEvent.setStatus(CL_COMPLETE); + cmdQ->isQueueBlocked(); + EXPECT_EQ(internalCount, mockKernel->getRefInternalCount()); +} + +TEST_F(TimestampPacketTests, givenDispatchSizeWhenAskingForNewTimestampsThenObtainEnoughTags) { + size_t dispatchSize = 3; + + mockCmdQ->timestampPacketContainer = std::make_unique(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 0); + EXPECT_EQ(0u, mockCmdQ->timestampPacketContainer->peekNodes().size()); + + TimestampPacketContainer previousNodes; + mockCmdQ->obtainNewTimestampPacketNodes(dispatchSize, previousNodes, false, mockCmdQ->getGpgpuCommandStreamReceiver()); + EXPECT_EQ(dispatchSize, mockCmdQ->timestampPacketContainer->peekNodes().size()); +} + +HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingWithoutKernelThenInheritTimestampPacketsWithoutSubmitting) { + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + + auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); + + MockKernelWithInternals mockKernel(*device, context); + cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first TimestampPackets + + TimestampPacketContainer cmdQNodes; + cmdQNodes.assignAndIncrementNodesRefCounts(*cmdQ->timestampPacketContainer); + + MockTimestampPacketContainer node1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + MockTimestampPacketContainer node2(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + + Event event0(cmdQ.get(), 0, 0, 0); + event0.addTimestampPacketNodes(node1); + Event event1(cmdQ.get(), 0, 0, 0); + event1.addTimestampPacketNodes(node2); + UserEvent userEvent; + Event eventWithoutContainer(nullptr, 0, 0, 0); + + uint32_t 
numEventsWithContainer = 2; + uint32_t numEventsOnWaitlist = numEventsWithContainer + 2; // UserEvent + eventWithoutContainer + + cl_event waitlist[] = {&event0, &event1, &userEvent, &eventWithoutContainer}; + + cl_event clOutEvent; + cmdQ->enqueueMarkerWithWaitList(numEventsOnWaitlist, waitlist, &clOutEvent); + + auto outEvent = castToObject(clOutEvent); + + EXPECT_EQ(cmdQ->timestampPacketContainer->peekNodes().at(0), cmdQNodes.peekNodes().at(0)); // no new nodes obtained + EXPECT_EQ(1u, cmdQ->timestampPacketContainer->peekNodes().size()); + + auto &eventsNodes = outEvent->getTimestampPacketNodes()->peekNodes(); + EXPECT_EQ(numEventsWithContainer + 1, eventsNodes.size()); // numEventsWithContainer + command queue + EXPECT_EQ(cmdQNodes.peekNodes().at(0), eventsNodes.at(0)); + EXPECT_EQ(event0.getTimestampPacketNodes()->peekNodes().at(0), eventsNodes.at(1)); + EXPECT_EQ(event1.getTimestampPacketNodes()->peekNodes().at(0), eventsNodes.at(2)); + + clReleaseEvent(clOutEvent); + userEvent.setStatus(CL_COMPLETE); + cmdQ->isQueueBlocked(); +} + +HWTEST_F(TimestampPacketTests, givenBlockedEnqueueWithoutKernelWhenSubmittingThenDispatchBlockedCommands) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto mockCsr = new MockCsrHw2(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield()); + device->resetCommandStreamReceiver(mockCsr); + mockCsr->timestampPacketWriteEnabled = true; + mockCsr->storeFlushedTaskStream = true; + + auto cmdQ0 = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); + + auto &secondEngine = device->getEngine(getChosenEngineType(device->getHardwareInfo()), EngineUsage::LowPriority); + static_cast *>(secondEngine.commandStreamReceiver)->timestampPacketWriteEnabled = true; + + auto cmdQ1 = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); + cmdQ1->gpgpuEngine = &secondEngine; + cmdQ1->timestampPacketContainer = std::make_unique(); + 
EXPECT_NE(&cmdQ0->getGpgpuCommandStreamReceiver(), &cmdQ1->getGpgpuCommandStreamReceiver()); + + MockTimestampPacketContainer node0(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + MockTimestampPacketContainer node1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + + Event event0(cmdQ0.get(), 0, 0, 0); // on the same CSR + event0.addTimestampPacketNodes(node0); + Event event1(cmdQ1.get(), 0, 0, 0); // on different CSR + event1.addTimestampPacketNodes(node1); + + uint32_t numEventsOnWaitlist = 3; + + uint32_t commands[] = {CL_COMMAND_MARKER, CL_COMMAND_BARRIER}; + for (int i = 0; i < 2; i++) { + UserEvent userEvent; + cl_event waitlist[] = {&event0, &event1, &userEvent}; + if (commands[i] == CL_COMMAND_MARKER) { + cmdQ0->enqueueMarkerWithWaitList(numEventsOnWaitlist, waitlist, nullptr); + } else if (commands[i] == CL_COMMAND_BARRIER) { + cmdQ0->enqueueBarrierWithWaitList(numEventsOnWaitlist, waitlist, nullptr); + } else { + EXPECT_TRUE(false); + } + + auto initialCsrStreamOffset = mockCsr->commandStream.getUsed(); + userEvent.setStatus(CL_COMPLETE); + + HardwareParse hwParserCsr; + HardwareParse hwParserCmdQ; + LinearStream taskStream(mockCsr->storedTaskStream.get(), mockCsr->storedTaskStreamSize); + taskStream.getSpace(mockCsr->storedTaskStreamSize); + hwParserCsr.parseCommands(mockCsr->commandStream, initialCsrStreamOffset); + hwParserCmdQ.parseCommands(taskStream, 0); + + auto queueSemaphores = findAll(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end()); + auto expectedQueueSemaphoresCount = 1u; + if (UnitTestHelper::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) { + expectedQueueSemaphoresCount += 1; + } + EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size()); + verifySemaphore(genCmdCast(*(queueSemaphores[0])), node0.getNode(0), 0); + + auto csrSemaphores = findAll(hwParserCsr.cmdList.begin(), hwParserCsr.cmdList.end()); + EXPECT_EQ(1u, csrSemaphores.size()); + 
verifySemaphore(genCmdCast(*(csrSemaphores[0])), node1.getNode(0), 0); + + EXPECT_TRUE(mockCsr->passedDispatchFlags.blocking); + EXPECT_TRUE(mockCsr->passedDispatchFlags.guardCommandBufferWithPipeControl); + EXPECT_EQ(device->getPreemptionMode(), mockCsr->passedDispatchFlags.preemptionMode); + + cmdQ0->isQueueBlocked(); + } +} + +HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingMarkerWithoutKernelThenInheritTimestampPacketsAndProgramSemaphores) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + auto device2 = std::make_unique(Device::create(executionEnvironment, 0u)); + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + device2->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + MockContext context2(device2.get()); + + auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); + auto cmdQ2 = std::make_unique>(&context2, device2.get(), nullptr); + + MockTimestampPacketContainer node1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + MockTimestampPacketContainer node2(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + + Event event0(cmdQ.get(), 0, 0, 0); + event0.addTimestampPacketNodes(node1); + Event event1(cmdQ2.get(), 0, 0, 0); + event1.addTimestampPacketNodes(node2); + + uint32_t numEventsOnWaitlist = 2; + + cl_event waitlist[] = {&event0, &event1}; + + cmdQ->enqueueMarkerWithWaitList(numEventsOnWaitlist, waitlist, nullptr); + + HardwareParse hwParserCsr; + HardwareParse hwParserCmdQ; + hwParserCsr.parseCommands(device->getUltCommandStreamReceiver().commandStream, 0); + hwParserCmdQ.parseCommands(*cmdQ->commandStream, 0); + + auto csrSemaphores = findAll(hwParserCsr.cmdList.begin(), hwParserCsr.cmdList.end()); + EXPECT_EQ(1u, csrSemaphores.size()); + verifySemaphore(genCmdCast(*(csrSemaphores[0])), node2.getNode(0), 0); + + auto queueSemaphores = findAll(hwParserCmdQ.cmdList.begin(), 
hwParserCmdQ.cmdList.end()); + auto expectedQueueSemaphoresCount = 1u; + if (UnitTestHelper::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) { + expectedQueueSemaphoresCount += 1; + } + EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size()); + verifySemaphore(genCmdCast(*(queueSemaphores[0])), node1.getNode(0), 0); +} + +HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingBarrierWithoutKernelThenInheritTimestampPacketsAndProgramSemaphores) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + auto device2 = std::make_unique(Device::create(executionEnvironment, 0u)); + + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + device2->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + MockContext context2(device2.get()); + + auto cmdQ = clUniquePtr(new MockCommandQueueHw(context, device.get(), nullptr)); + auto cmdQ2 = std::make_unique>(&context2, device2.get(), nullptr); + + MockTimestampPacketContainer node1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + MockTimestampPacketContainer node2(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1); + + Event event0(cmdQ.get(), 0, 0, 0); + event0.addTimestampPacketNodes(node1); + Event event1(cmdQ2.get(), 0, 0, 0); + event1.addTimestampPacketNodes(node2); + + uint32_t numEventsOnWaitlist = 2; + + cl_event waitlist[] = {&event0, &event1}; + + cmdQ->enqueueBarrierWithWaitList(numEventsOnWaitlist, waitlist, nullptr); + + HardwareParse hwParserCsr; + HardwareParse hwParserCmdQ; + hwParserCsr.parseCommands(device->getUltCommandStreamReceiver().commandStream, 0); + hwParserCmdQ.parseCommands(*cmdQ->commandStream, 0); + + auto csrSemaphores = findAll(hwParserCsr.cmdList.begin(), hwParserCsr.cmdList.end()); + EXPECT_EQ(1u, csrSemaphores.size()); + verifySemaphore(genCmdCast(*(csrSemaphores[0])), node2.getNode(0), 0); + + auto queueSemaphores = 
findAll(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end()); + auto expectedQueueSemaphoresCount = 1u; + if (UnitTestHelper::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) { + expectedQueueSemaphoresCount += 1; + } + EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size()); + verifySemaphore(genCmdCast(*(queueSemaphores[0])), node1.getNode(0), 0); +} diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index 24a24beb98..d09ff31260 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -147,7 +147,7 @@ struct TimestampPacketHelper { template static void programCsrDependenciesForForTaskCountContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) { - auto taskCountContainer = csrDependencies.taskCountContainer; + auto &taskCountContainer = csrDependencies.taskCountContainer; for (auto &[taskCountPreviousRootDevice, tagAddressPreviousRootDevice] : taskCountContainer) { using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;