From 1399e55df7fafbb8365f897891471c352696e13c Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Mon, 9 Mar 2020 13:48:30 +0100 Subject: [PATCH] Flush cache for blit aux translation Change-Id: I108273bee286cdeed06e0c287945099cea481a73 Signed-off-by: Bartosz Dunajski --- .../built_ins/aux_translation_builtin.h | 4 +- .../command_queue/command_queue_hw_base.inl | 4 +- opencl/source/command_queue/enqueue_common.h | 17 +-- .../command_queue/gpgpu_walker_base.inl | 9 +- .../command_queue/hardware_interface_base.inl | 4 +- opencl/source/helpers/dispatch_info.h | 4 +- .../command_queue/blit_enqueue_tests.cpp | 108 +++++++++++++++++- .../helpers/timestamp_packet_tests.cpp | 49 ++++---- .../test/unit_test/mocks/mock_command_queue.h | 11 ++ .../source/helpers/blit_commands_helper.cpp | 1 + shared/source/helpers/timestamp_packet.h | 27 ++++- 11 files changed, 185 insertions(+), 53 deletions(-) diff --git a/opencl/source/built_ins/aux_translation_builtin.h b/opencl/source/built_ins/aux_translation_builtin.h index 114f98d7b7..64fa43abe5 100644 --- a/opencl/source/built_ins/aux_translation_builtin.h +++ b/opencl/source/built_ins/aux_translation_builtin.h @@ -64,12 +64,12 @@ class BuiltInOp : public BuiltinDispatchInfoBuilder using RegisteredMethodDispatcherT = RegisteredMethodDispatcher; template - static void dispatchPipeControl(LinearStream &linearStream, TimestampPacketDependencies *) { + static void dispatchPipeControl(LinearStream &linearStream, TimestampPacketDependencies *, const HardwareInfo &) { MemorySynchronizationCommands::addPipeControl(linearStream, dcFlush); } template - static size_t getSizeForSinglePipeControl(size_t) { + static size_t getSizeForSinglePipeControl(size_t, const HardwareInfo &, bool) { return MemorySynchronizationCommands::getSizeForSinglePipeControl(); } diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index ba1235815d..9fe258be33 100644 --- 
a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -134,13 +134,13 @@ void CommandQueueHw::setupBlitAuxTranslation(MultiDispatchInfo &multiDis TimestampPacketHelper::programSemaphoreWithImplicitDependencyForAuxTranslation); multiDispatchInfo.begin()->dispatchInitCommands.registerCommandsSizeEstimationMethod( - TimestampPacketHelper::getRequiredCmdStreamSizeForAuxTranslationNodeDependency); + TimestampPacketHelper::getRequiredCmdStreamSizeForAuxTranslationNodeDependency); multiDispatchInfo.rbegin()->dispatchEpilogueCommands.registerMethod( TimestampPacketHelper::programSemaphoreWithImplicitDependencyForAuxTranslation); multiDispatchInfo.rbegin()->dispatchEpilogueCommands.registerCommandsSizeEstimationMethod( - TimestampPacketHelper::getRequiredCmdStreamSizeForAuxTranslationNodeDependency); + TimestampPacketHelper::getRequiredCmdStreamSizeForAuxTranslationNodeDependency); } template diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index ece9fd4720..8e715908a5 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -198,8 +198,11 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, CsrDependencies csrDeps; BlitPropertiesContainer blitPropertiesContainer; + bool enqueueWithBlitAuxTranslation = HwHelperHw::isBlitAuxTranslationRequired(device->getHardwareInfo(), multiDispatchInfo); + if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { eventsRequest.fillCsrDependencies(csrDeps, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); + auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); size_t nodesCount = 0u; if (blitEnqueue || isCacheFlushCommand(commandType)) { @@ -208,15 +211,12 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, nodesCount = 
estimateTimestampPacketNodesCount(multiDispatchInfo); } - if (blitEnqueue) { - auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); + if (isCacheFlushForBcsRequired() && (blitEnqueue || enqueueWithBlitAuxTranslation)) { + timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag()); + } - if (isCacheFlushForBcsRequired()) { - timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag()); - } - if (!blockQueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) { - timestampPacketDependencies.barrierNodes.add(allocator->getTag()); - } + if (blitEnqueue && !blockQueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) { + timestampPacketDependencies.barrierNodes.add(allocator->getTag()); } if (nodesCount > 0) { @@ -673,6 +673,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( if (timestampPacketContainer) { timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver()); timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver()); + timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver()); } bool anyUncacheableArgs = false; diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl index 32c77ac624..8530258c89 100644 --- a/opencl/source/command_queue/gpgpu_walker_base.inl +++ b/opencl/source/command_queue/gpgpu_walker_base.inl @@ -211,11 +211,10 @@ void GpgpuWalkerHelper::adjustMiStoreRegMemMode(MI_STORE_REG_MEM size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) { size_t expectedSizeCS = 0; + auto &hwInfo = commandQueue.getDevice().getHardwareInfo(); + auto &commandQueueHw = static_cast &>(commandQueue); if (blitEnqueue) { - auto &hwInfo = 
commandQueue.getDevice().getHardwareInfo(); - auto &commandQueueHw = static_cast &>(commandQueue); - size_t expectedSizeCS = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); if (commandQueueHw.isCacheFlushForBcsRequired()) { expectedSizeCS += MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo); @@ -228,8 +227,8 @@ size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, c for (auto &dispatchInfo : multiDispatchInfo) { expectedSizeCS += EnqueueOperation::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel()); size_t memObjAuxCount = multiDispatchInfo.getMemObjsForAuxTranslation() != nullptr ? multiDispatchInfo.getMemObjsForAuxTranslation()->size() : 0; - expectedSizeCS += dispatchInfo.dispatchInitCommands.estimateCommandsSize(memObjAuxCount); - expectedSizeCS += dispatchInfo.dispatchEpilogueCommands.estimateCommandsSize(memObjAuxCount); + expectedSizeCS += dispatchInfo.dispatchInitCommands.estimateCommandsSize(memObjAuxCount, hwInfo, commandQueueHw.isCacheFlushForBcsRequired()); + expectedSizeCS += dispatchInfo.dispatchEpilogueCommands.estimateCommandsSize(memObjAuxCount, hwInfo, commandQueueHw.isCacheFlushForBcsRequired()); } if (parentKernel) { SchedulerKernel &scheduler = commandQueue.getContext().getSchedulerKernel(); diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index eb2665be70..e1e6dd7f25 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -83,7 +83,7 @@ void HardwareInterface::dispatchWalker( size_t currentDispatchIndex = 0; for (auto &dispatchInfo : multiDispatchInfo) { - dispatchInfo.dispatchInitCommands(*commandStream, timestampPacketDependencies); + dispatchInfo.dispatchInitCommands(*commandStream, timestampPacketDependencies, 
commandQueue.getDevice().getHardwareInfo()); bool isMainKernel = (dispatchInfo.getKernel() == mainKernel); dispatchKernelCommands(commandQueue, dispatchInfo, commandType, *commandStream, isMainKernel, @@ -91,7 +91,7 @@ void HardwareInterface::dispatchWalker( offsetInterfaceDescriptorTable, *dsh, *ioh, *ssh); currentDispatchIndex++; - dispatchInfo.dispatchEpilogueCommands(*commandStream, timestampPacketDependencies); + dispatchInfo.dispatchEpilogueCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo()); } if (mainKernel->requiresCacheFlushCommand(commandQueue)) { uint64_t postSyncAddress = 0; diff --git a/opencl/source/helpers/dispatch_info.h b/opencl/source/helpers/dispatch_info.h index 40f46c0c80..ffd1dd8d39 100644 --- a/opencl/source/helpers/dispatch_info.h +++ b/opencl/source/helpers/dispatch_info.h @@ -26,8 +26,8 @@ struct TimestampPacketDependencies; class DispatchInfo { public: - using DispatchCommandMethodT = void(LinearStream &commandStream, TimestampPacketDependencies *timestampPacketDependencies); - using EstimateCommandsMethodT = size_t(size_t); + using DispatchCommandMethodT = void(LinearStream &commandStream, TimestampPacketDependencies *timestampPacketDependencies, const HardwareInfo &); + using EstimateCommandsMethodT = size_t(size_t, const HardwareInfo &, bool); DispatchInfo() = default; DispatchInfo(Kernel *kernel, uint32_t dim, Vec3 gws, Vec3 elws, Vec3 offset) diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp index df9eeb9b48..13d5f1efa3 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp @@ -406,6 +406,9 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing setMockKernelArgs(std::array{{buffer.get()}}); auto mockCmdQ = static_cast *>(commandQueue.get()); + mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = 
true; + mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = false; + mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); auto kernelNode = mockCmdQ->timestampPacketContainer->peekNodes()[0]; @@ -421,6 +424,54 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing verifySemaphore(semaphore, kernelNodeAddress); } +HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeCacheFlush) { + using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + + auto buffer = createBuffer(1, true); + setMockKernelArgs(std::array{{buffer.get()}}); + + auto mockCmdQ = static_cast *>(commandQueue.get()); + mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true; + mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true; + mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + + auto cmdListBcs = getCmdList(bcsCsr->getCS(0)); + auto cmdListQueue = getCmdList(mockCmdQ->getCS(0)); + + uint64_t cacheFlushWriteAddress = 0; + + { + auto cmdFound = expectCommand(cmdListQueue.begin(), cmdListQueue.end()); + cmdFound = expectPipeControl(++cmdFound, cmdListQueue.end()); + + auto pipeControlCmd = genCmdCast(*cmdFound); + if (!pipeControlCmd->getDcFlushEnable()) { + // skip pipe control with TimestampPacket write + cmdFound = expectPipeControl(++cmdFound, cmdListQueue.end()); + pipeControlCmd = genCmdCast(*cmdFound); + } + + EXPECT_TRUE(pipeControlCmd->getDcFlushEnable()); + EXPECT_TRUE(pipeControlCmd->getCommandStreamerStallEnable()); + uint64_t low = pipeControlCmd->getAddress(); + uint64_t high = pipeControlCmd->getAddressHigh(); + cacheFlushWriteAddress = (high << 32) | low; + EXPECT_NE(0u, cacheFlushWriteAddress); + } + + { + // Aux to 
nonAux + auto cmdFound = expectCommand(cmdListBcs.begin(), cmdListBcs.end()); + + // semaphore before NonAux to Aux + cmdFound = expectCommand(++cmdFound, cmdListBcs.end()); + verifySemaphore(cmdFound, cacheFlushWriteAddress); + } +} + HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeEvents) { using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; @@ -487,6 +538,11 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenDispatchi using WALKER_TYPE = typename FamilyType::WALKER_TYPE; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + auto &hwInfo = device->getHardwareInfo(); + auto mockCmdQ = static_cast *>(commandQueue.get()); + mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true; + mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = false; + auto buffer0 = createBuffer(1, true); auto buffer1 = createBuffer(1, false); auto buffer2 = createBuffer(1, true); @@ -500,7 +556,6 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenDispatchi setMockKernelArgs(std::array{{buffer0.get(), buffer1.get(), buffer2.get()}}); - auto mockCmdQ = static_cast *>(commandQueue.get()); mockCmdQ->storeMultiDispatchInfo = true; mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); @@ -510,11 +565,51 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenDispatchi EXPECT_NE(firstDispatchInfo, lastDispatchInfo); // walker split - EXPECT_EQ(dependencySize, firstDispatchInfo->dispatchInitCommands.estimateCommandsSize(memObjects.size())); - EXPECT_EQ(0u, firstDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(memObjects.size())); + EXPECT_EQ(dependencySize, firstDispatchInfo->dispatchInitCommands.estimateCommandsSize(memObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); + EXPECT_EQ(0u, 
firstDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(memObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); - EXPECT_EQ(0u, lastDispatchInfo->dispatchInitCommands.estimateCommandsSize(memObjects.size())); - EXPECT_EQ(dependencySize, lastDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(memObjects.size())); + EXPECT_EQ(0u, lastDispatchInfo->dispatchInitCommands.estimateCommandsSize(memObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); + EXPECT_EQ(dependencySize, lastDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(memObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); +} + +HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWithRequiredCacheFlushWhenDispatchingThenEstimateCmdBufferSize) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto &hwInfo = device->getHardwareInfo(); + auto mockCmdQ = static_cast *>(commandQueue.get()); + mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true; + mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true; + + auto buffer0 = createBuffer(1, true); + auto buffer1 = createBuffer(1, false); + auto buffer2 = createBuffer(1, true); + + MemObjsForAuxTranslation memObjects; + memObjects.insert(buffer0.get()); + memObjects.insert(buffer2.get()); + + size_t numBuffersToEstimate = 2; + size_t dependencySize = numBuffersToEstimate * TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); + + size_t cacheFlushSize = MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo); + + setMockKernelArgs(std::array{{buffer0.get(), buffer1.get(), buffer2.get()}}); + + mockCmdQ->storeMultiDispatchInfo = true; + mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); + + MultiDispatchInfo &multiDispatchInfo = mockCmdQ->storedMultiDispatchInfo; + DispatchInfo *firstDispatchInfo = 
multiDispatchInfo.begin(); + DispatchInfo *lastDispatchInfo = &(*multiDispatchInfo.rbegin()); + + EXPECT_NE(firstDispatchInfo, lastDispatchInfo); // walker split + + EXPECT_EQ(dependencySize, firstDispatchInfo->dispatchInitCommands.estimateCommandsSize(memObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); + EXPECT_EQ(0u, firstDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(memObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); + + EXPECT_EQ(0u, lastDispatchInfo->dispatchInitCommands.estimateCommandsSize(memObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); + EXPECT_EQ(dependencySize + cacheFlushSize, lastDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(memObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingBlockedCommandBufferThenSynchronizeBarrier) { @@ -603,6 +698,9 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing // semaphore before NonAux to Aux auto semaphore = expectCommand(++cmdFound, cmdList.end()); + if (mockCmdQ->isCacheFlushForBcsRequired()) { + semaphore = expectCommand(++semaphore, cmdList.end()); + } verifySemaphore(semaphore, kernelNodeAddress); EXPECT_FALSE(commandQueue->isQueueBlocked()); diff --git a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp index 346c37ade0..64c076b9d5 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_tests.cpp @@ -319,14 +319,15 @@ HWTEST_F(TimestampPacketTests, givenDebugFlagSetWhenCreatingTimestampPacketAlloc HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStreamSizeThenAddPipeControl) { MockKernelWithInternals kernel2(*device); MockMultiDispatchInfo multiDispatchInfo(std::vector({kernel->mockKernel, kernel2.mockKernel})); + auto mockCmdQHw 
= std::make_unique>(context, device.get(), nullptr); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(*mockCmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0); - auto sizeWithDisabled = mockCmdQ->requestedCmdStreamSize; + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0); + auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; - getCommandStream(*mockCmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0); - auto sizeWithEnabled = mockCmdQ->requestedCmdStreamSize; + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0); + auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; auto extendedSize = sizeWithDisabled + sizeof(typename FamilyType::PIPE_CONTROL); EXPECT_EQ(sizeWithEnabled, extendedSize); @@ -334,12 +335,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimatingStreamSizeDontDontAddAdditionalSize) { MockMultiDispatchInfo multiDispatchInfo(std::vector({kernel->mockKernel})); - mockCmdQ->setOoqEnabled(); + auto mockCmdQHw = std::make_unique>(context, device.get(), nullptr); + mockCmdQHw->setOoqEnabled(); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - getCommandStream(*mockCmdQ, CsrDependencies(), false, false, + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0); - auto sizeWithDisabled = mockCmdQ->requestedCmdStreamSize; + auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; @@ -349,15 +351,15 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat MockTimestampPacketContainer 
timestamp4(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 4); MockTimestampPacketContainer timestamp5(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 5); - Event event1(mockCmdQ, 0, 0, 0); + Event event1(mockCmdQHw.get(), 0, 0, 0); event1.addTimestampPacketNodes(timestamp1); - Event event2(mockCmdQ, 0, 0, 0); + Event event2(mockCmdQHw.get(), 0, 0, 0); event2.addTimestampPacketNodes(timestamp2); - Event event3(mockCmdQ, 0, 0, 0); + Event event3(mockCmdQHw.get(), 0, 0, 0); event3.addTimestampPacketNodes(timestamp3); - Event event4(mockCmdQ, 0, 0, 0); + Event event4(mockCmdQHw.get(), 0, 0, 0); event4.addTimestampPacketNodes(timestamp4); - Event event5(mockCmdQ, 0, 0, 0); + Event event5(mockCmdQHw.get(), 0, 0, 0); event5.addTimestampPacketNodes(timestamp5); const cl_uint numEventsOnWaitlist = 5; @@ -368,8 +370,8 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat eventsRequest.fillCsrDependencies( csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - getCommandStream(*mockCmdQ, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0); - auto sizeWithEnabled = mockCmdQ->requestedCmdStreamSize; + getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0); + auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; for (auto timestampPacketContainer : csrDeps) { @@ -385,10 +387,11 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStreamSizeWithWaitlistThenAddSizeForSemaphores) { MockKernelWithInternals kernel2(*device); MockMultiDispatchInfo multiDispatchInfo(std::vector({kernel->mockKernel, kernel2.mockKernel})); + auto mockCmdQHw = std::make_unique>(context, device.get(), nullptr); device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; - 
getCommandStream(*mockCmdQ, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0); - auto sizeWithDisabled = mockCmdQ->requestedCmdStreamSize; + getCommandStream(*mockCmdQHw, CsrDependencies(), false, false, false, multiDispatchInfo, nullptr, 0); + auto sizeWithDisabled = mockCmdQHw->requestedCmdStreamSize; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; @@ -398,15 +401,15 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr MockTimestampPacketContainer timestamp4(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 4); MockTimestampPacketContainer timestamp5(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 5); - Event event1(mockCmdQ, 0, 0, 0); + Event event1(mockCmdQHw.get(), 0, 0, 0); event1.addTimestampPacketNodes(timestamp1); - Event event2(mockCmdQ, 0, 0, 0); + Event event2(mockCmdQHw.get(), 0, 0, 0); event2.addTimestampPacketNodes(timestamp2); - Event event3(mockCmdQ, 0, 0, 0); + Event event3(mockCmdQHw.get(), 0, 0, 0); event3.addTimestampPacketNodes(timestamp3); - Event event4(mockCmdQ, 0, 0, 0); + Event event4(mockCmdQHw.get(), 0, 0, 0); event4.addTimestampPacketNodes(timestamp4); - Event event5(mockCmdQ, 0, 0, 0); + Event event5(mockCmdQHw.get(), 0, 0, 0); event5.addTimestampPacketNodes(timestamp5); const cl_uint numEventsOnWaitlist = 5; @@ -416,8 +419,8 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr CsrDependencies csrDeps; eventsRequest.fillCsrDependencies(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); - getCommandStream(*mockCmdQ, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0); - auto sizeWithEnabled = mockCmdQ->requestedCmdStreamSize; + getCommandStream(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0); + auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize; size_t sizeForNodeDependency = 0; for (auto 
timestampPacketContainer : csrDeps) { diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index edf1d236d3..9f6df29afe 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -188,6 +188,7 @@ class MockCommandQueueHw : public CommandQueueHw { public: using BaseClass::bcsEngine; using BaseClass::bcsTaskCount; + using BaseClass::commandQueueProperties; using BaseClass::commandStream; using BaseClass::gpgpuEngine; using BaseClass::obtainCommandStream; @@ -201,6 +202,15 @@ class MockCommandQueueHw : public CommandQueueHw { cl_queue_properties *properties) : BaseClass(context, device, properties, false) { } + void setOoqEnabled() { + commandQueueProperties |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + } + + LinearStream &getCS(size_t minRequiredSize) override { + requestedCmdStreamSize = minRequiredSize; + return CommandQueue::getCS(minRequiredSize); + } + UltCommandStreamReceiver &getUltCommandStreamReceiver() { return reinterpret_cast &>(*BaseClass::gpgpuEngine->commandStreamReceiver); } @@ -276,6 +286,7 @@ class MockCommandQueueHw : public CommandQueueHw { MultiDispatchInfo storedMultiDispatchInfo; size_t EnqueueWriteImageCounter = 0; size_t EnqueueWriteBufferCounter = 0; + size_t requestedCmdStreamSize = 0; bool blockingWriteBuffer = false; bool storeMultiDispatchInfo = false; bool notifyEnqueueReadBufferCalled = false; diff --git a/shared/source/helpers/blit_commands_helper.cpp b/shared/source/helpers/blit_commands_helper.cpp index d35bd12e4d..6e2fa277dc 100644 --- a/shared/source/helpers/blit_commands_helper.cpp +++ b/shared/source/helpers/blit_commands_helper.cpp @@ -119,6 +119,7 @@ void BlitProperties::setupDependenciesForAuxTranslation(BlitPropertiesContainer } // wait for NDR before NonAuxToAux + blitPropertiesContainer[numObjects].csrDependencies.push_back(&timestampPacketDependencies.cacheFlushNodes);
blitPropertiesContainer[numObjects].csrDependencies.push_back(&kernelTimestamps); } diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index fa25c567fc..1c2ce44975 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -10,6 +10,7 @@ #include "shared/source/command_container/command_encoder.h" #include "shared/source/command_stream/csr_deps.h" #include "shared/source/helpers/aux_translation.h" +#include "shared/source/helpers/hw_helper.h" #include "shared/source/helpers/non_copyable_or_moveable.h" #include "shared/source/utilities/tag_allocator.h" @@ -137,19 +138,37 @@ struct TimestampPacketHelper { template static void programSemaphoreWithImplicitDependencyForAuxTranslation(LinearStream &cmdStream, - const TimestampPacketDependencies *timestampPacketDependencies) { + const TimestampPacketDependencies *timestampPacketDependencies, + const HardwareInfo &hwInfo) { auto &container = (auxTranslationDirection == AuxTranslationDirection::AuxToNonAux) ? 
timestampPacketDependencies->auxToNonAuxNodes : timestampPacketDependencies->nonAuxToAuxNodes; + // cache flush after NDR, before NonAuxToAux + if (auxTranslationDirection == AuxTranslationDirection::NonAuxToAux && timestampPacketDependencies->cacheFlushNodes.peekNodes().size() > 0) { + UNRECOVERABLE_IF(timestampPacketDependencies->cacheFlushNodes.peekNodes().size() != 1); + auto cacheFlushTimestampPacketGpuAddress = timestampPacketDependencies->cacheFlushNodes.peekNodes()[0]->getGpuAddress() + + offsetof(TimestampPacketStorage, packets[0].contextEnd); + + MemorySynchronizationCommands::obtainPipeControlAndProgramPostSyncOperation( + cmdStream, GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + cacheFlushTimestampPacketGpuAddress, 0, true, hwInfo); + } + for (auto &node : container.peekNodes()) { TimestampPacketHelper::programSemaphoreWithImplicitDependency(cmdStream, *node); } } - template - static size_t getRequiredCmdStreamSizeForAuxTranslationNodeDependency(size_t count) { - return count * TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); + template + static size_t getRequiredCmdStreamSizeForAuxTranslationNodeDependency(size_t count, const HardwareInfo &hwInfo, bool cacheFlushForBcsRequired) { + size_t size = count * TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); + + if (auxTranslationDirection == AuxTranslationDirection::NonAuxToAux && cacheFlushForBcsRequired) { + size += MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo); + } + + return size; } template