diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 2be3ecfbf9..36c7013b3b 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -9,6 +9,7 @@ #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/command_stream/preemption.h" #include "shared/source/device/device.h" +#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h" #include "shared/source/helpers/engine_control.h" #include "shared/source/helpers/hw_info.h" #include "shared/source/os_interface/os_context.h" @@ -401,7 +402,8 @@ class CommandQueueHw : public CommandQueue { EventBuilder &externalEventBuilder, std::unique_ptr &&printfHandler, CommandStreamReceiver *bcsCsr, - TagNodeBase *multiRootDeviceSyncNode); + TagNodeBase *multiRootDeviceSyncNode, + CsrDependencyContainer *csrDependencies); CompletionStamp enqueueCommandWithoutKernel(Surface **surfaces, size_t surfaceCount, @@ -449,6 +451,7 @@ class CommandQueueHw : public CommandQueue { protected: MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){}; + MOCKABLE_VIRTUAL bool prepareCsrDependency(CsrDependencies &csrDeps, CsrDependencyContainer &dependencyTags, TimestampPacketDependencies &timestampPacketDependencies, TagAllocatorBase *allocator, bool blockQueue); size_t calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image); cl_int enqueueReadWriteBufferOnCpuWithMemoryTransfer(cl_command_type commandType, Buffer *buffer, diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index afe7559980..fce4fbdb7d 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ 
b/opencl/source/command_queue/enqueue_common.h @@ -9,6 +9,7 @@ #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/command_stream/wait_status.h" #include "shared/source/direct_submission/relaxed_ordering_helper.h" +#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h" #include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/flat_batch_buffer_helper.h" #include "shared/source/helpers/flush_stamp.h" @@ -457,7 +458,8 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, eventBuilder, std::move(printfHandler), nullptr, - multiRootEventSyncStamp); + multiRootEventSyncStamp, + nullptr); } if (deferredTimestampPackets.get()) { @@ -994,7 +996,8 @@ void CommandQueueHw::enqueueBlocked( EventBuilder &externalEventBuilder, std::unique_ptr &&printfHandler, CommandStreamReceiver *bcsCsr, - TagNodeBase *multiRootDeviceSyncNode) { + TagNodeBase *multiRootDeviceSyncNode, + CsrDependencyContainer *dependencyTags) { TakeOwnershipWrapper> queueOwnership(*this); @@ -1033,9 +1036,8 @@ void CommandQueueHw::enqueueBlocked( storeTimestampPackets = (timestampPacketContainer != nullptr); } - if (enqueueProperties.operation != EnqueueProperties::Operation::gpuKernel) { - command = std::make_unique(*this, blockedCommandsData); + command = std::make_unique(*this, blockedCommandsData, dependencyTags); } else { // store task data in event std::vector allSurfaces; @@ -1244,6 +1246,23 @@ size_t CommandQueueHw::calculateHostPtrSizeForImage(const size_t *reg return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type); } +template +bool CommandQueueHw::prepareCsrDependency(CsrDependencies &csrDeps, CsrDependencyContainer &dependencyTags, TimestampPacketDependencies &timestampPacketDependencies, TagAllocatorBase *allocator, bool blockQueue) { + for (auto &dependentCsr : csrDeps.csrWithMultiEngineDependencies) { + auto tag = allocator->getTag(); + 
timestampPacketDependencies.multiCsrDependencies.add(tag); + if (!blockQueue) { + bool submitStatus = dependentCsr->submitDependencyUpdate(tag); + if (!submitStatus) { + return submitStatus; + } + } else { + dependencyTags.push_back(std::make_pair(dependentCsr, tag)); + } + } + return true; +} + template bool CommandQueueHw::isSplitEnqueueBlitNeeded(TransferDirection transferDirection, size_t transferSize, CommandStreamReceiver &csr) { auto bcsSplit = getDevice().isBcsSplitSupported() && @@ -1438,14 +1457,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp if (isCacheFlushForBcsRequired() && gpgpuSubmission) { timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag()); } - for (auto &dependentCsr : csrDeps.csrWithMultiEngineDependencies) { - auto tag = allocator->getTag(); - timestampPacketDependencies.multiCsrDependencies.add(tag); - bool submitStatus = dependentCsr->submitDependencyUpdate(tag); - if (!submitStatus) { - return CL_OUT_OF_RESOURCES; - } - } + obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr); csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes); @@ -1472,6 +1484,13 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp gpgpuCommandStream = obtainCommandStream(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false, false); gpgpuCommandStreamStart = gpgpuCommandStream->getUsed(); } + CsrDependencyContainer dependencyTags; + if (csrDeps.csrWithMultiEngineDependencies.size() > 0) { + bool submitStatus = prepareCsrDependency(csrDeps, dependencyTags, timestampPacketDependencies, allocator, blockQueue); + if (!submitStatus) { + return CL_OUT_OF_RESOURCES; + } + } blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies, eventsRequest, gpgpuCommandStream, cmdType, blockQueue, multiRootEventSyncStamp)); @@ 
-1501,7 +1520,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp updateFromCompletionStamp(completionStamp, pEventBuilder->getEvent()); if (blockQueue) { - enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, *pEventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp); + enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, *pEventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp, &dependencyTags); if (gpgpuSubmission) { if (debugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) { diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index a58cc99dfe..141328b5c4 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -130,7 +130,7 @@ CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::uniq bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr &&printfHandler, PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, TagNodeBase *multiRootDeviceSyncNode) - : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM), + : Command(commandQueue, kernelOperation, nullptr), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM), commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel), kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) { UNRECOVERABLE_IF(nullptr == this->kernel); @@ -326,6 +326,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() { blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->cacheFlushNodes); 
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->previousEnqueueNodes); blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->barrierNodes); + blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->multiCsrDependencies); blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0]; if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { @@ -348,7 +349,13 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term this->terminated = true; return completionStamp; } - + for (auto &tagCsrPair : csrDependencies) { + bool submitStatus = tagCsrPair.first->submitDependencyUpdate(tagCsrPair.second); + if (!submitStatus) { + completionStamp.taskCount = CompletionStamp::gpuHang; + return completionStamp; + } + } auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver(); if (!kernelOperation) { @@ -532,6 +539,10 @@ void Command::makeTimestampPacketsResident(CommandStreamReceiver &commandStreamR Command::Command(CommandQueue &commandQueue) : commandQueue(commandQueue) {} -Command::Command(CommandQueue &commandQueue, std::unique_ptr &kernelOperation) - : commandQueue(commandQueue), kernelOperation(std::move(kernelOperation)) {} +Command::Command(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, CsrDependencyContainer *csrDependencies) + : commandQueue(commandQueue), kernelOperation(std::move(kernelOperation)) { + if (csrDependencies) { + this->csrDependencies = *csrDependencies; + } +} } // namespace NEO diff --git a/opencl/source/helpers/task_information.h b/opencl/source/helpers/task_information.h index 1f78cade9d..5b2ed58a06 100644 --- a/opencl/source/helpers/task_information.h +++ b/opencl/source/helpers/task_information.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,6 
+7,7 @@ #pragma once #include "shared/source/command_stream/linear_stream.h" +#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h" #include "shared/source/helpers/blit_properties.h" #include "shared/source/helpers/completion_stamp.h" #include "shared/source/helpers/map_operation_type.h" @@ -87,7 +88,7 @@ class Command : public IFNode { Command() = delete; Command(CommandQueue &commandQueue); - Command(CommandQueue &commandQueue, std::unique_ptr &kernelOperation); + Command(CommandQueue &commandQueue, std::unique_ptr &kernelOperation, CsrDependencyContainer *csrDependencies); ~Command() override; virtual LinearStream *getCommandStream() { @@ -108,6 +109,7 @@ class Command : public IFNode { std::unique_ptr timestampPacketDependencies; EventsRequest eventsRequest = {0, nullptr, nullptr}; std::vector eventsWaitlist; + CsrDependencyContainer csrDependencies; }; class CommandMapUnmap : public Command { diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp index f491fc9f1b..abf493b798 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp @@ -1377,7 +1377,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenBlockedEventWhenWaitingForCom EXPECT_EQ(2u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); clWaitForEvents(1, &outEvent1); - EXPECT_EQ(2u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); + EXPECT_EQ(1u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(1u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); clReleaseEvent(outEvent1); diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index b89ec4601e..1ea0d5ffaf 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ 
b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -11,6 +11,7 @@ #include "shared/source/gmm_helper/gmm.h" #include "shared/source/helpers/array_count.h" #include "shared/source/helpers/basic_math.h" +#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h" #include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/timestamp_packet.h" #include "shared/source/memory_manager/internal_allocation_storage.h" @@ -2514,6 +2515,76 @@ TEST_F(CommandQueueWithTimestampPacketTests, givenQueueWhenSettingAndQueryingLas } } +HWTEST_F(CommandQueueWithTimestampPacketTests, givedDependencyBetweenCsrWhenPrepareDependencyUpdateCalledThenNewTagAddedToTimestampDependencies) { + MockContext context{}; + auto mockCmdQ = std::make_unique>(&context, context.getDevice(0), nullptr); + auto dependentCsr = std::make_unique(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1); + TimestampPacketDependencies dependencies{}; + CsrDependencies csrDeps; + csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get()); + CsrDependencyContainer dependencyMap; + TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); + bool blockQueue = false; + mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, blockQueue); + EXPECT_EQ(dependencies.multiCsrDependencies.peekNodes().size(), 1u); +} + +HWTEST_F(CommandQueueWithTimestampPacketTests, givedNoDependencyBetweenCsrWhenPrepareDependencyUpdateCalledThenTagIsNotAddedToTimestampDependencies) { + MockContext context{}; + auto mockCmdQ = std::make_unique>(&context, context.getDevice(0), nullptr); + TimestampPacketDependencies dependencies{}; + CsrDependencies csrDeps; + CsrDependencyContainer dependencyMap; + TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); + bool blockQueue = false; + mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, 
dependencies, allocator, blockQueue); + EXPECT_EQ(dependencies.multiCsrDependencies.peekNodes().size(), 0u); +} + +HWTEST_F(CommandQueueWithTimestampPacketTests, givedDependencyBetweenCsrWhenPrepareDependencyUpdateCalledForNonBlockedQueueThenSubmitDependencyUpdateCalled) { + MockContext context{}; + auto mockCmdQ = std::make_unique>(&context, context.getDevice(0), nullptr); + auto dependentCsr = std::make_unique(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1); + TimestampPacketDependencies dependencies{}; + CsrDependencies csrDeps; + csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get()); + CsrDependencyContainer dependencyMap; + TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); + bool blockQueue = false; + mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, blockQueue); + EXPECT_EQ(dependentCsr->submitDependencyUpdateCalledTimes, 1u); + EXPECT_EQ(dependencyMap.size(), 0u); +} + +HWTEST_F(CommandQueueWithTimestampPacketTests, givedDependencyBetweenCsrWhenPrepareDependencyUpdateCalledForBlockedQueueThenDependencyMapHasOneItem) { + MockContext context{}; + auto mockCmdQ = std::make_unique>(&context, context.getDevice(0), nullptr); + auto dependentCsr = std::make_unique(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1); + TimestampPacketDependencies dependencies{}; + CsrDependencies csrDeps; + csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get()); + CsrDependencyContainer dependencyMap; + TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); + bool blockQueue = true; + mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, blockQueue); + EXPECT_EQ(dependentCsr->submitDependencyUpdateCalledTimes, 0u); + EXPECT_EQ(dependencyMap.size(), 1u); +} + +HWTEST_F(CommandQueueWithTimestampPacketTests, 
givedDependencyBetweenCsrWhenSubmitDependencyUpdateReturnsFalseThenProcessDependencyReturnsFalse) { + MockContext context{}; + auto mockCmdQ = std::make_unique>(&context, context.getDevice(0), nullptr); + auto dependentCsr = std::make_unique(*context.getDevice(0)->getExecutionEnvironment(), context.getDevice(0)->getRootDeviceIndex(), 1); + TimestampPacketDependencies dependencies{}; + CsrDependencies csrDeps; + csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr.get()); + CsrDependencyContainer dependencyMap; + TagAllocatorBase *allocator = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); + bool blockQueue = false; + dependentCsr->submitDependencyUpdateReturnValue = false; + EXPECT_FALSE(mockCmdQ->prepareCsrDependency(csrDeps, dependencyMap, dependencies, allocator, blockQueue)); +} + using KernelExecutionTypesTests = DispatchFlagsTests; HWTEST_F(KernelExecutionTypesTests, givenConcurrentKernelWhileDoingNonBlockedEnqueueThenCorrectKernelTypeIsSetInCSR) { using CsrType = MockCsrHw2; diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp index 52aab100e4..c7ad7cc8cd 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -167,7 +167,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_MARKER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest, - eventBuilder, std::unique_ptr(nullptr), nullptr, nullptr); + eventBuilder, std::unique_ptr(nullptr), nullptr, nullptr, nullptr); 
EXPECT_FALSE(blockedCommandsDataForDependencyFlush->blitEnqueue); } @@ -200,7 +200,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_READ_BUFFER, surfaces, size_t(0), multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest, - eventBuilder, std::unique_ptr(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr); + eventBuilder, std::unique_ptr(nullptr), mockCmdQ->getBcsForAuxTranslation(), nullptr, nullptr); EXPECT_TRUE(blockedCommandsDataForBlitEnqueue->blitEnqueue); EXPECT_EQ(blitProperties.srcAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->srcAllocation); EXPECT_EQ(blitProperties.dstAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->dstAllocation); diff --git a/opencl/test/unit_test/helpers/task_information_tests.cpp b/opencl/test/unit_test/helpers/task_information_tests.cpp index 63a40d7dfb..f595a49a09 100644 --- a/opencl/test/unit_test/helpers/task_information_tests.cpp +++ b/opencl/test/unit_test/helpers/task_information_tests.cpp @@ -1,10 +1,11 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * */ +#include "shared/source/helpers/bcs_ccs_dependency_pair_container.h" #include "shared/source/memory_manager/allocation_properties.h" #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/test/common/mocks/mock_csr.h" @@ -372,7 +373,7 @@ HWTEST_F(DispatchFlagsTests, givenCommandWithoutKernelWhenSubmitThenPassCorrectD auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 1, AllocationType::commandBuffer, device->getDeviceBitfield()})); auto kernelOperation = std::make_unique(cmdStream, 
*mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage()); kernelOperation->setHeaps(ih1, ih2, ih3); - std::unique_ptr command(new CommandWithoutKernel(*mockCmdQ, kernelOperation)); + std::unique_ptr command(new CommandWithoutKernel(*mockCmdQ, kernelOperation, nullptr)); command->setTimestampPacketNode(*mockCmdQ->timestampPacketContainer, std::move(timestampPacketDependencies)); command->submit(20, false); @@ -393,6 +394,64 @@ HWTEST_F(DispatchFlagsTests, givenCommandWithoutKernelWhenSubmitThenPassCorrectD EXPECT_FALSE(mockCsr->passedDispatchFlags.epilogueRequired); } +HWTEST_F(DispatchFlagsTests, givenCsrDependencyWhenSubmitCommandWithoutKernelThenDependencyUpdateWasCalled) { + using CsrType = MockCsr1; + setUpImpl(); + + auto mockCmdQ = std::make_unique>(context.get(), device.get(), nullptr); + auto mockCsr = static_cast(&mockCmdQ->getGpgpuCommandStreamReceiver()); + auto dependentCsr = std::make_unique(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield()); + + mockCsr->timestampPacketWriteEnabled = true; + mockCmdQ->timestampPacketContainer = std::make_unique(); + IndirectHeap *ih1 = nullptr, *ih2 = nullptr, *ih3 = nullptr; + TimestampPacketDependencies timestampPacketDependencies; + mockCmdQ->allocateHeapMemory(IndirectHeap::Type::dynamicState, 1, ih1); + mockCmdQ->allocateHeapMemory(IndirectHeap::Type::indirectObject, 1, ih2); + mockCmdQ->allocateHeapMemory(IndirectHeap::Type::surfaceState, 1, ih3); + + auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 1, AllocationType::commandBuffer, device->getDeviceBitfield()})); + auto kernelOperation = std::make_unique(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage()); + kernelOperation->setHeaps(ih1, ih2, ih3); + CsrDependencyContainer dependencyMap; + auto tag = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag(); + 
dependencyMap.push_back(std::make_pair(dependentCsr.get(), tag)); + std::unique_ptr command(new CommandWithoutKernel(*mockCmdQ, kernelOperation, &dependencyMap)); + command->setTimestampPacketNode(*mockCmdQ->timestampPacketContainer, std::move(timestampPacketDependencies)); + + command->submit(20, false); + EXPECT_EQ(dependentCsr->submitDependencyUpdateCalledTimes, 1u); +} + +HWTEST_F(DispatchFlagsTests, givenCsrDependencyWhendependencyUpdateReturnsFalseThenSubmitReturnGpuHang) { + using CsrType = MockCsr1; + setUpImpl(); + + auto mockCmdQ = std::make_unique>(context.get(), device.get(), nullptr); + auto mockCsr = static_cast(&mockCmdQ->getGpgpuCommandStreamReceiver()); + auto dependentCsr = std::make_unique(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield()); + + mockCsr->timestampPacketWriteEnabled = true; + mockCmdQ->timestampPacketContainer = std::make_unique(); + IndirectHeap *ih1 = nullptr, *ih2 = nullptr, *ih3 = nullptr; + TimestampPacketDependencies timestampPacketDependencies; + mockCmdQ->allocateHeapMemory(IndirectHeap::Type::dynamicState, 1, ih1); + mockCmdQ->allocateHeapMemory(IndirectHeap::Type::indirectObject, 1, ih2); + mockCmdQ->allocateHeapMemory(IndirectHeap::Type::surfaceState, 1, ih3); + + auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 1, AllocationType::commandBuffer, device->getDeviceBitfield()})); + auto kernelOperation = std::make_unique(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage()); + kernelOperation->setHeaps(ih1, ih2, ih3); + CsrDependencyContainer dependencyMap; + auto tag = mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag(); + dependencyMap.push_back(std::make_pair(dependentCsr.get(), tag)); + std::unique_ptr command(new CommandWithoutKernel(*mockCmdQ, kernelOperation, &dependencyMap)); + 
command->setTimestampPacketNode(*mockCmdQ->timestampPacketContainer, std::move(timestampPacketDependencies)); + dependentCsr->submitDependencyUpdateReturnValue = false; + auto stamp = command->submit(20, false); + EXPECT_EQ(stamp.taskCount, CompletionStamp::gpuHang); +} + HWTEST_F(DispatchFlagsTests, givenCommandComputeKernelWhenSubmitThenPassCorrectDispatchHints) { using CsrType = MockCsr1; setUpImpl(); diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index baa71962ad..a141fd8dcb 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -281,6 +281,7 @@ class MockCommandQueueHw : public CommandQueueHw { using BaseClass::obtainCommandStream; using BaseClass::obtainNewTimestampPacketNodes; using BaseClass::overrideEngine; + using BaseClass::prepareCsrDependency; using BaseClass::processDispatchForKernels; using BaseClass::relaxedOrderingForGpgpuAllowed; using BaseClass::requiresCacheFlushAfterWalker; diff --git a/shared/source/helpers/CMakeLists.txt b/shared/source/helpers/CMakeLists.txt index 41d307fdfb..e8e2cd5959 100644 --- a/shared/source/helpers/CMakeLists.txt +++ b/shared/source/helpers/CMakeLists.txt @@ -20,6 +20,7 @@ set(NEO_CORE_HELPERS ${CMAKE_CURRENT_SOURCE_DIR}/array_count.h ${CMAKE_CURRENT_SOURCE_DIR}/aux_translation.h ${CMAKE_CURRENT_SOURCE_DIR}/basic_math.h + ${CMAKE_CURRENT_SOURCE_DIR}/bcs_ccs_dependency_pair_container.h ${CMAKE_CURRENT_SOURCE_DIR}/bindless_heaps_helper.cpp ${CMAKE_CURRENT_SOURCE_DIR}/bindless_heaps_helper.h ${CMAKE_CURRENT_SOURCE_DIR}/bit_helpers.h diff --git a/shared/source/helpers/bcs_ccs_dependency_pair_container.h b/shared/source/helpers/bcs_ccs_dependency_pair_container.h new file mode 100644 index 0000000000..379598dca8 --- /dev/null +++ b/shared/source/helpers/bcs_ccs_dependency_pair_container.h @@ -0,0 +1,15 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ 
+ +#pragma once +#include <vector> + +namespace NEO { +class CommandStreamReceiver; +class TagNodeBase; +using CsrDependencyContainer = std::vector<std::pair<CommandStreamReceiver *, TagNodeBase *>>; +} // namespace NEO diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index 8e65725a81..3851f1c53c 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -78,7 +78,10 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { SubmissionStatus flushTagUpdate() override { return SubmissionStatus::success; }; void updateTagFromWait() override{}; - bool submitDependencyUpdate(TagNodeBase *tag) override { return true; }; + bool submitDependencyUpdate(TagNodeBase *tag) override { + submitDependencyUpdateCalledTimes++; + return submitDependencyUpdateReturnValue; + } bool isUpdateTagFromWaitEnabled() override { return false; }; void writeMemoryAub(aub_stream::AllocationParams &allocationParams) override { @@ -243,6 +246,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { uint32_t writeMemoryAubCalled = 0; uint32_t makeResidentCalledTimes = 0; uint32_t downloadAllocationsCalledCount = 0; + uint32_t submitDependencyUpdateCalledTimes = 0; int hostPtrSurfaceCreationMutexLockCount = 0; bool multiOsContextCapable = false; bool memoryCompressionEnabled = false; @@ -259,6 +263,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { BatchBuffer latestFlushedBatchBuffer = {}; QueueThrottle getLastDirectSubmissionThrottleReturnValue = QueueThrottle::MEDIUM; bool getAcLineConnectedReturnValue = true; + bool submitDependencyUpdateReturnValue = true; }; class MockCommandStreamReceiverWithFailingSubmitBatch : public MockCommandStreamReceiver {