diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 36b6ccb7a7..f624b6f186 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -304,8 +304,8 @@ struct CommandListCoreFamily : public CommandListImp { const void **pRanges); ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]); - ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions); - void programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize); + ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions, size_t &patchIndex); + void programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize, size_t &patchIndex); void appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation); void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool maskLsb, uint32_t mask, bool workloadPartition, bool copyOperation); void appendEventForProfiling(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool skipBarrierForEndProfiling, bool skipAddingEventToResidency, bool copyOperation); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index a61b55046c..37caf66b84 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2806,7 +2806,7 @@ void CommandListCoreFamily::appendSignalInOrderDependencyCounter( template ze_result_t CommandListCoreFamily::programSyncBuffer(Kernel &kernel, NEO::Device 
&device, - const ze_group_count_t &threadGroupDimensions) { + const ze_group_count_t &threadGroupDimensions, size_t &patchIndex) { uint32_t maximalNumberOfWorkgroupsAllowed = kernel.suggestMaxCooperativeGroupCount(this->engineGroupType, false); size_t requestedNumberOfWorkgroups = (threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ); @@ -2817,17 +2817,41 @@ ze_result_t CommandListCoreFamily::programSyncBuffer(Kernel &kern auto patchData = NEO::KernelHelper::getSyncBufferAllocationOffset(device, requestedNumberOfWorkgroups); kernel.patchSyncBuffer(patchData.first, patchData.second); + if (!isImmediateType()) { + patchIndex = commandsToPatch.size(); + + CommandToPatch syncBufferSpace; + syncBufferSpace.type = CommandToPatch::NoopSpace; + syncBufferSpace.offset = patchData.second; + syncBufferSpace.pDestination = ptrOffset(patchData.first->getUnderlyingBuffer(), patchData.second); + syncBufferSpace.patchSize = NEO::KernelHelper::getSyncBufferSize(requestedNumberOfWorkgroups); + + commandsToPatch.push_back(syncBufferSpace); + } + return ZE_RESULT_SUCCESS; } template -void CommandListCoreFamily::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize) { +void CommandListCoreFamily::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize, size_t &patchIndex) { auto neoDevice = device->getNEODevice(); auto threadGroupCount = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ; auto patchData = NEO::KernelHelper::getRegionGroupBarrierAllocationOffset(*neoDevice, threadGroupCount, localRegionSize); kernel.patchRegionGroupBarrier(patchData.first, patchData.second); + + if (!isImmediateType()) { + patchIndex = commandsToPatch.size(); + + CommandToPatch regionBarrierSpace; + regionBarrierSpace.type = CommandToPatch::NoopSpace; + regionBarrierSpace.offset = 
patchData.second; + regionBarrierSpace.pDestination = ptrOffset(patchData.first->getUnderlyingBuffer(), patchData.second); + regionBarrierSpace.patchSize = NEO::KernelHelper::getRegionGroupBarrierSize(threadGroupCount, localRegionSize); + + commandsToPatch.push_back(regionBarrierSpace); + } } template @@ -3360,6 +3384,7 @@ void CommandListCoreFamily::clearCommandsToPatch() { break; case CommandToPatch::ComputeWalkerInlineDataScratch: case CommandToPatch::ComputeWalkerImplicitArgsScratch: + case CommandToPatch::NoopSpace: break; default: UNRECOVERABLE_IF(true); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index c1f3b7de88..0485717fbc 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -154,7 +154,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K containsCooperativeKernelsFlag = (containsCooperativeKernelsFlag || launchParams.isCooperative); if (kernel->usesSyncBuffer()) { auto retVal = (launchParams.isCooperative - ? programSyncBuffer(*kernel, *device->getNEODevice(), threadGroupDimensions) + ? programSyncBuffer(*kernel, *device->getNEODevice(), threadGroupDimensions, launchParams.syncBufferPatchIndex) : ZE_RESULT_ERROR_INVALID_ARGUMENT); if (retVal) { return retVal; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 79f1eccf88..b61ecf203b 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -250,7 +250,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K if (!launchParams.makeKernelCommandView) { if (kernel->usesSyncBuffer()) { auto retVal = (launchParams.isCooperative - ? programSyncBuffer(*kernel, *neoDevice, threadGroupDimensions) + ? 
programSyncBuffer(*kernel, *neoDevice, threadGroupDimensions, launchParams.syncBufferPatchIndex) : ZE_RESULT_ERROR_INVALID_ARGUMENT); if (retVal) { return retVal; @@ -258,7 +258,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K } if (kernel->usesRegionGroupBarrier()) { - programRegionGroupBarrier(*kernel, threadGroupDimensions, launchParams.localRegionSize); + programRegionGroupBarrier(*kernel, threadGroupDimensions, launchParams.localRegionSize, launchParams.regionBarrierPatchIndex); } } diff --git a/level_zero/core/source/cmdlist/cmdlist_launch_params.h b/level_zero/core/source/cmdlist/cmdlist_launch_params.h index b9ee778023..aa911b0d71 100644 --- a/level_zero/core/source/cmdlist/cmdlist_launch_params.h +++ b/level_zero/core/source/cmdlist/cmdlist_launch_params.h @@ -32,6 +32,7 @@ struct CommandToPatch { CbWaitEventLoadRegisterImm, ComputeWalkerInlineDataScratch, ComputeWalkerImplicitArgsScratch, + NoopSpace, Invalid }; void *pDestination = nullptr; @@ -51,6 +52,8 @@ struct CmdListKernelLaunchParams { void *hostPayloadBuffer = nullptr; CommandToPatch *outSyncCommand = nullptr; CommandToPatchContainer *outListCommands = nullptr; + size_t syncBufferPatchIndex = std::numeric_limits<size_t>::max(); + size_t regionBarrierPatchIndex = std::numeric_limits<size_t>::max(); uint32_t externalPerThreadScratchSize[2] = {0U, 0U}; NEO::RequiredPartitionDim requiredPartitionDim = NEO::RequiredPartitionDim::none; NEO::RequiredDispatchWalkOrder requiredDispatchWalkOrder = NEO::RequiredDispatchWalkOrder::none; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw_skl_to_tgllp.inl index eb77014f4a..1d392db738 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw_skl_to_tgllp.inl @@ -191,6 +191,10 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint args); break; } + case CommandToPatch::NoopSpace: { +
memset(commandToPatch.pDestination, 0, commandToPatch.patchSize); + break; + } default: { UNRECOVERABLE_IF(true); } diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index 63f1f6a48b..f5fd0b12b7 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -250,6 +250,10 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint std::memcpy(scratchAddressPatch, &fullScratchAddress, commandToPatch.patchSize); break; } + case CommandToPatch::NoopSpace: { + memset(commandToPatch.pDestination, 0, commandToPatch.patchSize); + break; + } default: UNRECOVERABLE_IF(true); } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp index b2d981e0ca..637424aaad 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp @@ -66,8 +66,9 @@ HWTEST2_F(MultiTileImmediateCommandListTest, givenMultipleTilesWhenAllocatingBar size_t requestedNumberOfWorkgroups = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ; size_t localRegionSize = 4; + size_t patchIndex = 0; - whiteBoxCmdList->programRegionGroupBarrier(mockKernel, threadGroupDimensions, localRegionSize); + whiteBoxCmdList->programRegionGroupBarrier(mockKernel, threadGroupDimensions, localRegionSize, patchIndex); auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(1); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 7471a81d08..15097c8b0c 100644 --- 
a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -464,6 +464,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau commandList->initialize(device, engineGroupType, 0u); auto result = commandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, cooperativeParams, false); EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_NE(std::numeric_limits<size_t>::max(), cooperativeParams.syncBufferPatchIndex); auto mockSyncBufferHandler = reinterpret_cast(device->getNEODevice()->syncBufferHandler.get()); auto syncBufferAllocation = mockSyncBufferHandler->graphicsAllocation; @@ -476,6 +477,13 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau EXPECT_EQ(syncBufferAllocation, kernel.getSyncBufferAllocation()); + auto &cmdsToPatch = commandList->getCommandsToPatch(); + ASSERT_NE(0u, cmdsToPatch.size()); + + auto noopParam = cmdsToPatch[cooperativeParams.syncBufferPatchIndex]; + EXPECT_EQ(CommandToPatch::NoopSpace, noopParam.type); + EXPECT_NE(0u, noopParam.patchSize); + commandList = std::make_unique>>(); commandList->initialize(device, engineGroupType, 0u); CmdListKernelLaunchParams launchParams = {}; @@ -514,7 +522,18 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau result = commandList->appendLaunchKernelWithParams(&kernel, groupCount, nullptr, launchParams); EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result); } + + const ze_command_queue_desc_t desc = {}; + std::unique_ptr commandListImmediate(CommandList::createImmediate(productFamily, device, &desc, false, engineGroupType, result)); + + cooperativeParams.isCooperative = true; + cooperativeParams.syncBufferPatchIndex = std::numeric_limits<size_t>::max(); + + result = commandListImmediate->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, cooperativeParams, false); +
EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(std::numeric_limits<size_t>::max(), cooperativeParams.syncBufferPatchIndex); } + HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLaunchCooperativeKernelWithMakeViewIsCalledThenNoAllocationCreated, IsAtLeastXeHpCore) { Mock<::L0::KernelImp> kernel; auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); @@ -599,6 +618,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenA CmdListKernelLaunchParams launchParams = {}; launchParams.localRegionSize = 4; EXPECT_EQ(ZE_RESULT_SUCCESS, cmdList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false)); + EXPECT_EQ(std::numeric_limits<size_t>::max(), launchParams.regionBarrierPatchIndex); auto patchPtr = *reinterpret_cast(ptrOffset(kernel.crossThreadData.get(), regionGroupBarrier.stateless)); EXPECT_NE(0u, patchPtr); @@ -633,6 +653,18 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenA auto offset = alignUp((requestedNumberOfWorkgroups / launchParams.localRegionSize) * (launchParams.localRegionSize + 1) * 2 * sizeof(uint32_t), MemoryConstants::cacheLineSize); EXPECT_EQ(patchPtr2, patchPtr + offset); + + std::unique_ptr cmdListRegular(CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0, result, false)); + + EXPECT_EQ(ZE_RESULT_SUCCESS, cmdListRegular->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false)); + EXPECT_NE(std::numeric_limits<size_t>::max(), launchParams.regionBarrierPatchIndex); + + auto &cmdsToPatch = cmdListRegular->getCommandsToPatch(); + ASSERT_NE(0u, cmdsToPatch.size()); + + auto noopParam = cmdsToPatch[launchParams.regionBarrierPatchIndex]; + EXPECT_EQ(CommandToPatch::NoopSpace, noopParam.type); + EXPECT_NE(0u, noopParam.patchSize); } HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenAppendLaunchKernelWithMakeViewIsCalledThenNoPatchBuffer, IsAtLeastXeHpCore) {
diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp index 83eb317f9e..16af76a96b 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp @@ -1027,5 +1027,31 @@ HWTEST2_F(CommandQueueScratchTests, givenCommandsToPatchToNotSupportedPlatformWh commandList->commandsToPatch.clear(); } +using CommandQueueCreate = Test; + +HWTEST2_F(CommandQueueCreate, givenCommandsToPatchWithNoopSpacePatchWhenPatchCommandsIsCalledThenSpaceIsNooped, MatchAny) { + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr = nullptr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, false); + auto commandQueue = std::make_unique>(device, csr, &desc); + auto commandList = std::make_unique>>(); + + constexpr uint32_t dataSize = 64; + auto patchBuffer = std::make_unique(dataSize); + auto zeroBuffer = std::make_unique(dataSize); + memset(patchBuffer.get(), 0xFF, dataSize); + memset(zeroBuffer.get(), 0x0, dataSize); + + CommandToPatch commandToPatch; + + commandToPatch.type = CommandToPatch::NoopSpace; + commandToPatch.pDestination = patchBuffer.get(); + commandToPatch.patchSize = dataSize; + + commandList->commandsToPatch.push_back(commandToPatch); + commandQueue->patchCommands(*commandList, 0, false); + EXPECT_EQ(0, memcmp(patchBuffer.get(), zeroBuffer.get(), dataSize)); +} + } // namespace ult } // namespace L0