From 692def2c793572cd343f10de661bea7fa3474dbc Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Mon, 3 Jun 2024 14:19:36 +0000 Subject: [PATCH] feature: region group barrier allocation support Related-To: NEO-11031 Signed-off-by: Bartosz Dunajski --- level_zero/core/source/cmdlist/cmdlist_hw.h | 1 + level_zero/core/source/cmdlist/cmdlist_hw.inl | 10 +++++ .../cmdlist/cmdlist_hw_xehp_and_later.inl | 4 ++ level_zero/core/source/kernel/kernel.h | 4 +- level_zero/core/source/kernel/kernel_imp.cpp | 12 +++++ level_zero/core/source/kernel/kernel_imp.h | 2 + .../test_cmdlist_append_launch_kernel_3.cpp | 45 +++++++++++++++++++ .../zebin/zeinfo_decoder.cpp | 3 +- shared/source/kernel/kernel_descriptor.h | 5 ++- shared/source/program/sync_buffer_handler.cpp | 19 +++++++- shared/source/program/sync_buffer_handler.h | 4 +- shared/source/program/sync_buffer_handler.inl | 16 ++----- .../zebin_decoder_tests.cpp | 6 ++- 13 files changed, 112 insertions(+), 19 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index dd9f2205a0..acf5bd5d4f 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -307,6 +307,7 @@ struct CommandListCoreFamily : public CommandListImp { ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]); ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions); + void programRegionGroupBarrier(Kernel &kernel); void appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation); void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool maskLsb, uint32_t mask, bool workloadPartition, bool copyOperation); void 
appendEventForProfiling(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool skipBarrierForEndProfiling, bool skipAddingEventToResidency, bool copyOperation); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index cd6d440116..3e65292a5c 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2789,6 +2789,16 @@ ze_result_t CommandListCoreFamily::programSyncBuffer(Kernel &kern return ZE_RESULT_SUCCESS; } +template +void CommandListCoreFamily::programRegionGroupBarrier(Kernel &kernel) { + auto neoDevice = device->getNEODevice(); + + neoDevice->allocateSyncBufferHandler(); + auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(MemoryConstants::cacheLineSize); + + kernel.patchRegionGroupBarrier(patchData.first, patchData.second); +} + template void CommandListCoreFamily::appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation) { constexpr uint32_t mask = 0xfffffffe; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 5ee4f891f5..07db7a2fd1 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -277,6 +277,10 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K } } + if (kernel->usesRegionGroupBarrier()) { + programRegionGroupBarrier(*kernel); + } + bool uncachedMocsKernel = isKernelUncachedMocsRequired(kernelImp->getKernelRequiresUncachedMocs()); this->requiresQueueUncachedMocs |= kernelImp->getKernelRequiresQueueUncachedMocs(); diff --git a/level_zero/core/source/kernel/kernel.h b/level_zero/core/source/kernel/kernel.h index 2dea6f847e..f9abf2238d 100644 --- a/level_zero/core/source/kernel/kernel.h +++ 
b/level_zero/core/source/kernel/kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -158,7 +158,9 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI { virtual void printPrintfOutput(bool hangDetected) = 0; virtual bool usesSyncBuffer() = 0; + virtual bool usesRegionGroupBarrier() const = 0; virtual void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0; + virtual void patchRegionGroupBarrier(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0; virtual NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() = 0; virtual void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) = 0; diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 33c9f4bd19..3fbe0310f2 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -1191,6 +1191,10 @@ bool KernelImp::usesSyncBuffer() { return this->kernelImmData->getDescriptor().kernelAttributes.flags.usesSyncBuffer; } +bool KernelImp::usesRegionGroupBarrier() const { + return this->kernelImmData->getDescriptor().kernelAttributes.flags.usesRegionGroupBarrier; +} + void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) { this->residencyContainer.push_back(gfxAllocation); NEO::patchPointer(ArrayRef(crossThreadData.get(), crossThreadDataSize), @@ -1198,6 +1202,14 @@ void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t b static_cast(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset))); } +void KernelImp::patchRegionGroupBarrier(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) { + this->residencyContainer.push_back(gfxAllocation); + + NEO::patchPointer(ArrayRef(crossThreadData.get(), crossThreadDataSize), + 
this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.regionGroupBarrierBuffer, + static_cast(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset))); +} + uint32_t KernelImp::getSurfaceStateHeapDataSize() const { if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelImmData->getDescriptor())) { const auto bindlessHeapsHelper = this->module && this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper(); diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index f30fdf3cc2..14fdb48fdc 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -111,7 +111,9 @@ struct KernelImp : Kernel { void printPrintfOutput(bool hangDetected) override; bool usesSyncBuffer() override; + bool usesRegionGroupBarrier() const override; void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override; + void patchRegionGroupBarrier(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override; const uint8_t *getSurfaceStateHeapData() const override { return surfaceStateHeapData.get(); } uint32_t getSurfaceStateHeapDataSize() const override; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 6d5062c2d5..ab291e9f1a 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -15,6 +15,7 @@ #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/unit_test_helper.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/test_macros/hw_test.h" @@ -485,6 
+486,50 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau } } +HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenAppendLaunchKernelIsCalledThenPatchBuffer, IsAtLeastXeHpCore) { + auto ultCsr = static_cast *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver); + ultCsr->storeMakeResidentAllocations = true; + + Mock<::L0::KernelImp> kernel; + auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); + kernel.module = pMockModule.get(); + + kernel.crossThreadData = std::make_unique(64); + kernel.crossThreadDataSize = 64; + + kernel.setGroupSize(4, 1, 1); + ze_group_count_t groupCount{8, 1, 1}; + + auto &kernelAttributes = kernel.immutableData.kernelDescriptor->kernelAttributes; + kernelAttributes.flags.usesRegionGroupBarrier = true; + + auto &regionGroupBarrier = kernel.immutableData.kernelDescriptor->payloadMappings.implicitArgs.regionGroupBarrierBuffer; + regionGroupBarrier.stateless = 0x8; + regionGroupBarrier.pointerSize = 8; + + const ze_command_queue_desc_t desc = {}; + ze_result_t result = ZE_RESULT_SUCCESS; + + std::unique_ptr cmdList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::renderCompute, result)); + + CmdListKernelLaunchParams launchParams = {}; + EXPECT_EQ(ZE_RESULT_SUCCESS, cmdList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false)); + + auto patchPtr = *reinterpret_cast(ptrOffset(kernel.crossThreadData.get(), regionGroupBarrier.stateless)); + EXPECT_NE(0u, patchPtr); + + auto allocIter = std::find_if(ultCsr->makeResidentAllocations.begin(), ultCsr->makeResidentAllocations.end(), [patchPtr](const std::pair &element) { + return element.first->getGpuAddressToPatch() == patchPtr; + }); + ASSERT_NE(ultCsr->makeResidentAllocations.end(), allocIter); + + EXPECT_EQ(ZE_RESULT_SUCCESS, cmdList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false)); + + auto 
patchPtr2 = *reinterpret_cast(ptrOffset(kernel.crossThreadData.get(), regionGroupBarrier.stateless)); + + EXPECT_EQ(patchPtr2, patchPtr + MemoryConstants::cacheLineSize); +} + HWTEST2_F(CommandListAppendLaunchKernel, whenAppendLaunchCooperativeKernelAndQueryKernelTimestampsToTheSameCmdlistThenFronEndStateIsNotChanged, IsAtLeastSkl) { Mock<::L0::KernelImp> kernel; auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); diff --git a/shared/source/device_binary_format/zebin/zeinfo_decoder.cpp b/shared/source/device_binary_format/zebin/zeinfo_decoder.cpp index 5d793c4013..880822afda 100644 --- a/shared/source/device_binary_format/zebin/zeinfo_decoder.cpp +++ b/shared/source/device_binary_format/zebin/zeinfo_decoder.cpp @@ -1396,7 +1396,8 @@ DecodeError populateKernelPayloadArgument(NEO::KernelDescriptor &dst, const Kern return populateWithOffsetChecked(dst.payloadMappings.dispatchTraits.regionGroupWgCount, sizeof(int32_t), Tags::Kernel::PayloadArgument::ArgType::regionGroupWgCount); case Types::Kernel::argTypeRegionGroupBarrierBuffer: - return populateWithOffsetChecked(dst.payloadMappings.dispatchTraits.regionGroupBarrierBuffer, sizeof(int64_t), Tags::Kernel::PayloadArgument::ArgType::regionGroupBarrierBuffer); + dst.kernelAttributes.flags.usesRegionGroupBarrier = true; + return populateArgPointerStateless(dst.payloadMappings.implicitArgs.regionGroupBarrierBuffer); } UNREACHABLE(); diff --git a/shared/source/kernel/kernel_descriptor.h b/shared/source/kernel/kernel_descriptor.h index 0d9ffeda8d..7f93735c52 100644 --- a/shared/source/kernel/kernel_descriptor.h +++ b/shared/source/kernel/kernel_descriptor.h @@ -124,7 +124,8 @@ struct KernelDescriptor { bool hasSample : 1; // 3 bool usesAssert : 1; - bool reserved : 7; + bool usesRegionGroupBarrier : 1; + bool reserved : 6; }; std::array packed; } flags = {}; @@ -158,7 +159,6 @@ struct KernelDescriptor { CrossThreadDataOffset regionGroupSize[3] = {undefined, undefined, undefined}; CrossThreadDataOffset 
regionGroupDimension = undefined; CrossThreadDataOffset regionGroupWgCount = undefined; - CrossThreadDataOffset regionGroupBarrierBuffer = undefined; } dispatchTraits; struct { @@ -185,6 +185,7 @@ struct KernelDescriptor { ArgDescPointer syncBufferAddress; ArgDescPointer rtDispatchGlobals; ArgDescPointer assertBufferAddress; + ArgDescPointer regionGroupBarrierBuffer; CrossThreadDataOffset privateMemorySize = undefined; CrossThreadDataOffset maxWorkGroupSize = undefined; CrossThreadDataOffset simdSize = undefined; diff --git a/shared/source/program/sync_buffer_handler.cpp b/shared/source/program/sync_buffer_handler.cpp index f299e98b84..9c11e56800 100644 --- a/shared/source/program/sync_buffer_handler.cpp +++ b/shared/source/program/sync_buffer_handler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -42,4 +42,21 @@ void SyncBufferHandler::allocateNewBuffer() { std::memset(cpuPointer, 0, bufferSize); } +std::pair SyncBufferHandler::obtainAllocationAndOffset(size_t requiredSize) { + std::lock_guard guard(this->mutex); + + bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize); + if (isCurrentBufferFull) { + memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation); + allocateNewBuffer(); + usedBufferSize = 0; + } + + std::pair allocationAndOffset = {graphicsAllocation, usedBufferSize}; + + usedBufferSize += requiredSize; + + return allocationAndOffset; +} + } // namespace NEO diff --git a/shared/source/program/sync_buffer_handler.h b/shared/source/program/sync_buffer_handler.h index 6ad9686298..eab3b52827 100644 --- a/shared/source/program/sync_buffer_handler.h +++ b/shared/source/program/sync_buffer_handler.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ class SyncBufferHandler : NonCopyableOrMovableClass { void 
prepareForEnqueue(size_t workGroupsCount, KernelT &kernel); void makeResident(CommandStreamReceiver &csr); + std::pair obtainAllocationAndOffset(size_t requiredSize); + protected: void allocateNewBuffer(); diff --git a/shared/source/program/sync_buffer_handler.inl b/shared/source/program/sync_buffer_handler.inl index 3d3038d920..f9867db153 100644 --- a/shared/source/program/sync_buffer_handler.inl +++ b/shared/source/program/sync_buffer_handler.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -10,16 +10,8 @@ template void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT &kernel) { auto requiredSize = alignUp(workGroupsCount, CommonConstants::maximalSizeOfAtomicType); - std::lock_guard guard(this->mutex); - bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize); - if (isCurrentBufferFull) { - memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation); - allocateNewBuffer(); - usedBufferSize = 0; - } + auto patchData = obtainAllocationAndOffset(requiredSize); - kernel.patchSyncBuffer(graphicsAllocation, usedBufferSize); - - usedBufferSize += requiredSize; -} + kernel.patchSyncBuffer(patchData.first, patchData.second); +} \ No newline at end of file diff --git a/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp b/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp index 2bf198626d..e181f21805 100644 --- a/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp +++ b/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp @@ -5070,6 +5070,7 @@ TEST_F(decodeZeInfoKernelEntryTest, givenRegionArgTypesWhenArgSizeIsCorrectThenR for (uint32_t i = 0; i < 3; ++i) { EXPECT_EQ(16 + sizeof(uint32_t) * i, kernelDescriptor->payloadMappings.dispatchTraits.regionGroupSize[i]); } + EXPECT_FALSE(kernelDescriptor->kernelAttributes.flags.usesRegionGroupBarrier); ConstStringRef 
zeInfoRegionGroupDim = R"===( kernels: @@ -5087,6 +5088,7 @@ TEST_F(decodeZeInfoKernelEntryTest, givenRegionArgTypesWhenArgSizeIsCorrectThenR EXPECT_TRUE(warnings.empty()) << warnings; EXPECT_EQ(16, kernelDescriptor->payloadMappings.dispatchTraits.regionGroupDimension); + EXPECT_FALSE(kernelDescriptor->kernelAttributes.flags.usesRegionGroupBarrier); ConstStringRef zeInfoRegionGroupCount = R"===( kernels: @@ -5104,6 +5106,7 @@ TEST_F(decodeZeInfoKernelEntryTest, givenRegionArgTypesWhenArgSizeIsCorrectThenR EXPECT_TRUE(warnings.empty()) << warnings; EXPECT_EQ(16, kernelDescriptor->payloadMappings.dispatchTraits.regionGroupWgCount); + EXPECT_FALSE(kernelDescriptor->kernelAttributes.flags.usesRegionGroupBarrier); ConstStringRef zeInfoRegionGroupBarrier = R"===( kernels: @@ -5120,7 +5123,8 @@ TEST_F(decodeZeInfoKernelEntryTest, givenRegionArgTypesWhenArgSizeIsCorrectThenR EXPECT_TRUE(errors.empty()) << errors; EXPECT_TRUE(warnings.empty()) << warnings; - EXPECT_EQ(16, kernelDescriptor->payloadMappings.dispatchTraits.regionGroupBarrierBuffer); + EXPECT_EQ(16, kernelDescriptor->payloadMappings.implicitArgs.regionGroupBarrierBuffer.stateless); + EXPECT_TRUE(kernelDescriptor->kernelAttributes.flags.usesRegionGroupBarrier); } TEST_F(decodeZeInfoKernelEntryTest, GivenArgTypeGlobalSizeWhenArgSizeValidThenPopulatesKernelDescriptor) {