feature: region group barrier allocation support
Related-To: NEO-11031
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
commit 692def2c79
parent 844b29d17c
committed by Compute-Runtime-Automation
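Summary: kernels whose descriptor carries the `usesRegionGroupBarrier` flag now receive a cache-line-sized slot sub-allocated from the device's pooled `SyncBufferHandler` at `appendLaunchKernel` time, and the slot's GPU address is patched into the kernel's cross-thread data. The pool bookkeeping is factored into a new `SyncBufferHandler::obtainAllocationAndOffset`, which the existing cooperative-kernel sync-buffer path now shares.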
@@ -307,6 +307,7 @@ struct CommandListCoreFamily : public CommandListImp {
     ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]);
     ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions);
+    void programRegionGroupBarrier(Kernel &kernel);
     void appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation);
     void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool maskLsb, uint32_t mask, bool workloadPartition, bool copyOperation);
     void appendEventForProfiling(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool skipBarrierForEndProfiling, bool skipAddingEventToResidency, bool copyOperation);
@@ -2789,6 +2789,16 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kern
     return ZE_RESULT_SUCCESS;
 }
 
+template <GFXCORE_FAMILY gfxCoreFamily>
+void CommandListCoreFamily<gfxCoreFamily>::programRegionGroupBarrier(Kernel &kernel) {
+    auto neoDevice = device->getNEODevice();
+
+    neoDevice->allocateSyncBufferHandler();
+    auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(MemoryConstants::cacheLineSize);
+
+    kernel.patchRegionGroupBarrier(patchData.first, patchData.second);
+}
+
 template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation) {
     constexpr uint32_t mask = 0xfffffffe;
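Note that `programRegionGroupBarrier` draws `MemoryConstants::cacheLineSize` bytes from the shared pool on every call, so back-to-back launches of the same kernel receive distinct, cache-line-spaced slots; the new unit test further down relies on exactly that stride.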
@@ -277,6 +277,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
         }
     }
 
+    if (kernel->usesRegionGroupBarrier()) {
+        programRegionGroupBarrier(*kernel);
+    }
+
     bool uncachedMocsKernel = isKernelUncachedMocsRequired(kernelImp->getKernelRequiresUncachedMocs());
     this->requiresQueueUncachedMocs |= kernelImp->getKernelRequiresQueueUncachedMocs();
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2023 Intel Corporation
+ * Copyright (C) 2020-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -158,7 +158,9 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
     virtual void printPrintfOutput(bool hangDetected) = 0;
 
     virtual bool usesSyncBuffer() = 0;
+    virtual bool usesRegionGroupBarrier() const = 0;
     virtual void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0;
+    virtual void patchRegionGroupBarrier(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0;
 
     virtual NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() = 0;
     virtual void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) = 0;
@@ -1191,6 +1191,10 @@ bool KernelImp::usesSyncBuffer() {
     return this->kernelImmData->getDescriptor().kernelAttributes.flags.usesSyncBuffer;
 }
 
+bool KernelImp::usesRegionGroupBarrier() const {
+    return this->kernelImmData->getDescriptor().kernelAttributes.flags.usesRegionGroupBarrier;
+}
+
 void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
     this->residencyContainer.push_back(gfxAllocation);
     NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
@@ -1198,6 +1202,14 @@ void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t b
                       static_cast<uintptr_t>(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)));
 }
 
+void KernelImp::patchRegionGroupBarrier(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
+    this->residencyContainer.push_back(gfxAllocation);
+
+    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
+                      this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.regionGroupBarrierBuffer,
+                      static_cast<uintptr_t>(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)));
+}
+
 uint32_t KernelImp::getSurfaceStateHeapDataSize() const {
     if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelImmData->getDescriptor())) {
         const auto bindlessHeapsHelper = this->module && this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper();
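The patch step itself is just a pointer write into the kernel's host-side cross-thread data staging buffer, at the stateless offset recorded in the descriptor. A minimal standalone sketch of the effect follows; the names are hypothetical, and `NEO::patchPointer` additionally honors the pointer size recorded in the `ArgDescPointer`, which this sketch hard-codes to 64 bits:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Sketch: write the GPU VA of the sub-allocated slot into the cross-thread
// data blob at the descriptor-recorded offset. The walker later hands this
// blob to each thread group, where the kernel reads it back as its
// region-group-barrier buffer pointer.
void patchBarrierPointer(uint8_t *crossThreadData, size_t statelessOffset,
                         uint64_t allocationGpuVa, size_t slotOffset) {
    const uint64_t slotGpuVa = allocationGpuVa + slotOffset; // slot start inside the pooled buffer
    std::memcpy(crossThreadData + statelessOffset, &slotGpuVa, sizeof(slotGpuVa));
}
```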
@@ -111,7 +111,9 @@ struct KernelImp : Kernel {
     void printPrintfOutput(bool hangDetected) override;
 
     bool usesSyncBuffer() override;
+    bool usesRegionGroupBarrier() const override;
     void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override;
+    void patchRegionGroupBarrier(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override;
 
     const uint8_t *getSurfaceStateHeapData() const override { return surfaceStateHeapData.get(); }
     uint32_t getSurfaceStateHeapDataSize() const override;
@@ -15,6 +15,7 @@
 #include "shared/test/common/cmd_parse/gen_cmd_parse.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/helpers/unit_test_helper.h"
+#include "shared/test/common/libult/ult_command_stream_receiver.h"
 #include "shared/test/common/mocks/mock_device.h"
 #include "shared/test/common/test_macros/hw_test.h"
 
@@ -485,6 +486,50 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
     }
 }
 
+HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenAppendLaunchKernelIsCalledThenPatchBuffer, IsAtLeastXeHpCore) {
+    auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
+    ultCsr->storeMakeResidentAllocations = true;
+
+    Mock<::L0::KernelImp> kernel;
+    auto pMockModule = std::unique_ptr<Module>(new Mock<Module>(device, nullptr));
+    kernel.module = pMockModule.get();
+
+    kernel.crossThreadData = std::make_unique<uint8_t[]>(64);
+    kernel.crossThreadDataSize = 64;
+
+    kernel.setGroupSize(4, 1, 1);
+    ze_group_count_t groupCount{8, 1, 1};
+
+    auto &kernelAttributes = kernel.immutableData.kernelDescriptor->kernelAttributes;
+    kernelAttributes.flags.usesRegionGroupBarrier = true;
+
+    auto &regionGroupBarrier = kernel.immutableData.kernelDescriptor->payloadMappings.implicitArgs.regionGroupBarrierBuffer;
+    regionGroupBarrier.stateless = 0x8;
+    regionGroupBarrier.pointerSize = 8;
+
+    const ze_command_queue_desc_t desc = {};
+    ze_result_t result = ZE_RESULT_SUCCESS;
+
+    std::unique_ptr<L0::CommandList> cmdList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::renderCompute, result));
+
+    CmdListKernelLaunchParams launchParams = {};
+    EXPECT_EQ(ZE_RESULT_SUCCESS, cmdList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false));
+
+    auto patchPtr = *reinterpret_cast<uint64_t *>(ptrOffset(kernel.crossThreadData.get(), regionGroupBarrier.stateless));
+    EXPECT_NE(0u, patchPtr);
+
+    auto allocIter = std::find_if(ultCsr->makeResidentAllocations.begin(), ultCsr->makeResidentAllocations.end(), [patchPtr](const std::pair<GraphicsAllocation *, uint32_t> &element) {
+        return element.first->getGpuAddressToPatch() == patchPtr;
+    });
+    ASSERT_NE(ultCsr->makeResidentAllocations.end(), allocIter);
+
+    EXPECT_EQ(ZE_RESULT_SUCCESS, cmdList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false));
+
+    auto patchPtr2 = *reinterpret_cast<uint64_t *>(ptrOffset(kernel.crossThreadData.get(), regionGroupBarrier.stateless));
+
+    EXPECT_EQ(patchPtr2, patchPtr + MemoryConstants::cacheLineSize);
+}
+
 HWTEST2_F(CommandListAppendLaunchKernel, whenAppendLaunchCooperativeKernelAndQueryKernelTimestampsToTheSameCmdlistThenFronEndStateIsNotChanged, IsAtLeastSkl) {
     Mock<::L0::KernelImp> kernel;
     auto pMockModule = std::unique_ptr<Module>(new Mock<Module>(device, nullptr));
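The second `appendLaunchKernel` in the new test is what pins down the pooling behavior: the first launch consumed one cache line, so the re-patched pointer must land exactly `MemoryConstants::cacheLineSize` past the first slot, within the same resident allocation.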
@@ -1396,7 +1396,8 @@ DecodeError populateKernelPayloadArgument(NEO::KernelDescriptor &dst, const Kern
         return populateWithOffsetChecked(dst.payloadMappings.dispatchTraits.regionGroupWgCount, sizeof(int32_t), Tags::Kernel::PayloadArgument::ArgType::regionGroupWgCount);
 
     case Types::Kernel::argTypeRegionGroupBarrierBuffer:
-        return populateWithOffsetChecked(dst.payloadMappings.dispatchTraits.regionGroupBarrierBuffer, sizeof(int64_t), Tags::Kernel::PayloadArgument::ArgType::regionGroupBarrierBuffer);
+        dst.kernelAttributes.flags.usesRegionGroupBarrier = true;
+        return populateArgPointerStateless(dst.payloadMappings.implicitArgs.regionGroupBarrierBuffer);
     }
 
     UNREACHABLE();
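Two related decoder changes land here: the region-group-barrier buffer stops being a plain `dispatchTraits` offset and becomes a full `ArgDescPointer` under `implicitArgs` (see the `KernelDescriptor` hunks below), and decoding the argument now also sets `kernelAttributes.flags.usesRegionGroupBarrier`, which is the signal `appendLaunchKernelWithParams` keys off.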
@@ -124,7 +124,8 @@ struct KernelDescriptor {
                 bool hasSample : 1;
                 // 3
                 bool usesAssert : 1;
-                bool reserved : 7;
+                bool usesRegionGroupBarrier : 1;
+                bool reserved : 6;
             };
             std::array<bool, 4> packed;
         } flags = {};
@@ -158,7 +159,6 @@ struct KernelDescriptor {
         CrossThreadDataOffset regionGroupSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
         CrossThreadDataOffset regionGroupDimension = undefined<CrossThreadDataOffset>;
         CrossThreadDataOffset regionGroupWgCount = undefined<CrossThreadDataOffset>;
-        CrossThreadDataOffset regionGroupBarrierBuffer = undefined<CrossThreadDataOffset>;
     } dispatchTraits;
 
     struct {
@@ -185,6 +185,7 @@ struct KernelDescriptor {
         ArgDescPointer syncBufferAddress;
         ArgDescPointer rtDispatchGlobals;
         ArgDescPointer assertBufferAddress;
+        ArgDescPointer regionGroupBarrierBuffer;
         CrossThreadDataOffset privateMemorySize = undefined<CrossThreadDataOffset>;
         CrossThreadDataOffset maxWorkGroupSize = undefined<CrossThreadDataOffset>;
         CrossThreadDataOffset simdSize = undefined<CrossThreadDataOffset>;
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2023 Intel Corporation
+ * Copyright (C) 2019-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,4 +42,21 @@ void SyncBufferHandler::allocateNewBuffer() {
     std::memset(cpuPointer, 0, bufferSize);
 }
 
+std::pair<GraphicsAllocation *, size_t> SyncBufferHandler::obtainAllocationAndOffset(size_t requiredSize) {
+    std::lock_guard<std::mutex> guard(this->mutex);
+
+    bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize);
+    if (isCurrentBufferFull) {
+        memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation);
+        allocateNewBuffer();
+        usedBufferSize = 0;
+    }
+
+    std::pair<GraphicsAllocation *, size_t> allocationAndOffset = {graphicsAllocation, usedBufferSize};
+
+    usedBufferSize += requiredSize;
+
+    return allocationAndOffset;
+}
+
 } // namespace NEO
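`obtainAllocationAndOffset` is a mutex-guarded bump allocator over the pooled buffer. Below is a standalone sketch of the same scheme under simplifying assumptions: fixed pool size, plain host memory, and immediate buffer reuse instead of the deferred `checkGpuUsageAndDestroyGraphicsAllocations` teardown. All names are illustrative, not the NEO API:

```cpp
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <utility>

// Bump-pointer sub-allocation: hand out (buffer, offset) pairs until the
// current pool cannot fit the request, then swap in a fresh pool.
class SlotPool {
  public:
    explicit SlotPool(size_t poolSize) : poolSize(poolSize) { allocateNewBuffer(); }

    std::pair<uint8_t *, size_t> obtainAllocationAndOffset(size_t requiredSize) {
        std::lock_guard<std::mutex> guard(mutex);
        if (usedSize + requiredSize > poolSize) {
            allocateNewBuffer(); // the real handler keeps the old buffer alive until the GPU is done with it
            usedSize = 0;
        }
        std::pair<uint8_t *, size_t> result{buffer.get(), usedSize};
        usedSize += requiredSize;
        return result;
    }

  private:
    void allocateNewBuffer() { buffer = std::make_unique<uint8_t[]>(poolSize); }

    std::mutex mutex;
    std::unique_ptr<uint8_t[]> buffer;
    size_t poolSize = 0;
    size_t usedSize = 0;
};
```

Two consecutive 64-byte requests against an empty pool return offsets 0 and 64 from the same buffer, which is precisely the stride the command-list test above asserts.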
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2023 Intel Corporation
+ * Copyright (C) 2019-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@ class SyncBufferHandler : NonCopyableOrMovableClass {
     void prepareForEnqueue(size_t workGroupsCount, KernelT &kernel);
     void makeResident(CommandStreamReceiver &csr);
 
+    std::pair<GraphicsAllocation *, size_t> obtainAllocationAndOffset(size_t requiredSize);
+
   protected:
     void allocateNewBuffer();
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,16 +10,8 @@
 template <typename KernelT>
 void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT &kernel) {
     auto requiredSize = alignUp(workGroupsCount, CommonConstants::maximalSizeOfAtomicType);
-    std::lock_guard<std::mutex> guard(this->mutex);
+    auto patchData = obtainAllocationAndOffset(requiredSize);
 
-    bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize);
-    if (isCurrentBufferFull) {
-        memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation);
-        allocateNewBuffer();
-        usedBufferSize = 0;
-    }
-
-    kernel.patchSyncBuffer(graphicsAllocation, usedBufferSize);
-
-    usedBufferSize += requiredSize;
+    kernel.patchSyncBuffer(patchData.first, patchData.second);
 }
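With the bookkeeping factored out, `prepareForEnqueue` shrinks to "compute size, obtain slot, patch kernel", and the cooperative-kernel sync buffer and the new region-group-barrier slots share one allocation path (and one lock) inside `obtainAllocationAndOffset`.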
@@ -5070,6 +5070,7 @@ TEST_F(decodeZeInfoKernelEntryTest, givenRegionArgTypesWhenArgSizeIsCorrectThenR
     for (uint32_t i = 0; i < 3; ++i) {
         EXPECT_EQ(16 + sizeof(uint32_t) * i, kernelDescriptor->payloadMappings.dispatchTraits.regionGroupSize[i]);
     }
+    EXPECT_FALSE(kernelDescriptor->kernelAttributes.flags.usesRegionGroupBarrier);
 
     ConstStringRef zeInfoRegionGroupDim = R"===(
 kernels:
@@ -5087,6 +5088,7 @@ TEST_F(decodeZeInfoKernelEntryTest, givenRegionArgTypesWhenArgSizeIsCorrectThenR
     EXPECT_TRUE(warnings.empty()) << warnings;
 
     EXPECT_EQ(16, kernelDescriptor->payloadMappings.dispatchTraits.regionGroupDimension);
+    EXPECT_FALSE(kernelDescriptor->kernelAttributes.flags.usesRegionGroupBarrier);
 
     ConstStringRef zeInfoRegionGroupCount = R"===(
 kernels:
@@ -5104,6 +5106,7 @@ TEST_F(decodeZeInfoKernelEntryTest, givenRegionArgTypesWhenArgSizeIsCorrectThenR
     EXPECT_TRUE(warnings.empty()) << warnings;
 
     EXPECT_EQ(16, kernelDescriptor->payloadMappings.dispatchTraits.regionGroupWgCount);
+    EXPECT_FALSE(kernelDescriptor->kernelAttributes.flags.usesRegionGroupBarrier);
 
     ConstStringRef zeInfoRegionGroupBarrier = R"===(
 kernels:
@@ -5120,7 +5123,8 @@ TEST_F(decodeZeInfoKernelEntryTest, givenRegionArgTypesWhenArgSizeIsCorrectThenR
     EXPECT_TRUE(errors.empty()) << errors;
     EXPECT_TRUE(warnings.empty()) << warnings;
 
-    EXPECT_EQ(16, kernelDescriptor->payloadMappings.dispatchTraits.regionGroupBarrierBuffer);
+    EXPECT_EQ(16, kernelDescriptor->payloadMappings.implicitArgs.regionGroupBarrierBuffer.stateless);
+    EXPECT_TRUE(kernelDescriptor->kernelAttributes.flags.usesRegionGroupBarrier);
 }
 
 TEST_F(decodeZeInfoKernelEntryTest, GivenArgTypeGlobalSizeWhenArgSizeValidThenPopulatesKernelDescriptor) {