fix: zero sync and region barrier buffers before use on regular command lists

Related-To: NEO-13350

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2024-12-05 20:03:41 +00:00
committed by Compute-Runtime-Automation
parent 93fba587d0
commit 75139d2322
10 changed files with 103 additions and 8 deletions

View File

@@ -304,8 +304,8 @@ struct CommandListCoreFamily : public CommandListImp {
const void **pRanges);
ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]);
ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions);
void programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize);
ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions, size_t &patchIndex);
void programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize, size_t &patchIndex);
void appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation);
void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool maskLsb, uint32_t mask, bool workloadPartition, bool copyOperation);
void appendEventForProfiling(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool skipBarrierForEndProfiling, bool skipAddingEventToResidency, bool copyOperation);

View File

@@ -2806,7 +2806,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kernel, NEO::Device &device,
const ze_group_count_t &threadGroupDimensions) {
const ze_group_count_t &threadGroupDimensions, size_t &patchIndex) {
uint32_t maximalNumberOfWorkgroupsAllowed = kernel.suggestMaxCooperativeGroupCount(this->engineGroupType, false);
size_t requestedNumberOfWorkgroups = (threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ);
@@ -2817,17 +2817,41 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kern
auto patchData = NEO::KernelHelper::getSyncBufferAllocationOffset(device, requestedNumberOfWorkgroups);
kernel.patchSyncBuffer(patchData.first, patchData.second);
if (!isImmediateType()) {
patchIndex = commandsToPatch.size();
CommandToPatch syncBufferSpace;
syncBufferSpace.type = CommandToPatch::NoopSpace;
syncBufferSpace.offset = patchData.second;
syncBufferSpace.pDestination = ptrOffset(patchData.first->getUnderlyingBuffer(), patchData.second);
syncBufferSpace.patchSize = NEO::KernelHelper::getSyncBufferSize(requestedNumberOfWorkgroups);
commandsToPatch.push_back(syncBufferSpace);
}
return ZE_RESULT_SUCCESS;
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize) {
void CommandListCoreFamily<gfxCoreFamily>::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize, size_t &patchIndex) {
    // Obtains the region group barrier allocation/offset for this launch and patches it into the kernel.
    // On non-immediate command lists a NoopSpace patch entry describing that range is appended to
    // commandsToPatch and its position is reported through patchIndex; on immediate lists patchIndex
    // is left untouched.
    auto totalGroupCount = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;

    auto barrierPatchData = NEO::KernelHelper::getRegionGroupBarrierAllocationOffset(*device->getNEODevice(), totalGroupCount, localRegionSize);
    auto &barrierAllocation = barrierPatchData.first;
    auto &barrierOffset = barrierPatchData.second;

    kernel.patchRegionGroupBarrier(barrierAllocation, barrierOffset);

    if (isImmediateType()) {
        return;
    }

    // The new entry lands at the current end of the container.
    patchIndex = commandsToPatch.size();

    CommandToPatch noopEntry;
    noopEntry.type = CommandToPatch::NoopSpace;
    noopEntry.offset = barrierOffset;
    noopEntry.pDestination = ptrOffset(barrierAllocation->getUnderlyingBuffer(), barrierOffset);
    noopEntry.patchSize = NEO::KernelHelper::getRegionGroupBarrierSize(totalGroupCount, localRegionSize);
    commandsToPatch.push_back(noopEntry);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -3360,6 +3384,7 @@ void CommandListCoreFamily<gfxCoreFamily>::clearCommandsToPatch() {
break;
case CommandToPatch::ComputeWalkerInlineDataScratch:
case CommandToPatch::ComputeWalkerImplicitArgsScratch:
case CommandToPatch::NoopSpace:
break;
default:
UNRECOVERABLE_IF(true);

View File

@@ -154,7 +154,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
containsCooperativeKernelsFlag = (containsCooperativeKernelsFlag || launchParams.isCooperative);
if (kernel->usesSyncBuffer()) {
auto retVal = (launchParams.isCooperative
? programSyncBuffer(*kernel, *device->getNEODevice(), threadGroupDimensions)
? programSyncBuffer(*kernel, *device->getNEODevice(), threadGroupDimensions, launchParams.syncBufferPatchIndex)
: ZE_RESULT_ERROR_INVALID_ARGUMENT);
if (retVal) {
return retVal;

View File

@@ -250,7 +250,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
if (!launchParams.makeKernelCommandView) {
if (kernel->usesSyncBuffer()) {
auto retVal = (launchParams.isCooperative
? programSyncBuffer(*kernel, *neoDevice, threadGroupDimensions)
? programSyncBuffer(*kernel, *neoDevice, threadGroupDimensions, launchParams.syncBufferPatchIndex)
: ZE_RESULT_ERROR_INVALID_ARGUMENT);
if (retVal) {
return retVal;
@@ -258,7 +258,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
}
if (kernel->usesRegionGroupBarrier()) {
programRegionGroupBarrier(*kernel, threadGroupDimensions, launchParams.localRegionSize);
programRegionGroupBarrier(*kernel, threadGroupDimensions, launchParams.localRegionSize, launchParams.regionBarrierPatchIndex);
}
}

View File

@@ -32,6 +32,7 @@ struct CommandToPatch {
CbWaitEventLoadRegisterImm,
ComputeWalkerInlineDataScratch,
ComputeWalkerImplicitArgsScratch,
NoopSpace,
Invalid
};
void *pDestination = nullptr;
@@ -51,6 +52,8 @@ struct CmdListKernelLaunchParams {
void *hostPayloadBuffer = nullptr;
CommandToPatch *outSyncCommand = nullptr;
CommandToPatchContainer *outListCommands = nullptr;
size_t syncBufferPatchIndex = std::numeric_limits<size_t>::max();
size_t regionBarrierPatchIndex = std::numeric_limits<size_t>::max();
uint32_t externalPerThreadScratchSize[2] = {0U, 0U};
NEO::RequiredPartitionDim requiredPartitionDim = NEO::RequiredPartitionDim::none;
NEO::RequiredDispatchWalkOrder requiredDispatchWalkOrder = NEO::RequiredDispatchWalkOrder::none;

View File

@@ -191,6 +191,10 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
args);
break;
}
case CommandToPatch::NoopSpace: {
memset(commandToPatch.pDestination, 0, commandToPatch.patchSize);
break;
}
default: {
UNRECOVERABLE_IF(true);
}

View File

@@ -250,6 +250,10 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
std::memcpy(scratchAddressPatch, &fullScratchAddress, commandToPatch.patchSize);
break;
}
case CommandToPatch::NoopSpace: {
memset(commandToPatch.pDestination, 0, commandToPatch.patchSize);
break;
}
default:
UNRECOVERABLE_IF(true);
}

View File

@@ -66,8 +66,9 @@ HWTEST2_F(MultiTileImmediateCommandListTest, givenMultipleTilesWhenAllocatingBar
size_t requestedNumberOfWorkgroups = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;
size_t localRegionSize = 4;
size_t patchIndex = 0;
whiteBoxCmdList->programRegionGroupBarrier(mockKernel, threadGroupDimensions, localRegionSize);
whiteBoxCmdList->programRegionGroupBarrier(mockKernel, threadGroupDimensions, localRegionSize, patchIndex);
auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(1);

View File

@@ -464,6 +464,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
commandList->initialize(device, engineGroupType, 0u);
auto result = commandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, cooperativeParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_NE(std::numeric_limits<size_t>::max(), cooperativeParams.syncBufferPatchIndex);
auto mockSyncBufferHandler = reinterpret_cast<MockSyncBufferHandler *>(device->getNEODevice()->syncBufferHandler.get());
auto syncBufferAllocation = mockSyncBufferHandler->graphicsAllocation;
@@ -476,6 +477,13 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
EXPECT_EQ(syncBufferAllocation, kernel.getSyncBufferAllocation());
auto &cmdsToPatch = commandList->getCommandsToPatch();
ASSERT_NE(0u, cmdsToPatch.size());
auto noopParam = cmdsToPatch[cooperativeParams.syncBufferPatchIndex];
EXPECT_EQ(CommandToPatch::NoopSpace, noopParam.type);
EXPECT_NE(0u, noopParam.patchSize);
commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
commandList->initialize(device, engineGroupType, 0u);
CmdListKernelLaunchParams launchParams = {};
@@ -514,7 +522,18 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
result = commandList->appendLaunchKernelWithParams(&kernel, groupCount, nullptr, launchParams);
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result);
}
const ze_command_queue_desc_t desc = {};
std::unique_ptr<L0::CommandList> commandListImmediate(CommandList::createImmediate(productFamily, device, &desc, false, engineGroupType, result));
cooperativeParams.isCooperative = true;
cooperativeParams.syncBufferPatchIndex = std::numeric_limits<size_t>::max();
result = commandListImmediate->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, cooperativeParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(std::numeric_limits<size_t>::max(), cooperativeParams.syncBufferPatchIndex);
}
HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLaunchCooperativeKernelWithMakeViewIsCalledThenNoAllocationCreated, IsAtLeastXeHpCore) {
Mock<::L0::KernelImp> kernel;
auto pMockModule = std::unique_ptr<Module>(new Mock<Module>(device, nullptr));
@@ -599,6 +618,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenA
CmdListKernelLaunchParams launchParams = {};
launchParams.localRegionSize = 4;
EXPECT_EQ(ZE_RESULT_SUCCESS, cmdList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false));
EXPECT_EQ(std::numeric_limits<size_t>::max(), launchParams.regionBarrierPatchIndex);
auto patchPtr = *reinterpret_cast<uint64_t *>(ptrOffset(kernel.crossThreadData.get(), regionGroupBarrier.stateless));
EXPECT_NE(0u, patchPtr);
@@ -633,6 +653,18 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenA
auto offset = alignUp((requestedNumberOfWorkgroups / launchParams.localRegionSize) * (launchParams.localRegionSize + 1) * 2 * sizeof(uint32_t), MemoryConstants::cacheLineSize);
EXPECT_EQ(patchPtr2, patchPtr + offset);
std::unique_ptr<L0::CommandList> cmdListRegular(CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0, result, false));
EXPECT_EQ(ZE_RESULT_SUCCESS, cmdListRegular->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false));
EXPECT_NE(std::numeric_limits<size_t>::max(), launchParams.regionBarrierPatchIndex);
auto &cmdsToPatch = cmdListRegular->getCommandsToPatch();
ASSERT_NE(0u, cmdsToPatch.size());
auto noopParam = cmdsToPatch[launchParams.regionBarrierPatchIndex];
EXPECT_EQ(CommandToPatch::NoopSpace, noopParam.type);
EXPECT_NE(0u, noopParam.patchSize);
}
HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenAppendLaunchKernelWithMakeViewIsCalledThenNoPatchBuffer, IsAtLeastXeHpCore) {

View File

@@ -1027,5 +1027,31 @@ HWTEST2_F(CommandQueueScratchTests, givenCommandsToPatchToNotSupportedPlatformWh
commandList->commandsToPatch.clear();
}
using CommandQueueCreate = Test<DeviceFixture>;
HWTEST2_F(CommandQueueCreate, givenCommandsToPatchWithNoopSpacePatchWhenPatchCommandsIsCalledThenSpaceIsNooped, MatchAny) {
    // Verifies that patchCommands() zero-fills the memory described by a NoopSpace patch entry.
    ze_command_queue_desc_t queueDesc = {};
    NEO::CommandStreamReceiver *commandStreamReceiver = nullptr;
    device->getCsrForOrdinalAndIndex(&commandStreamReceiver, 0u, 0u, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, false);

    auto queue = std::make_unique<MockCommandQueueHw<gfxCoreFamily>>(device, commandStreamReceiver, &queueDesc);
    auto cmdList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();

    constexpr uint32_t noopSpaceSize = 64;
    auto dirtyMemory = std::make_unique<uint8_t[]>(noopSpaceSize);
    auto expectedMemory = std::make_unique<uint8_t[]>(noopSpaceSize);

    // Reference buffer is all zeros; the patched buffer starts out as all 0xFF.
    memset(expectedMemory.get(), 0x0, noopSpaceSize);
    memset(dirtyMemory.get(), 0xFF, noopSpaceSize);

    CommandToPatch noopPatch;
    noopPatch.type = CommandToPatch::NoopSpace;
    noopPatch.pDestination = dirtyMemory.get();
    noopPatch.patchSize = noopSpaceSize;
    cmdList->commandsToPatch.push_back(noopPatch);

    queue->patchCommands(*cmdList, 0, false);

    const auto compareResult = memcmp(dirtyMemory.get(), expectedMemory.get(), noopSpaceSize);
    EXPECT_EQ(0, compareResult);
}
} // namespace ult
} // namespace L0