mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-30 09:58:55 +08:00
fix: zero sync and region barrier buffers before use on regular command lists
Related-To: NEO-13350
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
93fba587d0
commit
75139d2322
@@ -304,8 +304,8 @@ struct CommandListCoreFamily : public CommandListImp {
|
||||
const void **pRanges);
|
||||
|
||||
ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]);
|
||||
ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions);
|
||||
void programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize);
|
||||
ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions, size_t &patchIndex);
|
||||
void programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize, size_t &patchIndex);
|
||||
void appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation);
|
||||
void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool maskLsb, uint32_t mask, bool workloadPartition, bool copyOperation);
|
||||
void appendEventForProfiling(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool skipBarrierForEndProfiling, bool skipAddingEventToResidency, bool copyOperation);
|
||||
|
||||
@@ -2806,7 +2806,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kernel, NEO::Device &device,
|
||||
const ze_group_count_t &threadGroupDimensions) {
|
||||
const ze_group_count_t &threadGroupDimensions, size_t &patchIndex) {
|
||||
uint32_t maximalNumberOfWorkgroupsAllowed = kernel.suggestMaxCooperativeGroupCount(this->engineGroupType, false);
|
||||
|
||||
size_t requestedNumberOfWorkgroups = (threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ);
|
||||
@@ -2817,17 +2817,41 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kern
|
||||
auto patchData = NEO::KernelHelper::getSyncBufferAllocationOffset(device, requestedNumberOfWorkgroups);
|
||||
kernel.patchSyncBuffer(patchData.first, patchData.second);
|
||||
|
||||
if (!isImmediateType()) {
|
||||
patchIndex = commandsToPatch.size();
|
||||
|
||||
CommandToPatch syncBufferSpace;
|
||||
syncBufferSpace.type = CommandToPatch::NoopSpace;
|
||||
syncBufferSpace.offset = patchData.second;
|
||||
syncBufferSpace.pDestination = ptrOffset(patchData.first->getUnderlyingBuffer(), patchData.second);
|
||||
syncBufferSpace.patchSize = NEO::KernelHelper::getSyncBufferSize(requestedNumberOfWorkgroups);
|
||||
|
||||
commandsToPatch.push_back(syncBufferSpace);
|
||||
}
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandListCoreFamily<gfxCoreFamily>::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize) {
|
||||
void CommandListCoreFamily<gfxCoreFamily>::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize, size_t &patchIndex) {
|
||||
auto neoDevice = device->getNEODevice();
|
||||
|
||||
auto threadGroupCount = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;
|
||||
auto patchData = NEO::KernelHelper::getRegionGroupBarrierAllocationOffset(*neoDevice, threadGroupCount, localRegionSize);
|
||||
|
||||
kernel.patchRegionGroupBarrier(patchData.first, patchData.second);
|
||||
|
||||
if (!isImmediateType()) {
|
||||
patchIndex = commandsToPatch.size();
|
||||
|
||||
CommandToPatch regionBarrierSpace;
|
||||
regionBarrierSpace.type = CommandToPatch::NoopSpace;
|
||||
regionBarrierSpace.offset = patchData.second;
|
||||
regionBarrierSpace.pDestination = ptrOffset(patchData.first->getUnderlyingBuffer(), patchData.second);
|
||||
regionBarrierSpace.patchSize = NEO::KernelHelper::getRegionGroupBarrierSize(threadGroupCount, localRegionSize);
|
||||
|
||||
commandsToPatch.push_back(regionBarrierSpace);
|
||||
}
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
@@ -3360,6 +3384,7 @@ void CommandListCoreFamily<gfxCoreFamily>::clearCommandsToPatch() {
|
||||
break;
|
||||
case CommandToPatch::ComputeWalkerInlineDataScratch:
|
||||
case CommandToPatch::ComputeWalkerImplicitArgsScratch:
|
||||
case CommandToPatch::NoopSpace:
|
||||
break;
|
||||
default:
|
||||
UNRECOVERABLE_IF(true);
|
||||
|
||||
@@ -154,7 +154,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
|
||||
containsCooperativeKernelsFlag = (containsCooperativeKernelsFlag || launchParams.isCooperative);
|
||||
if (kernel->usesSyncBuffer()) {
|
||||
auto retVal = (launchParams.isCooperative
|
||||
? programSyncBuffer(*kernel, *device->getNEODevice(), threadGroupDimensions)
|
||||
? programSyncBuffer(*kernel, *device->getNEODevice(), threadGroupDimensions, launchParams.syncBufferPatchIndex)
|
||||
: ZE_RESULT_ERROR_INVALID_ARGUMENT);
|
||||
if (retVal) {
|
||||
return retVal;
|
||||
|
||||
@@ -250,7 +250,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
|
||||
if (!launchParams.makeKernelCommandView) {
|
||||
if (kernel->usesSyncBuffer()) {
|
||||
auto retVal = (launchParams.isCooperative
|
||||
? programSyncBuffer(*kernel, *neoDevice, threadGroupDimensions)
|
||||
? programSyncBuffer(*kernel, *neoDevice, threadGroupDimensions, launchParams.syncBufferPatchIndex)
|
||||
: ZE_RESULT_ERROR_INVALID_ARGUMENT);
|
||||
if (retVal) {
|
||||
return retVal;
|
||||
@@ -258,7 +258,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
|
||||
}
|
||||
|
||||
if (kernel->usesRegionGroupBarrier()) {
|
||||
programRegionGroupBarrier(*kernel, threadGroupDimensions, launchParams.localRegionSize);
|
||||
programRegionGroupBarrier(*kernel, threadGroupDimensions, launchParams.localRegionSize, launchParams.regionBarrierPatchIndex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ struct CommandToPatch {
|
||||
CbWaitEventLoadRegisterImm,
|
||||
ComputeWalkerInlineDataScratch,
|
||||
ComputeWalkerImplicitArgsScratch,
|
||||
NoopSpace,
|
||||
Invalid
|
||||
};
|
||||
void *pDestination = nullptr;
|
||||
@@ -51,6 +52,8 @@ struct CmdListKernelLaunchParams {
|
||||
void *hostPayloadBuffer = nullptr;
|
||||
CommandToPatch *outSyncCommand = nullptr;
|
||||
CommandToPatchContainer *outListCommands = nullptr;
|
||||
size_t syncBufferPatchIndex = std::numeric_limits<size_t>::max();
|
||||
size_t regionBarrierPatchIndex = std::numeric_limits<size_t>::max();
|
||||
uint32_t externalPerThreadScratchSize[2] = {0U, 0U};
|
||||
NEO::RequiredPartitionDim requiredPartitionDim = NEO::RequiredPartitionDim::none;
|
||||
NEO::RequiredDispatchWalkOrder requiredDispatchWalkOrder = NEO::RequiredDispatchWalkOrder::none;
|
||||
|
||||
@@ -191,6 +191,10 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
|
||||
args);
|
||||
break;
|
||||
}
|
||||
case CommandToPatch::NoopSpace: {
|
||||
memset(commandToPatch.pDestination, 0, commandToPatch.patchSize);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
|
||||
@@ -250,6 +250,10 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
|
||||
std::memcpy(scratchAddressPatch, &fullScratchAddress, commandToPatch.patchSize);
|
||||
break;
|
||||
}
|
||||
case CommandToPatch::NoopSpace: {
|
||||
memset(commandToPatch.pDestination, 0, commandToPatch.patchSize);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
|
||||
@@ -66,8 +66,9 @@ HWTEST2_F(MultiTileImmediateCommandListTest, givenMultipleTilesWhenAllocatingBar
|
||||
size_t requestedNumberOfWorkgroups = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;
|
||||
|
||||
size_t localRegionSize = 4;
|
||||
size_t patchIndex = 0;
|
||||
|
||||
whiteBoxCmdList->programRegionGroupBarrier(mockKernel, threadGroupDimensions, localRegionSize);
|
||||
whiteBoxCmdList->programRegionGroupBarrier(mockKernel, threadGroupDimensions, localRegionSize, patchIndex);
|
||||
|
||||
auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(1);
|
||||
|
||||
|
||||
@@ -464,6 +464,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
|
||||
commandList->initialize(device, engineGroupType, 0u);
|
||||
auto result = commandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, cooperativeParams, false);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
EXPECT_NE(std::numeric_limits<size_t>::max(), cooperativeParams.syncBufferPatchIndex);
|
||||
|
||||
auto mockSyncBufferHandler = reinterpret_cast<MockSyncBufferHandler *>(device->getNEODevice()->syncBufferHandler.get());
|
||||
auto syncBufferAllocation = mockSyncBufferHandler->graphicsAllocation;
|
||||
@@ -476,6 +477,13 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
|
||||
|
||||
EXPECT_EQ(syncBufferAllocation, kernel.getSyncBufferAllocation());
|
||||
|
||||
auto &cmdsToPatch = commandList->getCommandsToPatch();
|
||||
ASSERT_NE(0u, cmdsToPatch.size());
|
||||
|
||||
auto noopParam = cmdsToPatch[cooperativeParams.syncBufferPatchIndex];
|
||||
EXPECT_EQ(CommandToPatch::NoopSpace, noopParam.type);
|
||||
EXPECT_NE(0u, noopParam.patchSize);
|
||||
|
||||
commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
|
||||
commandList->initialize(device, engineGroupType, 0u);
|
||||
CmdListKernelLaunchParams launchParams = {};
|
||||
@@ -514,7 +522,18 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLau
|
||||
result = commandList->appendLaunchKernelWithParams(&kernel, groupCount, nullptr, launchParams);
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result);
|
||||
}
|
||||
|
||||
const ze_command_queue_desc_t desc = {};
|
||||
std::unique_ptr<L0::CommandList> commandListImmediate(CommandList::createImmediate(productFamily, device, &desc, false, engineGroupType, result));
|
||||
|
||||
cooperativeParams.isCooperative = true;
|
||||
cooperativeParams.syncBufferPatchIndex = std::numeric_limits<size_t>::max();
|
||||
|
||||
result = commandListImmediate->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, cooperativeParams, false);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
EXPECT_EQ(std::numeric_limits<size_t>::max(), cooperativeParams.syncBufferPatchIndex);
|
||||
}
|
||||
|
||||
HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLaunchCooperativeKernelWithMakeViewIsCalledThenNoAllocationCreated, IsAtLeastXeHpCore) {
|
||||
Mock<::L0::KernelImp> kernel;
|
||||
auto pMockModule = std::unique_ptr<Module>(new Mock<Module>(device, nullptr));
|
||||
@@ -599,6 +618,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenA
|
||||
CmdListKernelLaunchParams launchParams = {};
|
||||
launchParams.localRegionSize = 4;
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, cmdList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false));
|
||||
EXPECT_EQ(std::numeric_limits<size_t>::max(), launchParams.regionBarrierPatchIndex);
|
||||
|
||||
auto patchPtr = *reinterpret_cast<uint64_t *>(ptrOffset(kernel.crossThreadData.get(), regionGroupBarrier.stateless));
|
||||
EXPECT_NE(0u, patchPtr);
|
||||
@@ -633,6 +653,18 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenA
|
||||
auto offset = alignUp((requestedNumberOfWorkgroups / launchParams.localRegionSize) * (launchParams.localRegionSize + 1) * 2 * sizeof(uint32_t), MemoryConstants::cacheLineSize);
|
||||
|
||||
EXPECT_EQ(patchPtr2, patchPtr + offset);
|
||||
|
||||
std::unique_ptr<L0::CommandList> cmdListRegular(CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0, result, false));
|
||||
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, cmdListRegular->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false));
|
||||
EXPECT_NE(std::numeric_limits<size_t>::max(), launchParams.regionBarrierPatchIndex);
|
||||
|
||||
auto &cmdsToPatch = cmdListRegular->getCommandsToPatch();
|
||||
ASSERT_NE(0u, cmdsToPatch.size());
|
||||
|
||||
auto noopParam = cmdsToPatch[launchParams.regionBarrierPatchIndex];
|
||||
EXPECT_EQ(CommandToPatch::NoopSpace, noopParam.type);
|
||||
EXPECT_NE(0u, noopParam.patchSize);
|
||||
}
|
||||
|
||||
HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingRegionGroupBarrierWhenAppendLaunchKernelWithMakeViewIsCalledThenNoPatchBuffer, IsAtLeastXeHpCore) {
|
||||
|
||||
@@ -1027,5 +1027,31 @@ HWTEST2_F(CommandQueueScratchTests, givenCommandsToPatchToNotSupportedPlatformWh
|
||||
commandList->commandsToPatch.clear();
|
||||
}
|
||||
|
||||
using CommandQueueCreate = Test<DeviceFixture>;
|
||||
|
||||
// Checks that patchCommands zero-fills the destination range described by a
// NoopSpace entry: a buffer pre-filled with 0xFF must compare equal to an
// all-zero reference buffer after patching.
HWTEST2_F(CommandQueueCreate, givenCommandsToPatchWithNoopSpacePatchWhenPatchCommandsIsCalledThenSpaceIsNooped, MatchAny) {
    ze_command_queue_desc_t desc = {};
    NEO::CommandStreamReceiver *csr = nullptr;
    device->getCsrForOrdinalAndIndex(&csr, 0u, 0u, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, false);
    auto commandQueue = std::make_unique<MockCommandQueueHw<gfxCoreFamily>>(device, csr, &desc);
    auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();

    constexpr uint32_t dataSize = 64;

    // Reference contents expected after patching.
    auto expectedBytes = std::make_unique<uint8_t[]>(dataSize);
    memset(expectedBytes.get(), 0x0, dataSize);

    // Destination starts out fully dirty so any untouched byte is detectable.
    auto dirtyBytes = std::make_unique<uint8_t[]>(dataSize);
    memset(dirtyBytes.get(), 0xFF, dataSize);

    CommandToPatch noopPatch;
    noopPatch.patchSize = dataSize;
    noopPatch.pDestination = dirtyBytes.get();
    noopPatch.type = CommandToPatch::NoopSpace;
    commandList->commandsToPatch.push_back(noopPatch);

    commandQueue->patchCommands(*commandList, 0, false);

    EXPECT_EQ(0, memcmp(dirtyBytes.get(), expectedBytes.get(), dataSize));
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
|
||||
Reference in New Issue
Block a user