feature: add option to make compute walker command view

Related-To: NEO-11972

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Author: Zbigniew Zdanowicz
Date: 2024-07-29 22:10:00 +00:00
Committed by: Compute-Runtime-Automation
Parent: 21b20578b4
Commit: 7d1b59f008
8 changed files with 131 additions and 48 deletions
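In short, the new option lets a caller encode a COMPUTE_WALKER and its indirect payload into caller-provided CPU buffers (a "command view") without consuming command-buffer or indirect-heap space. A minimal usage sketch, mirroring the unit test added below; the fixture helper, device/kernel handles and the 256-byte payload size are illustrative, not a public API:

DefaultWalkerType walkerView{};  // receives the walker programming on the CPU
uint8_t payloadView[256] = {};   // receives the cross-thread/per-thread payload

EncodeDispatchKernelArgs args = createDefaultDispatchKernelArgs(device, dispatchInterface, dims, false);
args.makeCommandView = true;          // build a view only; command buffer and heaps stay untouched
args.cpuWalkerBuffer = &walkerView;   // required whenever makeCommandView is set
args.cpuPayloadBuffer = payloadView;  // required whenever makeCommandView is set

NEO::EncodeDispatchKernel<FamilyType>::template encode<DefaultWalkerType>(commandContainer, args);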

View File

@@ -202,6 +202,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
reinterpret_cast<const void *>(&threadGroupDimensions), // threadGroupDimensions
nullptr, // outWalkerPtr
nullptr, // cpuWalkerBuffer
+ nullptr, // cpuPayloadBuffer
&additionalCommands, // additionalCommands
commandListPreemptionMode, // preemptionMode
launchParams.requiredPartitionDim, // requiredPartitionDim
@@ -225,6 +226,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
this->heaplessStateInitEnabled, // isHeaplessStateInitEnabled
false, // interruptEvent
!this->scratchAddressPatchingEnabled, // immediateScratchAddressPatching
+ false, // makeCommandView
};
NEO::EncodeDispatchKernel<GfxFamily>::encodeCommon(commandContainer, dispatchKernelArgs);
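The same two entries appear in every brace-initializer of EncodeDispatchKernelArgs touched by this change (runtime paths, tests and the fixture below): the struct is aggregate-initialized positionally, so each call site must add cpuPayloadBuffer right after cpuWalkerBuffer and makeCommandView at the end of the flag run. A small self-contained illustration of why the trailing flags are easy to get wrong without the per-field comments (the Flags struct here is a stand-in, not NEO code):

#include <cassert>

struct Flags {
    bool isHeaplessStateInitEnabled = false;
    bool interruptEvent = false;
    bool immediateScratchAddressPatching = false;
    bool makeCommandView = false; // newly appended flag
};

int main() {
    // Intended: scratch patching enabled and a command view requested.
    Flags intended{false, false, true, true};
    // Dropping one entry still compiles; the values silently land on the wrong members.
    Flags shifted{false, true, true};
    assert(intended.makeCommandView);
    assert(!shifted.makeCommandView && shifted.interruptEvent); // the view request was lost
    return 0;
}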

View File

@@ -338,6 +338,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
reinterpret_cast<const void *>(&threadGroupDimensions), // threadGroupDimensions
nullptr, // outWalkerPtr
launchParams.cmdWalkerBuffer, // cpuWalkerBuffer
+ nullptr, // cpuPayloadBuffer
&additionalCommands, // additionalCommands
kernelPreemptionMode, // preemptionMode
launchParams.requiredPartitionDim, // requiredPartitionDim
@@ -361,6 +362,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
this->heaplessStateInitEnabled, // isHeaplessStateInitEnabled
interruptEvent, // interruptEvent
!this->scratchAddressPatchingEnabled, // immediateScratchAddressPatching
+ false, // makeCommandView
};
NEO::EncodeDispatchKernel<GfxFamily>::encodeCommon(commandContainer, dispatchKernelArgs);
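This XeHP-and-later variant already forwards launchParams.cmdWalkerBuffer as cpuWalkerBuffer, so a caller can obtain a CPU copy of the walker even on the regular emission path; makeCommandView stays false here, so nothing is skipped. A sketch of that existing hook (only the cmdWalkerBuffer assignment is taken from this diff, everything around it is assumed):

typename FamilyType::DefaultWalkerType capturedWalker{};

CmdListKernelLaunchParams launchParams{};
launchParams.cmdWalkerBuffer = &capturedWalker; // ends up in dispatchKernelArgs.cpuWalkerBuffer above
// ... append the kernel as usual; after encoding, capturedWalker holds the same
// COMPUTE_WALKER contents that were written into the command buffer.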

View File

@@ -199,6 +199,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenA
threadGroupDimensions, // threadGroupDimensions
nullptr, // outWalkerPtr
nullptr, // cpuWalkerBuffer
+ nullptr, // cpuPayloadBuffer
nullptr, // additionalCommands
PreemptionMode::MidBatch, // preemptionMode
NEO::RequiredPartitionDim::none, // requiredPartitionDim
@@ -222,6 +223,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenA
false, // isHeaplessStateInitEnabled
false, // interruptEvent
false, // immediateScratchAddressPatching
+ false, // makeCommandView
};
NEO::EncodeDispatchKernel<FamilyType>::template encode<DefaultWalkerType>(commandContainer, dispatchKernelArgs);

View File

@@ -748,6 +748,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenA
threadGroupDimensions, // threadGroupDimensions
nullptr, // outWalkerPtr
nullptr, // cpuWalkerBuffer
+ nullptr, // cpuPayloadBuffer
nullptr, // additionalCommands
PreemptionMode::MidBatch, // preemptionMode
NEO::RequiredPartitionDim::none, // requiredPartitionDim
@@ -771,6 +772,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenA
false, // isHeaplessStateInitEnabled
false, // interruptEvent
false, // immediateScratchAddressPatching
+ false, // makeCommandView
};
EXPECT_THROW(NEO::EncodeDispatchKernel<FamilyType>::template encode<DefaultWalkerType>(commandContainer, dispatchKernelArgs), std::exception);
}

View File

@@ -57,6 +57,7 @@ struct EncodeDispatchKernelArgs {
const void *threadGroupDimensions = nullptr;
void *outWalkerPtr = nullptr;
void *cpuWalkerBuffer = nullptr;
+ void *cpuPayloadBuffer = nullptr;
std::list<void *> *additionalCommands = nullptr;
PreemptionMode preemptionMode = PreemptionMode::Initial;
NEO::RequiredPartitionDim requiredPartitionDim = NEO::RequiredPartitionDim::none;
@@ -80,6 +81,7 @@ struct EncodeDispatchKernelArgs {
bool isHeaplessStateInitEnabled = false;
bool interruptEvent = false;
bool immediateScratchAddressPatching = false;
+ bool makeCommandView = false;
bool requiresSystemMemoryFence() const {
return (isHostScopeSignalEvent && isKernelUsingSystemAllocation);
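The two new members extend the existing cpuWalkerBuffer hook: cpuPayloadBuffer receives the indirect (cross-thread/per-thread) data and makeCommandView switches the encoder into view-only mode. The encoder (next file) enforces with UNRECOVERABLE_IF that view mode comes with both CPU buffers. A small self-contained model of that contract; the struct and helper names are illustrative, not NEO code:

#include <cassert>
#include <cstdint>

struct DispatchArgsModel {
    void *cpuWalkerBuffer = nullptr;  // CPU destination for the walker command
    void *cpuPayloadBuffer = nullptr; // CPU destination for the indirect payload
    bool makeCommandView = false;     // encode into the CPU buffers only
};

// Mirrors the UNRECOVERABLE_IF guard added at the top of encode().
bool viewArgsValid(const DispatchArgsModel &args) {
    return !args.makeCommandView || (args.cpuWalkerBuffer != nullptr && args.cpuPayloadBuffer != nullptr);
}

int main() {
    uint8_t walker[64] = {};
    uint8_t payload[256] = {};
    assert(viewArgsValid({walker, payload, true}));   // view mode with both buffers: ok
    assert(viewArgsValid({nullptr, nullptr, false})); // regular encode: buffers optional
    assert(!viewArgsValid({walker, nullptr, true}));  // view mode without a payload buffer: rejected
    return 0;
}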

View File

@@ -52,6 +52,8 @@ template <typename WalkerType>
void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDispatchKernelArgs &args) {
using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;
+ UNRECOVERABLE_IF(args.makeCommandView && (args.cpuWalkerBuffer == nullptr || args.cpuPayloadBuffer == nullptr));
constexpr bool heaplessModeEnabled = Family::template isHeaplessMode<WalkerType>();
const HardwareInfo &hwInfo = args.device->getHardwareInfo();
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
@@ -71,10 +73,12 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
}
- bool systolicModeRequired = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode;
- if (container.systolicModeSupportRef() && (container.lastPipelineSelectModeRequiredRef() != systolicModeRequired)) {
- container.lastPipelineSelectModeRequiredRef() = systolicModeRequired;
- EncodeComputeMode<Family>::adjustPipelineSelect(container, kernelDescriptor);
+ if (!args.makeCommandView) {
+ bool systolicModeRequired = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode;
+ if (container.systolicModeSupportRef() && (container.lastPipelineSelectModeRequiredRef() != systolicModeRequired)) {
+ container.lastPipelineSelectModeRequiredRef() = systolicModeRequired;
+ EncodeComputeMode<Family>::adjustPipelineSelect(container, kernelDescriptor);
+ }
}
WalkerType walkerCmd = Family::template getInitGpuWalker<WalkerType>();
@@ -133,7 +137,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
sshProgrammingRequired = false;
}
- if (sshProgrammingRequired) {
+ if (sshProgrammingRequired && !args.makeCommandView) {
bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor);
if (isBindlessKernel) {
bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr;
@@ -186,7 +190,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
uint32_t samplerCount = 0;
if constexpr (Family::supportsSampler && heaplessModeEnabled == false) {
- if (args.device->getDeviceInfo().imageSupport) {
+ if (args.device->getDeviceInfo().imageSupport && !args.makeCommandView) {
uint32_t samplerStateOffset = 0;
@@ -244,24 +248,38 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, rootDeviceEnvironment);
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching + args.reserveExtraPayloadSpace;
{
- auto heap = container.getIndirectHeap(HeapType::indirectObject);
- UNRECOVERABLE_IF(!heap);
- heap->align(Family::cacheLineSize);
void *ptr = nullptr;
- if (args.isKernelDispatchedFromImmediateCmdList) {
- ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::indirectObject, iohRequiredSize, Family::indirectDataAlignment)->getSpace(iohRequiredSize);
- } else {
- ptr = container.getHeapSpaceAllowGrow(HeapType::indirectObject, iohRequiredSize);
- }
- UNRECOVERABLE_IF(!ptr);
- offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace);
- auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
- if (pImplicitArgs) {
- offsetThreadData -= ImplicitArgs::getSize();
- pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
- EncodeDispatchKernel<Family>::patchScratchAddressInImplicitArgs<heaplessModeEnabled>(*pImplicitArgs, scratchAddressForImmediatePatching, args.immediateScratchAddressPatching);
+ if (!args.makeCommandView) {
+ auto heap = container.getIndirectHeap(HeapType::indirectObject);
+ UNRECOVERABLE_IF(!heap);
+ heap->align(Family::cacheLineSize);
- ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), rootDeviceEnvironment);
+ if (args.isKernelDispatchedFromImmediateCmdList) {
+ ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::indirectObject, iohRequiredSize, Family::indirectDataAlignment)->getSpace(iohRequiredSize);
+ } else {
+ ptr = container.getHeapSpaceAllowGrow(HeapType::indirectObject, iohRequiredSize);
+ }
+ offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace);
+ auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
+ if (pImplicitArgs) {
+ offsetThreadData -= ImplicitArgs::getSize();
+ pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
+ EncodeDispatchKernel<Family>::patchScratchAddressInImplicitArgs<heaplessModeEnabled>(*pImplicitArgs, scratchAddressForImmediatePatching, args.immediateScratchAddressPatching);
+ ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), rootDeviceEnvironment);
+ }
+ if (args.isIndirect) {
+ auto gpuPtr = heap->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset);
+ uint64_t implicitArgsGpuPtr = 0u;
+ if (pImplicitArgs) {
+ implicitArgsGpuPtr = gpuPtr + inlineDataProgrammingOffset - ImplicitArgs::getSize();
+ }
+ EncodeIndirectParams<Family>::encode(container, gpuPtr, args.dispatchInterface, implicitArgsGpuPtr);
+ }
+ } else {
+ ptr = args.cpuPayloadBuffer;
}
if (sizeCrossThreadData > 0) {
@@ -269,15 +287,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
crossThreadData, sizeCrossThreadData);
}
- if (args.isIndirect) {
- auto gpuPtr = heap->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset);
- uint64_t implicitArgsGpuPtr = 0u;
- if (pImplicitArgs) {
- implicitArgsGpuPtr = gpuPtr + inlineDataProgrammingOffset - ImplicitArgs::getSize();
- }
- EncodeIndirectParams<Family>::encode(container, gpuPtr, args.dispatchInterface, implicitArgsGpuPtr);
- }
auto perThreadDataPtr = args.dispatchInterface->getPerThreadData();
if (perThreadDataPtr != nullptr) {
ptr = ptrOffset(ptr, sizeCrossThreadData);
@@ -286,7 +295,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
}
}
- if (args.isHeaplessStateInitEnabled == false) {
+ if (args.isHeaplessStateInitEnabled == false && !args.makeCommandView) {
if (container.isAnyHeapDirty() ||
args.requiresUncachedMocs) {
@@ -317,21 +326,25 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
}
}
- if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
- void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
- args.additionalCommands->push_back(commandBuffer);
+ if (!args.makeCommandView) {
+ if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
+ void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
+ args.additionalCommands->push_back(commandBuffer);
- EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
+ EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
+ }
}
uint8_t *inlineData = reinterpret_cast<uint8_t *>(walkerCmd.getInlineDataPointer());
EncodeDispatchKernel<Family>::programInlineDataHeapless<heaplessModeEnabled>(inlineData, args, container, offsetThreadData, scratchAddressForImmediatePatching);
if constexpr (heaplessModeEnabled == false) {
- walkerCmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
- walkerCmd.setIndirectDataLength(sizeThreadData);
+ if (!args.makeCommandView) {
+ walkerCmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
+ walkerCmd.setIndirectDataLength(sizeThreadData);
- container.getIndirectHeap(HeapType::indirectObject)->align(NEO::EncodeDispatchKernel<Family>::getDefaultIOHAlignment());
+ container.getIndirectHeap(HeapType::indirectObject)->align(NEO::EncodeDispatchKernel<Family>::getDefaultIOHAlignment());
+ }
}
EncodeDispatchKernel<Family>::encodeThreadData(walkerCmd,
@@ -413,22 +426,26 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
hwInfo);
} else {
args.partitionCount = 1;
- auto buffer = listCmdBufferStream->getSpaceForCmd<WalkerType>();
- args.outWalkerPtr = buffer;
- *buffer = walkerCmd;
+ if (!args.makeCommandView) {
+ auto buffer = listCmdBufferStream->getSpaceForCmd<WalkerType>();
+ args.outWalkerPtr = buffer;
+ *buffer = walkerCmd;
+ }
}
if (args.cpuWalkerBuffer) {
*reinterpret_cast<WalkerType *>(args.cpuWalkerBuffer) = walkerCmd;
}
- PreemptionHelper::applyPreemptionWaCmdsEnd<Family>(listCmdBufferStream, *args.device);
+ if (!args.makeCommandView) {
+ PreemptionHelper::applyPreemptionWaCmdsEnd<Family>(listCmdBufferStream, *args.device);
- if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) {
- void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
- args.additionalCommands->push_back(commandBuffer);
+ if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) {
+ void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
+ args.additionalCommands->push_back(commandBuffer);
- EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
+ EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
+ }
}
}
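Taken together, the guards above mean that with makeCommandView set the walker is still fully assembled on the CPU, but pipeline-select, surface-state, sampler and dirty-heap state programming are skipped, no indirect-heap space is allocated (the payload goes into cpuPayloadBuffer with the usual cross-thread-then-per-thread layout), no pause or preemption-workaround commands are emitted, and no space is reserved for the walker in the command stream; the only outputs are the copies placed in cpuWalkerBuffer and cpuPayloadBuffer. A compressed stand-alone model of that branching, with NEO types replaced by simple placeholders:

#include <cstdint>
#include <cstring>
#include <vector>

struct WalkerModel { uint32_t indirectDataStartAddress = 0; uint32_t indirectDataLength = 0; };

struct EncodeModelArgs {
    bool makeCommandView = false;
    void *cpuWalkerBuffer = nullptr;
    void *cpuPayloadBuffer = nullptr;
};

void encodeModel(std::vector<uint8_t> &cmdStream, std::vector<uint8_t> &indirectHeap,
                 const std::vector<uint8_t> &payload, EncodeModelArgs &args) {
    WalkerModel walker{};                       // the walker is always assembled on the CPU
    uint8_t *payloadDst = nullptr;

    if (!args.makeCommandView) {                // regular path: consume heap space
        auto offset = indirectHeap.size();
        indirectHeap.resize(offset + payload.size());
        payloadDst = indirectHeap.data() + offset;
        walker.indirectDataStartAddress = static_cast<uint32_t>(offset);
        walker.indirectDataLength = static_cast<uint32_t>(payload.size());
    } else {                                    // view path: write only into caller memory
        payloadDst = static_cast<uint8_t *>(args.cpuPayloadBuffer);
    }
    if (!payload.empty()) {
        std::memcpy(payloadDst, payload.data(), payload.size());
    }

    if (!args.makeCommandView) {                // only the regular path touches the command stream
        auto pos = cmdStream.size();
        cmdStream.resize(pos + sizeof(WalkerModel));
        std::memcpy(cmdStream.data() + pos, &walker, sizeof(WalkerModel));
    }
    if (args.cpuWalkerBuffer) {                 // a CPU copy is produced on both paths
        std::memcpy(args.cpuWalkerBuffer, &walker, sizeof(WalkerModel));
    }
}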

View File

@@ -1648,3 +1648,57 @@ HWTEST2_F(CommandEncodeStatesTest, givenForceComputeWalkerPostSyncFlushWithWrite
uint64_t expectedData = 0u;
EXPECT_EQ(expectedData, postSync.getImmediateData());
}
HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenRequestingCommandViewThenDoNotConsumeCmdBufferAndHeapSpace, IsAtLeastXeHpCore) {
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
uint32_t dims[] = {1, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
auto payloadHeap = cmdContainer->getIndirectHeap(HeapType::indirectObject);
auto payloadHeapUsed = payloadHeap->getUsed();
auto cmdBuffer = cmdContainer->getCommandStream();
auto cmdBufferUsed = cmdBuffer->getUsed();
uint8_t payloadView[256] = {};
dispatchInterface->getCrossThreadDataSizeResult = 64;
auto walkerPtr = std::make_unique<DefaultWalkerType>();
DefaultWalkerType *cpuWalkerPointer = walkerPtr.get();
bool requiresUncachedMocs = false;
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
dispatchArgs.makeCommandView = true;
dispatchArgs.cpuPayloadBuffer = payloadView;
dispatchArgs.cpuWalkerBuffer = cpuWalkerPointer;
EncodeDispatchKernel<FamilyType>::template encode<DefaultWalkerType>(*cmdContainer.get(), dispatchArgs);
EXPECT_EQ(payloadHeapUsed, payloadHeap->getUsed());
EXPECT_EQ(cmdBufferUsed, cmdBuffer->getUsed());
}
HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenRequestingCommandViewWithoutCpuPointersThenExpectUnrecoverable, IsAtLeastXeHpCore) {
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
uint32_t dims[] = {1, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
uint8_t payloadView[256] = {};
dispatchInterface->getCrossThreadDataSizeResult = 64;
auto walkerPtr = std::make_unique<DefaultWalkerType>();
DefaultWalkerType *cpuWalkerPointer = walkerPtr.get();
bool requiresUncachedMocs = false;
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
dispatchArgs.makeCommandView = true;
dispatchArgs.cpuPayloadBuffer = nullptr;
dispatchArgs.cpuWalkerBuffer = cpuWalkerPointer;
EXPECT_ANY_THROW(EncodeDispatchKernel<FamilyType>::template encode<DefaultWalkerType>(*cmdContainer.get(), dispatchArgs));
dispatchArgs.cpuPayloadBuffer = payloadView;
dispatchArgs.cpuWalkerBuffer = nullptr;
EXPECT_ANY_THROW(EncodeDispatchKernel<FamilyType>::template encode<DefaultWalkerType>(*cmdContainer.get(), dispatchArgs));
}

View File

@@ -50,6 +50,7 @@ EncodeDispatchKernelArgs CommandEncodeStatesFixture::createDefaultDispatchKernel
threadGroupDimensions, // threadGroupDimensions
nullptr, // outWalkerPtr
nullptr, // cpuWalkerBuffer
+ nullptr, // cpuPayloadBuffer
nullptr, // additionalCommands
PreemptionMode::Disabled, // preemptionMode
NEO::RequiredPartitionDim::none, // requiredPartitionDim
@@ -73,6 +74,7 @@ EncodeDispatchKernelArgs CommandEncodeStatesFixture::createDefaultDispatchKernel
false, // isHeaplessStateInitEnabled
false, // interruptEvent
false, // immediateScratchAddressPatching
+ false, // makeCommandView
};
return args;