fix: add scratch ptr in implicit args patching for L0 regular cmdlists

Related-To: NEO-11874
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk
2024-07-18 11:03:36 +00:00
committed by Compute-Runtime-Automation
parent 153cda9a9f
commit 4008ccea05
9 changed files with 52 additions and 5 deletions

View File

@@ -20,6 +20,9 @@
// Forward declarations of NEO types that this file only names in declarations
// (e.g. as reference parameters), so their full definitions are not needed here.
namespace NEO {
enum class MemoryPool;
enum class ImageType;
struct EncodeDispatchKernelArgs;
struct KernelDescriptor;
} // namespace NEO
namespace L0 {
@@ -359,7 +362,7 @@ struct CommandListCoreFamily : public CommandListImp {
bool isInOrderNonWalkerSignalingRequired(const Event *event) const;
bool hasInOrderDependencies() const;
void appendFullSynchronizedDispatchInit();
void addPatchScratchAddressInImplicitArgs(CommandsToPatch &commandsToPatch, NEO::EncodeDispatchKernelArgs &args, const NEO::KernelDescriptor &kernelDescriptor, bool kernelNeedsImplicitArgs);
size_t addCmdForPatching(std::shared_ptr<NEO::InOrderExecInfo> *externalInOrderExecInfo, void *cmd1, void *cmd2, uint64_t counterValue, NEO::InOrderPatchCommandHelpers::PatchCmdType patchCmdType);
uint64_t getInOrderIncrementValue() const;
bool isSkippingInOrderBarrierAllowed(ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) const;

View File

@@ -4097,6 +4097,10 @@ void CommandListCoreFamily<gfxCoreFamily>::appendFullSynchronizedDispatchInit()
NEO::EncodeMiPredicate<GfxFamily>::encode(*cmdStream, NEO::MiPredicateType::disable);
}
// No-op implementation: the body is empty, so nothing is appended to commandsToPatch
// and args is left unmodified, regardless of kernelNeedsImplicitArgs.
// NOTE(review): judging by the name, a non-empty variant is expected to register the
// implicit-args scratch address for later patching — confirm where that variant lives.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::addPatchScratchAddressInImplicitArgs(CommandsToPatch &commandsToPatch, NEO::EncodeDispatchKernelArgs &args, const NEO::KernelDescriptor &kernelDescriptor, bool kernelNeedsImplicitArgs) {
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendSynchronizedDispatchCleanupSection() {
if (getSynchronizedDispatchMode() != NEO::SynchronizedDispatchMode::full) {

View File

@@ -126,6 +126,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
", Group count: ", threadGroupDimensions.groupCountX, ", ", threadGroupDimensions.groupCountY, ", ", threadGroupDimensions.groupCountZ,
", SIMD: ", kernelInfo->getMaxSimdSize());
bool kernelNeedsImplicitArgs = kernel->getImplicitArgs() != nullptr;
bool needScratchSpace = false;
bool kernelNeedsScratchSpace = false;
for (uint32_t slotId = 0u; slotId < 2; slotId++) {
@@ -373,6 +374,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
scratchInlineData.baseAddress = ssh->getGpuBase();
}
commandsToPatch.push_back(scratchInlineData);
addPatchScratchAddressInImplicitArgs(commandsToPatch, dispatchKernelArgs, kernelDescriptor, kernelNeedsImplicitArgs);
}
if (!this->isFlushTaskSubmissionEnabled) {

View File

@@ -31,6 +31,7 @@ struct CommandToPatch {
CbWaitEventSemaphoreWait,
CbWaitEventLoadRegisterImm,
ComputeWalkerInlineDataScratch,
ComputeWalkerImplicitArgsScratch,
Invalid
};
void *pDestination = nullptr;

View File

@@ -173,11 +173,11 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
bool patchNewInlineScratchAddress = false;
bool patchNewScratchAddress = false;
if (this->heaplessModeEnabled &&
(commandList.getCommandListPatchedPerThreadScratchSize(0) < perThreadScratchSpaceSlot0Size ||
commandList.getCommandListPatchedPerThreadScratchSize(1) < perThreadScratchSpaceSlot1Size)) {
patchNewInlineScratchAddress = true;
patchNewScratchAddress = true;
}
auto &commandsToPatch = commandList.getCommandsToPatch();
@@ -241,7 +241,16 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
break;
}
case CommandToPatch::ComputeWalkerInlineDataScratch: {
if (!patchNewInlineScratchAddress) {
if (!patchNewScratchAddress) {
continue;
}
uint64_t fullScratchAddress = scratchAddress + commandToPatch.baseAddress;
void *scratchAddressPatch = ptrOffset(commandToPatch.pDestination, commandToPatch.offset);
std::memcpy(scratchAddressPatch, &fullScratchAddress, commandToPatch.patchSize);
break;
}
case CommandToPatch::ComputeWalkerImplicitArgsScratch: {
if (!patchNewScratchAddress) {
continue;
}
uint64_t fullScratchAddress = scratchAddress + commandToPatch.baseAddress;
@@ -254,7 +263,7 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
}
}
if (patchNewInlineScratchAddress) {
if (patchNewScratchAddress) {
commandList.setCommandListPatchedPerThreadScratchSize(0, perThreadScratchSpaceSlot0Size);
commandList.setCommandListPatchedPerThreadScratchSize(1, perThreadScratchSpaceSlot1Size);
}

View File

@@ -32,6 +32,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass = ::L0::CommandListCoreFamily<gfxCoreFamily>;
using BaseClass::addCmdForPatching;
using BaseClass::addFlushRequiredCommand;
using BaseClass::addPatchScratchAddressInImplicitArgs;
using BaseClass::allocateOrReuseKernelPrivateMemoryIfNeeded;
using BaseClass::allowCbWaitEventsNoopDispatch;
using BaseClass::appendBlitFill;