refactor: move scratch and kab programming to a function

Related-To: NEO-7824

Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk 2024-04-09 15:25:08 +00:00 committed by Compute-Runtime-Automation
parent 6a55bbe6cd
commit 6ffa756457
10 changed files with 43 additions and 39 deletions

View File

@ -178,6 +178,9 @@ struct EncodeDispatchKernel {
template <typename WalkerType>
static void adjustWalkOrder(WalkerType &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
// Declaration only; the body is supplied outside this struct definition.
// Templated on heaplessModeEnabled so each mode can have its own instantiation.
// NOTE(review): at the visible call site inlineDataPtr is the walker command's
// inline-data pointer and offsetThreadData is the thread-data offset of the
// dispatch — confirm that all callers follow the same convention.
template <bool heaplessModeEnabled>
static void programInlineDataHeapless(uint8_t *inlineDataPtr, EncodeDispatchKernelArgs &args, CommandContainer &container, uint64_t offsetThreadData);
static size_t getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount);
static size_t getSizeRequiredSsh(const KernelInfo &kernelInfo);
inline static size_t additionalSizeRequiredDsh(uint32_t iddCount);

View File

@ -24,6 +24,7 @@ template void NEO::EncodeDispatchKernel<Family>::adjustWalkOrder<Family::Default
template void NEO::EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
template void NEO::EncodeDispatchKernel<Family>::setScratchAddress<false>(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr);
template void NEO::EncodeDispatchKernel<Family>::setScratchAddress<true>(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr);
template void NEO::EncodeDispatchKernel<Family>::programInlineDataHeapless<false>(uint8_t *inlineDataPtr, EncodeDispatchKernelArgs &args, CommandContainer &container, uint64_t offsetThreadData);
template struct NEO::EncodeStates<Family>;
template struct NEO::EncodeMath<Family>;

View File

@ -0,0 +1,16 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_container/command_encoder.h"
namespace NEO {
// Definition of programInlineDataHeapless with an intentionally empty body:
// none of the arguments (inlineDataPtr, args, container, offsetThreadData) are
// read or modified, so calling it has no effect.
// NOTE(review): presumably this is the definition picked up by families that
// do not program inline data for heapless mode — confirm against the
// per-family includes and explicit instantiations.
template <typename Family>
template <bool heaplessModeEnabled>
void EncodeDispatchKernel<Family>::programInlineDataHeapless(uint8_t *inlineDataPtr, EncodeDispatchKernelArgs &args, CommandContainer &container, uint64_t offsetThreadData) {
}
} // namespace NEO

View File

@ -87,24 +87,22 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
bool localIdsGenerationByRuntime = args.dispatchInterface->requiresGenerationOfLocalIdsByRuntime();
auto requiredWorkgroupOrder = args.dispatchInterface->getRequiredWorkgroupOrder();
bool inlineDataProgramming = EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(kernelDescriptor);
{
auto alloc = args.dispatchInterface->getIsaAllocation();
UNRECOVERABLE_IF(nullptr == alloc);
{
auto isaAllocation = args.dispatchInterface->getIsaAllocation();
UNRECOVERABLE_IF(nullptr == isaAllocation);
uint64_t kernelStartPointer = args.dispatchInterface->getIsaOffsetInParentAllocation();
if constexpr (heaplessModeEnabled) {
auto address = alloc->getGpuAddress() + args.dispatchInterface->getIsaOffsetInParentAllocation();
if (!localIdsGenerationByRuntime) {
address += kernelDescriptor.entryPoints.skipPerThreadDataLoad;
}
idd.setKernelStartPointer(address);
kernelStartPointer += isaAllocation->getGpuAddress();
} else {
auto offset = alloc->getGpuAddressToPatch() + args.dispatchInterface->getIsaOffsetInParentAllocation();
if (!localIdsGenerationByRuntime) {
offset += kernelDescriptor.entryPoints.skipPerThreadDataLoad;
}
idd.setKernelStartPointer(offset);
kernelStartPointer += isaAllocation->getGpuAddressToPatch();
}
if (!localIdsGenerationByRuntime) {
kernelStartPointer += kernelDescriptor.entryPoints.skipPerThreadDataLoad;
}
idd.setKernelStartPointer(kernelStartPointer);
}
if (args.dispatchInterface->getKernelDescriptor().kernelAttributes.flags.usesAssert && args.device->getL0Debugger() != nullptr) {
idd.setSoftwareExceptionEnable(1);
@ -231,11 +229,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
auto crossThreadData = args.dispatchInterface->getCrossThreadData();
uint32_t inlineDataProgrammingOffset = 0u;
bool inlineDataProgramming = EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(kernelDescriptor);
if (inlineDataProgramming) {
inlineDataProgrammingOffset = std::min(inlineDataSize, sizeCrossThreadData);
auto dest = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer());
memcpy_s(dest, inlineDataProgrammingOffset, crossThreadData, inlineDataProgrammingOffset);
memcpy_s(dest, inlineDataSize, crossThreadData, inlineDataProgrammingOffset);
sizeCrossThreadData -= inlineDataProgrammingOffset;
crossThreadData = ptrOffset(crossThreadData, inlineDataProgrammingOffset);
inlineDataProgramming = inlineDataProgrammingOffset != 0;
@ -325,30 +323,10 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
}
if constexpr (heaplessModeEnabled) {
auto inlineDataPointer = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer());
auto indirectDataPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.indirectDataPointerAddress;
auto heap = container.getIndirectHeap(HeapType::indirectObject);
auto address = heap->getHeapGpuBase() + offsetThreadData;
std::memcpy(inlineDataPointer + indirectDataPointerAddress.offset, &address, indirectDataPointerAddress.pointerSize);
uint8_t *inlineData = reinterpret_cast<uint8_t *>(walkerCmd.getInlineDataPointer());
EncodeDispatchKernel<Family>::programInlineDataHeapless<heaplessModeEnabled>(inlineData, args, container, offsetThreadData);
if (args.immediateScratchAddressPatching) {
auto requiredScratchSlot0Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
auto requiredScratchSlot1Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[1];
auto csr = args.device->getDefaultEngine().commandStreamReceiver;
NEO::IndirectHeap *ssh = nullptr;
if (csr->getGlobalStatelessHeapAllocation() != nullptr) {
ssh = csr->getGlobalStatelessHeap();
} else {
ssh = args.surfaceStateHeap ? args.surfaceStateHeap : container.getIndirectHeap(HeapType::surfaceState);
}
uint64_t scratchAddress = 0u;
EncodeDispatchKernel<Family>::template setScratchAddress<heaplessModeEnabled>(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, ssh, *csr);
auto scratchPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.scratchPointerAddress;
std::memcpy(inlineDataPointer + scratchPointerAddress.offset, &scratchAddress, scratchPointerAddress.pointerSize);
}
} else {
if constexpr (heaplessModeEnabled == false) {
walkerCmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
walkerCmd.setIndirectDataLength(sizeThreadData);

View File

@ -15,6 +15,7 @@ using Family = NEO::Gen11Family;
#include "shared/source/command_container/command_encoder.inl"
#include "shared/source/command_container/command_encoder_bdw_and_later.inl"
#include "shared/source/command_container/command_encoder_heap_addressing.inl"
#include "shared/source/command_container/encode_compute_mode_bdw_and_later.inl"
#include "shared/source/command_container/image_surface_state/compression_params_bdw_and_later.inl"

View File

@ -19,6 +19,7 @@ using Family = NEO::Gen12LpFamily;
#include "shared/source/command_container/command_encoder.inl"
#include "shared/source/command_container/command_encoder_bdw_and_later.inl"
#include "shared/source/command_container/command_encoder_heap_addressing.inl"
#include "shared/source/command_container/command_encoder_tgllp_and_later.inl"
#include "shared/source/command_container/encode_compute_mode_tgllp_and_later.inl"
#include "shared/source/command_container/image_surface_state/compression_params_bdw_and_later.inl"

View File

@ -13,6 +13,7 @@ using Family = NEO::Gen8Family;
#include "shared/source/command_container/command_encoder.inl"
#include "shared/source/command_container/command_encoder_bdw_and_later.inl"
#include "shared/source/command_container/command_encoder_heap_addressing.inl"
#include "shared/source/command_container/encode_compute_mode_bdw_and_later.inl"
#include "shared/source/command_container/image_surface_state/compression_params_bdw_and_later.inl"

View File

@ -13,6 +13,7 @@ using Family = NEO::Gen9Family;
#include "shared/source/command_container/command_encoder.inl"
#include "shared/source/command_container/command_encoder_bdw_and_later.inl"
#include "shared/source/command_container/command_encoder_heap_addressing.inl"
#include "shared/source/command_container/encode_compute_mode_bdw_and_later.inl"
#include "shared/source/command_container/image_surface_state/compression_params_bdw_and_later.inl"

View File

@ -18,6 +18,7 @@
using Family = NEO::XeHpcCoreFamily;
#include "shared/source/command_container/command_encoder_heap_addressing.inl"
#include "shared/source/command_container/command_encoder_tgllp_and_later.inl"
#include "shared/source/command_container/command_encoder_xe_hpc_core_and_later.inl"
#include "shared/source/command_container/command_encoder_xe_hpg_core_and_later.inl"

View File

@ -18,6 +18,7 @@
using Family = NEO::XeHpgCoreFamily;
#include "shared/source/command_container/command_encoder_heap_addressing.inl"
#include "shared/source/command_container/command_encoder_tgllp_and_later.inl"
#include "shared/source/command_container/command_encoder_xe_hpg_core_and_later.inl"
#include "shared/source/command_container/image_surface_state/compression_params_tgllp_and_later.inl"