compute-runtime/opencl/source/command_queue/gpgpu_walker_base.inl

171 lines
8.5 KiB
C++

/*
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/utilities/tag_allocator.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/command_queue/command_queue_hw.h"
#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/event/user_event.h"
#include "opencl/source/helpers/queue_helpers.h"
namespace NEO {
template <typename GfxFamily>
template <typename WalkerType>
void GpgpuWalkerHelper<GfxFamily>::setSystolicModeEnable(WalkerType *walkerCmd) {
}
template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
CommandQueue &commandQueue,
TagNodeBase &hwPerfCounter,
LinearStream *commandStream) {
const auto pPerformanceCounters = commandQueue.getPerfCounters();
const auto commandBufferType = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType())
? MetricsLibraryApi::GpuCommandBufferType::Compute
: MetricsLibraryApi::GpuCommandBufferType::Render;
const uint32_t size = pPerformanceCounters->getGpuCommandsSize(commandBufferType, true);
void *pBuffer = commandStream->getSpace(size);
pPerformanceCounters->getGpuCommands(commandBufferType, hwPerfCounter, true, size, pBuffer);
}
template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
CommandQueue &commandQueue,
TagNodeBase &hwPerfCounter,
LinearStream *commandStream) {
const auto pPerformanceCounters = commandQueue.getPerfCounters();
const auto commandBufferType = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType())
? MetricsLibraryApi::GpuCommandBufferType::Compute
: MetricsLibraryApi::GpuCommandBufferType::Render;
const uint32_t size = pPerformanceCounters->getGpuCommandsSize(commandBufferType, false);
void *pBuffer = commandStream->getSpace(size);
pPerformanceCounters->getGpuCommands(commandBufferType, hwPerfCounter, false, size, pBuffer);
}
template <typename GfxFamily>
size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel) {
return 0u;
}
template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool isMarkerWithProfiling, bool eventsInWaitlist, bool resolveDependenciesByPipecontrol, cl_event *outEvent) {
size_t expectedSizeCS = 0;
auto &gfxCoreHelper = commandQueue.getDevice().getGfxCoreHelper();
auto &commandQueueHw = static_cast<CommandQueueHw<GfxFamily> &>(commandQueue);
auto &rootDeviceEnvironment = commandQueue.getDevice().getRootDeviceEnvironment();
if (blitEnqueue) {
size_t expectedSizeCS = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<GfxFamily>();
if (commandQueueHw.isCacheFlushForBcsRequired()) {
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
}
return expectedSizeCS;
}
for (auto &dispatchInfo : multiDispatchInfo) {
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel(), dispatchInfo);
size_t kernelObjAuxCount = multiDispatchInfo.getKernelObjsForAuxTranslation() != nullptr ? multiDispatchInfo.getKernelObjsForAuxTranslation()->size() : 0;
expectedSizeCS += dispatchInfo.dispatchInitCommands.estimateCommandsSize(kernelObjAuxCount, rootDeviceEnvironment, commandQueueHw.isCacheFlushForBcsRequired());
expectedSizeCS += dispatchInfo.dispatchEpilogueCommands.estimateCommandsSize(kernelObjAuxCount, rootDeviceEnvironment, commandQueueHw.isCacheFlushForBcsRequired());
}
auto relaxedOrderingEnabled = commandQueue.getGpgpuCommandStreamReceiver().directSubmissionRelaxedOrderingEnabled();
if (relaxedOrderingEnabled) {
expectedSizeCS += 2 * EncodeSetMMIO<GfxFamily>::sizeREG;
}
if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
// add relaxed ordering cond_bb_start
expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(csrDeps, relaxedOrderingEnabled);
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite();
if (resolveDependenciesByPipecontrol) {
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
}
if (isMarkerWithProfiling) {
if (!eventsInWaitlist) {
expectedSizeCS += commandQueue.getGpgpuCommandStreamReceiver().getCmdsSizeForComputeBarrierCommand();
}
expectedSizeCS += 4 * EncodeStoreMMIO<GfxFamily>::size;
}
} else if (isMarkerWithProfiling) {
expectedSizeCS += 2 * MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
if (!gfxCoreHelper.useOnlyGlobalTimestamps()) {
expectedSizeCS += 2 * EncodeStoreMMIO<GfxFamily>::size;
}
}
if (multiDispatchInfo.peekMainKernel()) {
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeForCacheFlushAfterWalkerCommands(*multiDispatchInfo.peekMainKernel(), commandQueue);
}
if (debugManager.flags.PauseOnEnqueue.get() != -1) {
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier() * 2;
expectedSizeCS += NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait() * 2;
}
if (debugManager.flags.GpuScratchRegWriteAfterWalker.get() != -1) {
expectedSizeCS += sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM);
}
expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForMultiRootDeviceSyncNodesContainer<GfxFamily>(csrDeps);
if (outEvent) {
auto pEvent = castToObjectOrAbort<Event>(*outEvent);
if ((pEvent->getContext()->getRootDeviceIndices().size() > 1) && (!pEvent->isUserEvent())) {
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
}
}
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
if ((CL_COMMAND_BARRIER == eventType) && !commandQueue.isOOQEnabled() && eventsInWaitlist) {
expectedSizeCS += EncodeStoreMemory<GfxFamily>::getStoreDataImmSize();
}
return expectedSizeCS;
}
template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo) {
if (isCommandWithoutKernel(cmdType)) {
return EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
} else {
return EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel<typename GfxFamily::DefaultWalkerType>(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel, dispatchInfo);
}
}
template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) {
size_t size = 0;
if (reserveProfilingCmdsSpace) {
size += 2 * MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier() + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
return size;
}
template <typename GfxFamily>
template <typename WalkerType>
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacketFlushL3(WalkerType *walkerCmd,
const ProductHelper &productHelper,
bool flushL3AfterPostSyncForHostUsm,
bool flushL3AfterPostSyncForExternalAllocation) {
}
} // namespace NEO