/*
 * Copyright (C) 2018-2019 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "core/helpers/simd_helper.h"
#include "runtime/command_queue/gpgpu_walker_base.inl"
#include "runtime/helpers/engine_node_helper.h"

namespace NEO {

template <typename GfxFamily>
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
    WALKER_TYPE<GfxFamily> *walkerCmd,
    const size_t globalOffsets[3],
    const size_t startWorkGroups[3],
    const size_t numWorkGroups[3],
    const size_t localWorkSizesIn[3],
    uint32_t simd,
    uint32_t workDim,
    bool localIdsGenerationByRuntime,
    bool inlineDataProgrammingRequired,
    const iOpenCL::SPatchThreadPayload &threadPayload,
    uint32_t requiredWorkgroupOrder) {
    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];

    auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
    walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));

    walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
    walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
    walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));

    // Compute the execution mask that tells which SIMD lanes are active within a thread.
    // E.g. SIMD16 with localWorkSize == 20: remainderSimdLanes == 4, so the mask enables
    // lanes 0-3 (0b1111) in the trailing partial thread. A zero remainder means the
    // work-group size is a multiple of the SIMD width, so all lanes are enabled (~0).
    auto remainderSimdLanes = localWorkSize & (simd - 1);
    uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
    if (!executionMask)
        executionMask = ~executionMask;

    using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;

    walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
    walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
    walkerCmd->setSimdSize(getSimdConfig<WALKER_TYPE<GfxFamily>>(simd));

    walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
    walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
    walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));

    return localWorkSize;
}

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    LinearStream &commandStream,
    DeviceQueueHw<GfxFamily> &devQueueHw,
    PreemptionMode preemptionMode,
    SchedulerKernel &scheduler,
    IndirectHeap *ssh,
    IndirectHeap *dsh) {
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

    bool dcFlush = false;
    PipeControlHelper<GfxFamily>::addPipeControl(commandStream, dcFlush);

    uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
    const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
    const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
    const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);

    // Program media interface descriptor load
    HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
        commandStream,
        offsetInterfaceDescriptor,
        totalInterfaceDescriptorTableSize);

    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);

    // Determine SIMD size
    uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
    DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);

    // Patch our kernel constants
    *scheduler.globalWorkOffsetX = 0;
    *scheduler.globalWorkOffsetY = 0;
    *scheduler.globalWorkOffsetZ = 0;

    *scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
    *scheduler.globalWorkSizeY = 1;
    *scheduler.globalWorkSizeZ = 1;

    *scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
    *scheduler.localWorkSizeY = 1;
    *scheduler.localWorkSizeZ = 1;

    *scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
    *scheduler.localWorkSizeY2 = 1;
    *scheduler.localWorkSizeZ2 = 1;

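    // Note: localWorkSize{X,Y,Z}2 above and enqueuedLocalWorkSize{X,Y,Z} below are
    // separate cross-thread data slots that are deliberately patched with the same
    // values here. The assumption (not stated in this file) is that they back the
    // get_local_size() and get_enqueued_local_size() built-ins, which only differ
    // for non-uniform work-groups; the scheduler is always dispatched uniformly.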
    *scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
    *scheduler.enqueuedLocalWorkSizeY = 1;
    *scheduler.enqueuedLocalWorkSizeZ = 1;

    *scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
    *scheduler.numWorkGroupsY = 0;
    *scheduler.numWorkGroupsZ = 0;

    *scheduler.workDim = 1;

    // Send our indirect object data
    size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};

    // Create an IndirectHeap for the IOH, located at the end of the device-enqueue DSH
    size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
    IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace());
    indirectObjectHeap.getSpace(curbeOffset);
    IndirectHeap *ioh = &indirectObjectHeap;

    // Program the walker. This invokes execution, so all state must already be programmed
    auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;

    auto isCcsUsed = isCcs(devQueueHw.getDevice().getDefaultEngine().osContext->getEngineType());

    bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
    HardwareCommandsHelper<GfxFamily>::sendIndirectState(
        commandStream,
        *dsh,
        *ioh,
        *ssh,
        scheduler,
        simd,
        localWorkSizes,
        offsetInterfaceDescriptorTable,
        interfaceDescriptorIndex,
        preemptionMode,
        pGpGpuWalkerCmd,
        nullptr,
        true,
        isCcsUsed);

    // Enable the special WA DisableLSQCROPERFforOCL if needed
    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, true, inlineDataProgrammingRequired, *scheduler.getKernelInfo().patchInfo.threadPayload, 0u);

    // Disable the special WA DisableLSQCROPERFforOCL if needed
    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, false);

    // Omit the BB_START only when returning from the first scheduler run
    if (devQueueHw.getSchedulerReturnInstance() != 1) {
        PipeControlHelper<GfxFamily>::addPipeControl(commandStream, true);

        // Add a BB_START cmd to the SLB in the primary batch buffer
        auto *bbStart = static_cast<MI_BATCH_BUFFER_START *>(commandStream.getSpace(sizeof(MI_BATCH_BUFFER_START)));
        *bbStart = GfxFamily::cmdInitBatchBufferStart;
        bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
        uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
        bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
    }
}

template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
    LinearStream *cmdStream,
    WALKER_TYPE<GfxFamily> *walkerCmd,
    TagNode<TimestampPacketStorage> *timestampPacketNode,
    TimestampPacketStorage::WriteOperationType writeOperationType,
    const HardwareInfo &hwInfo) {
    if (TimestampPacketStorage::WriteOperationType::AfterWalker == writeOperationType) {
        uint64_t address = timestampPacketNode->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd);
        PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(*cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, address, 0, false, hwInfo);
    }
}

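// Worst-case command-stream size estimate for a single kernel enqueue. The sum below
// mirrors what EnqueueOperation emits: the GPGPU_WALKER, the CS commands reported by
// HardwareCommandsHelper, one PIPE_CONTROL (two when the pipe-control workaround is
// required), cache-flush and preemption-workaround space, and, when requested, the
// profiling and perf-counter commands. Reserving this up front ensures the command
// stream cannot run out of space while the commands are actually being programmed.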
template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
    size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + HardwareCommandsHelper<GfxFamily>::getSizeRequiredCS(pKernel) +
                  sizeof(PIPE_CONTROL) * (HardwareCommandsHelper<GfxFamily>::isPipeControlWArequired(pKernel->getDevice().getHardwareInfo()) ? 2 : 1);
    size += HardwareCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(commandQueue, pKernel, 0U);
    size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
    if (reserveProfilingCmdsSpace) {
        size += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
    }
    if (reservePerfCounters) {
        size += commandQueue.getPerfCounters()->getGpuCommandsSize(true);
        size += commandQueue.getPerfCounters()->getGpuCommandsSize(false);
    }
    size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);

    return size;
}

template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite() {
    return sizeof(PIPE_CONTROL);
}

} // namespace NEO