/* * Copyright (C) 2019-2022 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "shared/source/helpers/hw_helper.h" #include "shared/source/helpers/pipe_control_args.h" #include "shared/source/helpers/preamble.h" #include "shared/source/helpers/string.h" #include "shared/source/memory_manager/memory_manager.h" #include "shared/source/utilities/tag_allocator.h" #include "opencl/source/command_queue/gpgpu_walker.h" #include "opencl/source/device_queue/device_queue_hw.h" #include "opencl/source/helpers/hardware_commands_helper.h" namespace NEO { template void DeviceQueueHw::allocateSlbBuffer() { auto slbSize = getMinimumSlbSize() + getWaCommandsSize(); slbSize *= 128; //num of enqueues slbSize += sizeof(MI_BATCH_BUFFER_START); slbSize = alignUp(slbSize, MemoryConstants::pageSize); slbSize += DeviceQueueHw::getExecutionModelCleanupSectionSize(); slbSize += (4 * MemoryConstants::pageSize); // +4 pages spec restriction slbSize = alignUp(slbSize, MemoryConstants::pageSize); slbBuffer = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), slbSize, GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER, device->getDeviceBitfield()}); } template void DeviceQueueHw::resetDeviceQueue() { auto &caps = device->getDeviceInfo(); auto igilEventPool = reinterpret_cast(eventPoolBuffer->getUnderlyingBuffer()); memset(eventPoolBuffer->getUnderlyingBuffer(), 0x0, eventPoolBuffer->getUnderlyingBufferSize()); igilEventPool->m_TimestampResolution = static_cast(device->getProfilingTimerResolution()); igilEventPool->m_size = caps.maxOnDeviceEvents; auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer()); igilQueue = igilCmdQueue; igilCmdQueue->m_controls.m_StackSize = static_cast((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1); igilCmdQueue->m_controls.m_StackTop = static_cast((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1); igilCmdQueue->m_controls.m_PreviousHead = IGIL_DEVICE_QUEUE_HEAD_INIT; igilCmdQueue->m_controls.m_IDTAfterFirstPhase = 1; igilCmdQueue->m_controls.m_CurrentIDToffset = 1; igilCmdQueue->m_controls.m_PreviousStorageTop = static_cast(queueStorageBuffer->getUnderlyingBufferSize()); igilCmdQueue->m_controls.m_PreviousStackTop = static_cast((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1); igilCmdQueue->m_controls.m_DebugNextBlockID = 0xFFFFFFFF; igilCmdQueue->m_controls.m_QstorageSize = static_cast(queueStorageBuffer->getUnderlyingBufferSize()); igilCmdQueue->m_controls.m_QstorageTop = static_cast(queueStorageBuffer->getUnderlyingBufferSize()); igilCmdQueue->m_controls.m_IsProfilingEnabled = static_cast(isProfilingEnabled()); igilCmdQueue->m_controls.m_IsSimulation = static_cast(device->isSimulation()); igilCmdQueue->m_controls.m_LastScheduleEventNumber = 0; igilCmdQueue->m_controls.m_PreviousNumberOfQueues = 0; igilCmdQueue->m_controls.m_EnqueueMarkerScheduled = 0; igilCmdQueue->m_controls.m_SecondLevelBatchOffset = 0; igilCmdQueue->m_controls.m_TotalNumberOfQueues = 0; igilCmdQueue->m_controls.m_EventTimestampAddress = 0; igilCmdQueue->m_controls.m_ErrorCode = 0; igilCmdQueue->m_controls.m_CurrentScheduleEventNumber = 0; igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder = 0x00; igilCmdQueue->m_controls.m_DebugNextBlockGWS = 0; // set first stack element in surface at value "1", it protects Scheduler in corner case when StackTop is empty after Child execution auto stack = static_cast(stackBuffer->getUnderlyingBuffer()); stack += ((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1); *stack = 1; igilCmdQueue->m_head = IGIL_DEVICE_QUEUE_HEAD_INIT; igilCmdQueue->m_size = static_cast(queueBuffer->getUnderlyingBufferSize() - sizeof(IGIL_CommandQueue)); igilCmdQueue->m_magic = IGIL_MAGIC_NUMBER; igilCmdQueue->m_controls.m_SchedulerEarlyReturn = DebugManager.flags.SchedulerSimulationReturnInstance.get(); igilCmdQueue->m_controls.m_SchedulerEarlyReturnCounter = 0; buildSlbDummyCommands(); igilCmdQueue->m_controls.m_SLBENDoffsetInBytes = -1; igilCmdQueue->m_controls.m_CriticalSection = ExecutionModelCriticalSection::Free; resetDSH(); } template void DeviceQueueHw::initPipeControl(PIPE_CONTROL *pc) { auto cmd = GfxFamily::cmdInitPipeControl; cmd.setStateCacheInvalidationEnable(0x1); cmd.setDcFlushEnable(true); cmd.setPipeControlFlushEnable(true); cmd.setTextureCacheInvalidationEnable(true); cmd.setCommandStreamerStallEnable(true); *pc = cmd; } template void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNodeBase *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) { // CleanUp Section auto offset = slbCS.getUsed(); auto alignmentSize = alignUp(offset, MemoryConstants::pageSize) - offset; slbCS.getSpace(alignmentSize); offset = slbCS.getUsed(); igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed()); GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true); using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; if (hwTimeStamp != nullptr) { uint64_t timeStampAddress = hwTimeStamp->getGpuAddress() + offsetof(HwTimeStamps, ContextCompleteTS); igilQueue->m_controls.m_EventTimestampAddress = timeStampAddress; addProfilingEndCmds(timeStampAddress); //enable preemption addLriCmd(false); } uint64_t criticalSectionAddress = (uint64_t)&igilQueue->m_controls.m_CriticalSection; PipeControlArgs args; MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( slbCS, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, criticalSectionAddress, ExecutionModelCriticalSection::Free, device->getHardwareInfo(), args); MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( slbCS, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, tagAddress, taskCount, device->getHardwareInfo(), args); addMediaStateClearCmds(); auto pBBE = slbCS.getSpaceForCmd(); *pBBE = GfxFamily::cmdInitBatchBufferEnd; igilQueue->m_controls.m_CleanupSectionSize = (uint32_t)(slbCS.getUsed() - offset); } template void DeviceQueueHw::resetDSH() { if (heaps[IndirectHeap::DYNAMIC_STATE]) { heaps[IndirectHeap::DYNAMIC_STATE]->replaceBuffer(heaps[IndirectHeap::DYNAMIC_STATE]->getCpuBase(), heaps[IndirectHeap::DYNAMIC_STATE]->getMaxAvailableSpace()); heaps[IndirectHeap::DYNAMIC_STATE]->getSpace(colorCalcStateSize); } } template IndirectHeap *DeviceQueueHw::getIndirectHeap(IndirectHeap::Type type) { UNRECOVERABLE_IF(type != IndirectHeap::DYNAMIC_STATE); if (!heaps[type]) { heaps[type] = new IndirectHeap(dshBuffer); // get space for colorCalc and 2 ID tables at the beginning heaps[type]->getSpace(colorCalcStateSize); } return heaps[type]; } template size_t DeviceQueueHw::getCSPrefetchSize() { return 512; } template void DeviceQueueHw::addLriCmd(bool setArbCheck) { // CTXT_PREMP_DBG offset constexpr uint32_t registerAddress = 0x2248u; uint32_t value = 0u; if (setArbCheck) { // set only bit 8 (Preempt On MI_ARB_CHK Only) value = 0x00000100; } LriHelper::program(&slbCS, registerAddress, value, false); } template size_t DeviceQueueHw::getExecutionModelCleanupSectionSize() { size_t totalSize = 0; totalSize += sizeof(PIPE_CONTROL) + 2 * sizeof(MI_LOAD_REGISTER_REG) + sizeof(MI_LOAD_REGISTER_IMM) + sizeof(PIPE_CONTROL) + sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE); totalSize += getProfilingEndCmdsSize(); totalSize += getMediaStateClearCmdsSize(); totalSize += 4 * sizeof(PIPE_CONTROL); totalSize += sizeof(MI_BATCH_BUFFER_END); return totalSize; } template size_t DeviceQueueHw::getProfilingEndCmdsSize() { size_t size = 0; size += sizeof(PIPE_CONTROL) + sizeof(MI_STORE_REGISTER_MEM); size += sizeof(MI_LOAD_REGISTER_IMM); return size; } template void DeviceQueueHw::addDcFlushToPipeControlWa(PIPE_CONTROL *pc) {} template uint64_t DeviceQueueHw::getBlockKernelStartPointer(const Device &device, const KernelInfo *blockInfo, bool isCcsUsed) { auto blockAllocation = blockInfo->getGraphicsAllocation(); DEBUG_BREAK_IF(!blockAllocation); auto blockKernelStartPointer = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu; auto &hardwareInfo = device.getHardwareInfo(); auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); if (blockAllocation && isCcsUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) { blockKernelStartPointer += blockInfo->kernelDescriptor.entryPoints.skipSetFFIDGP; } return blockKernelStartPointer; } } // namespace NEO