/* * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
*/

#pragma once
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/preamble.h"
#include "runtime/helpers/string.h"
#include "runtime/memory_manager/memory_manager.h"

// NOTE(review): in this copy of the file every angle-bracket argument list is
// missing: definitions read `template void ...`, casts read
// `reinterpret_cast(...)` / `static_cast(...)`, and `getSpaceForCmd()` has no
// command type, while `GfxFamily` is referenced but never declared in the
// visible text. The lists were presumably lost during extraction - confirm
// against the upstream file before building.

namespace OCLRT {

// Computes the size of the SLB buffer and allocates it through the device's
// memory manager, storing the allocation in slbBuffer.
//
// Size = 128 enqueue slots of (getMinimumSlbSize() + getWaCommandsSize())
//        + one MI_BATCH_BUFFER_START, rounded up to a page
//        + getExecutionModelCleanupSectionSize()
//        + 4 extra pages, rounded up to a page again.
//
// NOTE(review): 128 is hard-coded here while buildSlbDummyCommands() loops over
// numberOfDeviceEnqueues; presumably the two are equal - confirm.
template void DeviceQueueHw::allocateSlbBuffer() {
    auto slbSize = getMinimumSlbSize() + getWaCommandsSize();
    slbSize *= 128; //num of enqueues
    slbSize += sizeof(MI_BATCH_BUFFER_START);
    slbSize = alignUp(slbSize, MemoryConstants::pageSize);
    slbSize += DeviceQueueHw::getExecutionModelCleanupSectionSize();
    slbSize += (4 * MemoryConstants::pageSize); // +4 pages spec restriction
    slbSize = alignUp(slbSize, MemoryConstants::pageSize);

    slbBuffer = device->getMemoryManager()->allocateGraphicsMemory(slbSize);
}

// Re-initialises the host-visible state of the device queue:
//  - zeroes the event pool buffer and sets its m_size to maxOnDeviceEvents,
//  - rewrites the control fields of the queue structure held in queueBuffer
//    (also caching the pointer in igilQueue),
//  - writes 1 into the last cl_uint element of stackBuffer,
//  - rebuilds the SLB commands and resets the dynamic state heap.
template void DeviceQueueHw::resetDeviceQueue() {
    auto &caps = device->getDeviceInfo();
    auto igilEventPool = reinterpret_cast(eventPoolBuffer->getUnderlyingBuffer());

    memset(eventPoolBuffer->getUnderlyingBuffer(), 0x0, eventPoolBuffer->getUnderlyingBufferSize());
    igilEventPool->m_size = caps.maxOnDeviceEvents;

    auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer());
    igilQueue = igilCmdQueue;

    // Stack size, stack top and previous stack top all start at the index of
    // the last cl_uint element of stackBuffer.
    igilCmdQueue->m_controls.m_StackSize =
        static_cast((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
    igilCmdQueue->m_controls.m_StackTop =
        static_cast((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
    igilCmdQueue->m_controls.m_PreviousHead = IGIL_DEVICE_QUEUE_HEAD_INIT;
    igilCmdQueue->m_controls.m_IDTAfterFirstPhase = 1;
    igilCmdQueue->m_controls.m_CurrentIDToffset = 1;
    igilCmdQueue->m_controls.m_PreviousStorageTop =
        static_cast(queueStorageBuffer->getUnderlyingBufferSize());
    igilCmdQueue->m_controls.m_PreviousStackTop =
        static_cast((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
    igilCmdQueue->m_controls.m_DebugNextBlockID = 0xFFFFFFFF;
    igilCmdQueue->m_controls.m_QstorageSize =
        static_cast(queueStorageBuffer->getUnderlyingBufferSize());
    igilCmdQueue->m_controls.m_QstorageTop =
        static_cast(queueStorageBuffer->getUnderlyingBufferSize());
    igilCmdQueue->m_controls.m_IsProfilingEnabled = static_cast(isProfilingEnabled());
    igilCmdQueue->m_controls.m_IsSimulation = static_cast(device->isSimulation());

    igilCmdQueue->m_controls.m_LastScheduleEventNumber = 0;
    igilCmdQueue->m_controls.m_PreviousNumberOfQueues = 0;
    igilCmdQueue->m_controls.m_EnqueueMarkerScheduled = 0;
    igilCmdQueue->m_controls.m_SecondLevelBatchOffset = 0;
    igilCmdQueue->m_controls.m_TotalNumberOfQueues = 0;
    igilCmdQueue->m_controls.m_EventTimestampAddress = 0;
    igilCmdQueue->m_controls.m_ErrorCode = 0;
    igilCmdQueue->m_controls.m_CurrentScheduleEventNumber = 0;
    igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder = 0x00;
    igilCmdQueue->m_controls.m_DebugNextBlockGWS = 0;

    // set first stack element in surface at value "1", it protects Scheduler in corner case when StackTop is empty after Child execution
    // (the element written is the one at the initial m_StackTop index, i.e. the
    // last cl_uint of stackBuffer)
    auto stack = static_cast(stackBuffer->getUnderlyingBuffer());
    stack += ((stackBuffer->getUnderlyingBufferSize() / sizeof(cl_uint)) - 1);
    *stack = 1;

    igilCmdQueue->m_head = IGIL_DEVICE_QUEUE_HEAD_INIT;
    igilCmdQueue->m_size = static_cast(queueBuffer->getUnderlyingBufferSize() - sizeof(IGIL_CommandQueue));
    igilCmdQueue->m_magic = IGIL_MAGIC_NUMBER;

    igilCmdQueue->m_controls.m_SchedulerEarlyReturn = DebugManager.flags.SchedulerSimulationReturnInstance.get();
    igilCmdQueue->m_controls.m_SchedulerEarlyReturnCounter = 0;

    // Order matters: buildSlbDummyCommands() reads m_SLBENDoffsetInBytes to
    // decide how much of the SLB to regenerate, so it must run before the
    // field is reset to -1 below.
    buildSlbDummyCommands();

    igilCmdQueue->m_controls.m_SLBENDoffsetInBytes = -1;

    igilCmdQueue->m_controls.m_CriticalSection = ExecutionModelCriticalSection::Free;

    resetDSH();
}

// Returns the byte size of the fixed commands in one enqueue slot of the SLB.
// The terms follow the order in which buildSlbDummyCommands() emits them per
// loop iteration; workaround commands are not included here (callers add
// getWaCommandsSize() separately).
template size_t DeviceQueueHw::getMinimumSlbSize() {
    return sizeof(MEDIA_STATE_FLUSH) +
           sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) +
           sizeof(PIPE_CONTROL) +
           sizeof(GPGPU_WALKER) +
           sizeof(MEDIA_STATE_FLUSH) +
           sizeof(PIPE_CONTROL) +
           DeviceQueueHw::getCSPrefetchSize();
}

// Overwrites *pc with PIPE_CONTROL::sInit() and then enables state cache
// invalidation, DC flush, pipe control flush, texture cache invalidation and
// command streamer stall.
//
// pc: destination command; must point to writable PIPE_CONTROL storage.
template void DeviceQueueHw::initPipeControl(PIPE_CONTROL *pc) {
    *pc = PIPE_CONTROL::sInit();
    pc->setStateCacheInvalidationEnable(0x1);
    pc->setDcFlushEnable(true);
    pc->setPipeControlFlushEnable(true);
    pc->setTextureCacheInvalidationEnable(true);
    pc->setCommandStreamerStallEnable(true);
}

// (Re)writes the placeholder command sequence of the SLB into slbBuffer via
// slbCS, followed by an MI_BATCH_BUFFER_START whose address is the CPU pointer
// value of the start of slbBuffer, and clears the recorded cleanup section
// size/address.
//
// When m_SLBENDoffsetInBytes is >= 0 only zero or one enqueue slot, starting at
// that offset, is regenerated; otherwise all numberOfDeviceEnqueues slots are
// written.
template void DeviceQueueHw::buildSlbDummyCommands() {
    auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer());
    auto slbEndOffset = igilCmdQueue->m_controls.m_SLBENDoffsetInBytes;
    size_t commandsSize = getMinimumSlbSize() + getWaCommandsSize();
    size_t numEnqueues = numberOfDeviceEnqueues;

    // buildSlbDummyCommands is called from resetDeviceQueue() - reset slbCS each time
    slbCS.replaceBuffer(slbBuffer->getUnderlyingBuffer(), slbBuffer->getUnderlyingBufferSize());

    if (slbEndOffset >= 0) {
        DEBUG_BREAK_IF(slbEndOffset % commandsSize != 0);
        // We always overwrite at most one enqueue space with BB_START command pointing to cleanup section
        // if SLBENDoffset is at the end then BB_START added after scheduler did not corrupt anything so no need to regenerate
        // NOTE(review): "at the end" is tested as slbEndOffset == commandsSize
        // (one slot), not commandsSize * 128 - confirm this is intended.
        numEnqueues = (slbEndOffset == static_cast(commandsSize)) ? 0 : 1;
        // Skip ahead so the regenerated slot starts at slbEndOffset.
        slbCS.getSpace(slbEndOffset);
    }

    for (size_t i = 0; i < numEnqueues; i++) {
        auto mediaStateFlush = slbCS.getSpaceForCmd();
        *mediaStateFlush = MEDIA_STATE_FLUSH::sInit();

        addArbCheckCmdWa();

        addMiAtomicCmdWa((uint64_t)&igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder);

        auto mediaIdLoad = slbCS.getSpaceForCmd();
        *mediaIdLoad = MEDIA_INTERFACE_DESCRIPTOR_LOAD::sInit();
        mediaIdLoad->setInterfaceDescriptorTotalLength(2048);

        // Descriptor data starts after the colorCalcStateSize bytes at the
        // beginning of the heap, at the slot indexed by schedulerIDIndex.
        auto dataStartAddress = colorCalcStateSize;

        mediaIdLoad->setInterfaceDescriptorDataStartAddress(dataStartAddress + sizeof(INTERFACE_DESCRIPTOR_DATA) * schedulerIDIndex);

        addLriCmdWa(true);

        if (isProfilingEnabled()) {
            addPipeControlCmdWa();
            auto pipeControl = slbCS.getSpaceForCmd();
            initPipeControl(pipeControl);
        } else {
            // Without profiling the PIPE_CONTROL slot is zero-filled instead,
            // so the amount of space consumed is the same on both paths.
            auto noop = slbCS.getSpace(sizeof(PIPE_CONTROL));
            memset(noop, 0x0, sizeof(PIPE_CONTROL));
            addPipeControlCmdWa(true);
        }

        auto gpgpuWalker = slbCS.getSpaceForCmd();
        *gpgpuWalker = GPGPU_WALKER::sInit();
        gpgpuWalker->setSimdSize(GPGPU_WALKER::SIMD_SIZE::SIMD_SIZE_SIMD16);
        gpgpuWalker->setThreadGroupIdXDimension(1);
        gpgpuWalker->setThreadGroupIdYDimension(1);
        gpgpuWalker->setThreadGroupIdZDimension(1);
        gpgpuWalker->setRightExecutionMask(0xFFFFFFFF);
        gpgpuWalker->setBottomExecutionMask(0xFFFFFFFF);

        mediaStateFlush = slbCS.getSpaceForCmd();
        *mediaStateFlush = MEDIA_STATE_FLUSH::sInit();

        addArbCheckCmdWa();

        addPipeControlCmdWa();
        auto pipeControl2 = slbCS.getSpaceForCmd();
        initPipeControl(pipeControl2);

        addLriCmdWa(false);

        // Zero-filled padding of getCSPrefetchSize() bytes closes the slot.
        auto prefetch = slbCS.getSpace(getCSPrefetchSize());
        memset(prefetch, 0x0, getCSPrefetchSize());
    }

    // always the same BBStart position (after 128 enqueues)
    // Whatever has not been consumed above is skipped so that the command
    // below lands at byte offset commandsSize * 128.
    auto bbStartOffset = (commandsSize * 128) - slbCS.getUsed();
    slbCS.getSpace(bbStartOffset);

    auto bbStart = slbCS.getSpaceForCmd();
    *bbStart = MI_BATCH_BUFFER_START::sInit();
    auto slbPtr = reinterpret_cast(slbBuffer->getUnderlyingBuffer());
    bbStart->setBatchBufferStartAddressGraphicsaddress472(slbPtr);

    // NOTE(review): the two writes below go through igilCmdQueue and igilQueue
    // respectively; resetDeviceQueue() makes both point at the same structure
    // before calling this function.
    igilCmdQueue->m_controls.m_CleanupSectionSize = 0;
    igilQueue->m_controls.m_CleanupSectionAddress = 0;
}

// Appends the cleanup section to the SLB command stream (slbCS) and records
// its address and size in the queue controls.
//
// parentKernel: passed to applyWADisableLSQCROPERFforOCL(); dereferenced
//               unconditionally, so it must not be null.
// hwTimeStamp:  optional; when non-null, the address of its ContextCompleteTS
//               field is stored in m_EventTimestampAddress and profiling-end
//               commands plus addLriCmd(false) are emitted.
// taskCount:    immediate value written to the device tag address by the
//               second PIPE_CONTROL.
template void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) {
    // CleanUp Section
    // Pad the stream so the section starts on a page boundary, then remember
    // where it starts.
    auto offset = slbCS.getUsed();
    auto alignmentSize = alignUp(offset, MemoryConstants::pageSize) - offset;
    slbCS.getSpace(alignmentSize);
    offset = slbCS.getUsed();

    igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed());
    applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true);

    // NOTE(review): MI_STORE_REGISTER_MEM is aliased here but not used in this
    // function body.
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    if (hwTimeStamp != nullptr) {
        uint64_t TimeStampAddress = (uint64_t)((uintptr_t) & (hwTimeStamp->ContextCompleteTS));
        igilQueue->m_controls.m_EventTimestampAddress = TimeStampAddress;
        addProfilingEndCmds(TimeStampAddress);

        //enable preemption
        addLriCmd(false);
    }

    // First PIPE_CONTROL: stall, then write ExecutionModelCriticalSection::Free
    // to the address of m_CriticalSection.
    uint64_t criticalSectionAddress = (uint64_t)&igilQueue->m_controls.m_CriticalSection;

    addPipeControlCmdWa();

    auto pipeControl = slbCS.getSpaceForCmd();
    *pipeControl = PIPE_CONTROL::sInit();
    pipeControl->setCommandStreamerStallEnable(true);
    pipeControl->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);
    pipeControl->setAddressHigh(criticalSectionAddress >> 32);
    pipeControl->setAddress(criticalSectionAddress & (0xffffffff));
    pipeControl->setImmediateData(ExecutionModelCriticalSection::Free);

    // Second PIPE_CONTROL: stall, then write taskCount to the device tag
    // address.
    uint64_t tagAddress = (uint64_t)device->getTagAddress();

    addPipeControlCmdWa();

    auto pipeControl2 = slbCS.getSpaceForCmd();
    *pipeControl2 = PIPE_CONTROL::sInit();
    pipeControl2->setCommandStreamerStallEnable(true);
    pipeControl2->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);
    pipeControl2->setAddressHigh(tagAddress >> 32);
    pipeControl2->setAddress(tagAddress & (0xffffffff));
    pipeControl2->setImmediateData(taskCount);

    addMediaStateClearCmds();

    auto pBBE = slbCS.getSpaceForCmd();
    *pBBE = MI_BATCH_BUFFER_END::sInit();

    // Size is measured from the page-aligned start recorded above.
    igilQueue->m_controls.m_CleanupSectionSize = (uint32_t)(slbCS.getUsed() - offset);
}

// If the dynamic state heap object exists, rewinds it over its current CPU
// base and maximum available space, then re-reserves the leading
// colorCalcStateSize bytes. Does nothing when the heap has not been created.
template void DeviceQueueHw::resetDSH() {
    if (heaps[IndirectHeap::DYNAMIC_STATE]) {
        heaps[IndirectHeap::DYNAMIC_STATE]->replaceBuffer(heaps[IndirectHeap::DYNAMIC_STATE]->getCpuBase(),
                                                          heaps[IndirectHeap::DYNAMIC_STATE]->getMaxAvailableSpace());
        heaps[IndirectHeap::DYNAMIC_STATE]->getSpace(colorCalcStateSize);
    }
}

// Returns heaps[type], creating it on first use when type is DYNAMIC_STATE.
// For any other type nothing is created, so the returned pointer is whatever
// heaps[type] already held (null if never set).
//
// NOTE(review): the heap is allocated with a raw `new`; the matching delete is
// not visible in this file - presumably handled by the class destructor.
template IndirectHeap *DeviceQueueHw::getIndirectHeap(IndirectHeap::Type type) {
    if (!heaps[type]) {
        switch (type) {
        case IndirectHeap::DYNAMIC_STATE: {
            heaps[type] = new IndirectHeap(dshBuffer);
            // get space for colorCalc and 2 ID tables at the beginning
            heaps[type]->getSpace(colorCalcStateSize);
            break;
        }
        default:
            break;
        }
    }
    return heaps[type];
}

// Fills the queue controls describing the dynamic state heap layout and copies
// one interface descriptor per block kernel of the parent's program into the
// descriptor table located colorCalcStateSize bytes into dshBuffer.
//
// surfaceStateHeap: heap that receives each block's binding table and surface
//                   states; its used size afterwards determines m_BTbaseOffset.
// parentKernel:     provides the dynamic state heap size and the program's
//                   block kernel manager, and receives each block's binding
//                   table offset via setReflectionSurfaceBlockBtOffset().
// parentIDCount:    index of the first descriptor slot used for blocks; also
//                   stored as m_StartBlockID.
template void DeviceQueueHw::setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
    void *pDSH = dshBuffer->getUnderlyingBuffer();

    // Heap and dshBuffer should be the same if heap is created
    DEBUG_BREAK_IF(!((heaps[IndirectHeap::DYNAMIC_STATE] == nullptr) || (heaps[IndirectHeap::DYNAMIC_STATE]->getCpuBase() == pDSH)));

    // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries.
    auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer());
    igilCmdQueue->m_controls.m_IDTstart = colorCalcStateSize + sizeof(INTERFACE_DESCRIPTOR_DATA) * (interfaceDescriptorEntries - 2);

    // Parent's dsh is located after ColorCalcState and 2 ID tables
    igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp((uint32_t)parentKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
    igilCmdQueue->m_controls.m_DynamicHeapSizeInBytes = (uint32_t)dshBuffer->getUnderlyingBufferSize();
    igilCmdQueue->m_controls.m_CurrentDSHoffset = igilCmdQueue->m_controls.m_DynamicHeapStart;
    igilCmdQueue->m_controls.m_ParentDSHOffset = offsetDsh;

    uint32_t blockIndex = parentIDCount;

    // Descriptor table begins right after the colorCalcStateSize bytes.
    pDSH = ptrOffset(pDSH, colorCalcStateSize);

    INTERFACE_DESCRIPTOR_DATA *pIDDestination = static_cast(pDSH);

    BlockKernelManager *blockManager = parentKernel->getProgram()->getBlockKernelManager();
    uint32_t blockCount = static_cast(blockManager->getCount());

    uint32_t maxBindingTableCount = 0;
    // NOTE(review): totalBlockSSHSize is accumulated in the loop below but is
    // not read anywhere in this function.
    uint32_t totalBlockSSHSize = 0;

    igilCmdQueue->m_controls.m_StartBlockID = blockIndex;

    for (uint32_t i = 0; i < blockCount; i++) {
        const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);

        auto blockAllocation = pBlockInfo->getGraphicsAllocation();
        DEBUG_BREAK_IF(!blockAllocation);

        // Falls back to address 0 when the block has no allocation (only
        // flagged by the debug break above).
        auto gpuAddress = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu;

        // NOTE(review): patchInfo.bindingTableState is dereferenced without a
        // null check - presumably every block kernel has one; confirm.
        auto bindingTableCount = pBlockInfo->patchInfo.bindingTableState->Count;
        maxBindingTableCount = std::max(maxBindingTableCount, bindingTableCount);

        totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

        auto btOffset = KernelCommandsHelper::pushBindingTableAndSurfaceStates(surfaceStateHeap, *pBlockInfo);

        parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast(btOffset));

        // Determine SIMD size
        uint32_t simd = pBlockInfo->getMaxSimdSize();
        DEBUG_BREAK_IF(pBlockInfo->patchInfo.interfaceDescriptorData == nullptr);

        uint32_t idOffset = pBlockInfo->patchInfo.interfaceDescriptorData->Offset;
        const INTERFACE_DESCRIPTOR_DATA *pBlockID = static_cast(ptrOffset(pBlockInfo->heapInfo.pDsh, idOffset));

        // Start from the block's own descriptor, then patch the fields below.
        pIDDestination[blockIndex + i] = *pBlockID;
        pIDDestination[blockIndex + i].setKernelStartPointerHigh(gpuAddress >> 32);
        pIDDestination[blockIndex + i].setKernelStartPointer((uint32_t)gpuAddress);
        pIDDestination[blockIndex + i].setBarrierEnable(pBlockInfo->patchInfo.executionEnvironment->HasBarriers > 0);
        pIDDestination[blockIndex + i].setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);

        // Set offset to sampler states, block's DHSOffset is added by scheduler
        pIDDestination[blockIndex + i].setSamplerStatePointer(static_cast(pBlockInfo->getBorderColorStateSize()));

        auto threadPayload = pBlockInfo->patchInfo.threadPayload;
        DEBUG_BREAK_IF(nullptr == threadPayload);

        auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
        auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels);
        auto numGrfPerThreadData = static_cast(sizePerThreadData / sizeof(GRF));

        // HW requires a minimum of 1 GRF of perThreadData for each thread in a thread group
        // when sizeCrossThreadData != 0
        numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
        pIDDestination[blockIndex + i].setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
    }

    igilCmdQueue->m_controls.m_BTmaxSize = alignUp(maxBindingTableCount * (uint32_t)sizeof(BINDING_TABLE_STATE), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE);
    igilCmdQueue->m_controls.m_BTbaseOffset = alignUp((uint32_t)surfaceStateHeap.getUsed(), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE);
    igilCmdQueue->m_controls.m_CurrentSSHoffset = igilCmdQueue->m_controls.m_BTbaseOffset;
}

// Computes the offset reserved for the scheduler's cross-thread data as
// (dshBuffer size - scheduler curbe size - 4096), stores that offset and the
// curbe size in the queue controls, and returns the offset.
//
// NOTE(review): the subtraction is stored in a size_t, so it would wrap if the
// buffer were smaller than curbe size + 4096 - presumably ruled out by how
// dshBuffer is sized elsewhere. The INTERFACE_DESCRIPTOR_DATA alias below is
// not used in this function body.
template size_t DeviceQueueHw::setSchedulerCrossThreadData(SchedulerKernel &scheduler) {
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    size_t offset = dshBuffer->getUnderlyingBufferSize() - scheduler.getCurbeSize() - 4096; // Page size padding

    auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer());
    igilCmdQueue->m_controls.m_SchedulerDSHOffset = (uint32_t)offset;
    igilCmdQueue->m_controls.m_SchedulerConstantBufferSize = (uint32_t)scheduler.getCurbeSize();

    return offset;
}

// Forwards to the free function OCLRT::dispatchScheduler, passing this device
// queue as its second argument. Note the argument order differs from this
// method's parameter order (preemptionMode precedes scheduler in the callee).
template void DeviceQueueHw::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) {
    OCLRT::dispatchScheduler(cmdQ, *this, preemptionMode, scheduler);
    return; // NOTE(review): redundant at the end of a void function
}

// Returns the number of bytes (512) that buildSlbDummyCommands() reserves and
// zero-fills at the end of every enqueue slot.
template size_t DeviceQueueHw::getCSPrefetchSize() {
    return 512;
}

// Emits an MI_LOAD_REGISTER_IMM into slbCS targeting register offset 0x2248.
//
// setArbCheck: true writes 0x00000100 (bit 8 only); false writes 0.
template void DeviceQueueHw::addLriCmd(bool setArbCheck) {
    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
    auto lri = slbCS.getSpaceForCmd();
    *lri = MI_LOAD_REGISTER_IMM::sInit();
    lri->setRegisterOffset(0x2248); // CTXT_PREMP_DBG offset
    if (setArbCheck)
        lri->setDataDword(0x00000100); // set only bit 8 (Preempt On MI_ARB_CHK Only)
    else
        lri->setDataDword(0x0);
}

// Emits, into slbCS: the pipe-control workaround, a PIPE_CONTROL with generic
// media state clear and command streamer stall enabled, and then the VFE state
// programmed by PreambleHelper::programVFEState().
//
// NOTE(review): the MEDIA_VFE_STATE typedef is not used in this function body.
template void DeviceQueueHw::addMediaStateClearCmds() {
    typedef typename GfxFamily::MEDIA_VFE_STATE MEDIA_VFE_STATE;

    addPipeControlCmdWa();

    auto pipeControl = slbCS.getSpaceForCmd();
    *pipeControl = PIPE_CONTROL::sInit();
    pipeControl->setGenericMediaStateClear(true);
    pipeControl->setCommandStreamerStallEnable(true);

    PreambleHelper::programVFEState(&slbCS, device->getHardwareInfo(), 0, 0);
}

// Returns the byte budget for addMediaStateClearCmds(): three PIPE_CONTROLs
// plus one MEDIA_VFE_STATE.
template size_t DeviceQueueHw::getMediaStateClearCmdsSize() {
    // PC with GenericMediaStateClear + WA PC
    size_t size = 2 * sizeof(PIPE_CONTROL);

    // VFE state cmds
    size += sizeof(PIPE_CONTROL);
    size += sizeof(MEDIA_VFE_STATE);
    return size;
}

// Returns the byte budget that allocateSlbBuffer() reserves for the cleanup
// section: a fixed group of register/math commands, the profiling-end and
// media-state-clear budgets, four PIPE_CONTROLs and one MI_BATCH_BUFFER_END.
// Presumably an upper bound on what addExecutionModelCleanUpSection() emits -
// TODO confirm against the workaround helpers not visible here.
template size_t DeviceQueueHw::getExecutionModelCleanupSectionSize() {
    size_t totalSize = 0;
    totalSize += sizeof(PIPE_CONTROL) +
                 2 * sizeof(MI_LOAD_REGISTER_REG) +
                 sizeof(MI_LOAD_REGISTER_IMM) +
                 sizeof(PIPE_CONTROL) +
                 sizeof(MI_MATH) +
                 NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);

    totalSize += getProfilingEndCmdsSize();
    totalSize += getMediaStateClearCmdsSize();

    totalSize += 4 * sizeof(PIPE_CONTROL);
    totalSize += sizeof(MI_BATCH_BUFFER_END);
    return totalSize;
}

// Returns the byte budget for the profiling-end commands: one PIPE_CONTROL,
// one MI_STORE_REGISTER_MEM and one MI_LOAD_REGISTER_IMM.
template size_t DeviceQueueHw::getProfilingEndCmdsSize() {
    size_t size = 0;
    size += sizeof(PIPE_CONTROL) + sizeof(MI_STORE_REGISTER_MEM);
    size += sizeof(MI_LOAD_REGISTER_IMM);
    return size;
}

} // namespace OCLRT