diff --git a/runtime/device_queue/CMakeLists.txt b/runtime/device_queue/CMakeLists.txt index 9b3bba7edf..54c3031036 100644 --- a/runtime/device_queue/CMakeLists.txt +++ b/runtime/device_queue/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2018 Intel Corporation +# Copyright (C) 2018-2019 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -9,8 +9,10 @@ set(RUNTIME_SRCS_DEVICE_QUEUE ${CMAKE_CURRENT_SOURCE_DIR}/device_queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_queue.h ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw.h - ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw.inl + ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw_base.inl + ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw_bdw_plus.inl ${CMAKE_CURRENT_SOURCE_DIR}/device_queue_hw_profiling.inl ) target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_DEVICE_QUEUE}) set_property(GLOBAL PROPERTY RUNTIME_SRCS_DEVICE_QUEUE ${RUNTIME_SRCS_DEVICE_QUEUE}) +add_subdirectories() diff --git a/runtime/device_queue/device_queue_hw.inl b/runtime/device_queue/device_queue_hw_base.inl similarity index 50% rename from runtime/device_queue/device_queue_hw.inl rename to runtime/device_queue/device_queue_hw_base.inl index 8f80d06cac..57421eb368 100644 --- a/runtime/device_queue/device_queue_hw.inl +++ b/runtime/device_queue/device_queue_hw_base.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2019 Intel Corporation + * Copyright (C) 2019 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -88,21 +88,6 @@ void DeviceQueueHw::resetDeviceQueue() { resetDSH(); } -template -size_t DeviceQueueHw::getMinimumSlbSize() { - using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; - using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD; - using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; - - return sizeof(MEDIA_STATE_FLUSH) + - sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + - sizeof(PIPE_CONTROL) + - sizeof(GPGPU_WALKER) + - sizeof(MEDIA_STATE_FLUSH) + - sizeof(PIPE_CONTROL) + - DeviceQueueHw::getCSPrefetchSize(); -} - template void DeviceQueueHw::initPipeControl(PIPE_CONTROL *pc) { *pc = GfxFamily::cmdInitPipeControl; @@ -113,95 +98,6 @@ void DeviceQueueHw::initPipeControl(PIPE_CONTROL *pc) { pc->setCommandStreamerStallEnable(true); } -template -void DeviceQueueHw::buildSlbDummyCommands() { - using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; - using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD; - using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; - - auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer()); - auto slbEndOffset = igilCmdQueue->m_controls.m_SLBENDoffsetInBytes; - size_t commandsSize = getMinimumSlbSize() + getWaCommandsSize(); - size_t numEnqueues = numberOfDeviceEnqueues; - - // buildSlbDummyCommands is called from resetDeviceQueue() - reset slbCS each time - slbCS.replaceBuffer(slbBuffer->getUnderlyingBuffer(), slbBuffer->getUnderlyingBufferSize()); - - if (slbEndOffset >= 0) { - DEBUG_BREAK_IF(slbEndOffset % commandsSize != 0); - //We always overwrite at most one enqueue space with BB_START command pointing to cleanup section - //if SLBENDoffset is the at the end then BB_START added after scheduler did not corrupt anything so no need to regenerate - numEnqueues = (slbEndOffset == static_cast(commandsSize)) ? 0 : 1; - slbCS.getSpace(slbEndOffset); - } - - for (size_t i = 0; i < numEnqueues; i++) { - auto mediaStateFlush = slbCS.getSpaceForCmd(); - *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush; - - addArbCheckCmdWa(); - - addMiAtomicCmdWa((uint64_t)&igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder); - - auto mediaIdLoad = slbCS.getSpaceForCmd(); - *mediaIdLoad = GfxFamily::cmdInitMediaInterfaceDescriptorLoad; - mediaIdLoad->setInterfaceDescriptorTotalLength(2048); - - auto dataStartAddress = colorCalcStateSize; - - mediaIdLoad->setInterfaceDescriptorDataStartAddress(dataStartAddress + sizeof(INTERFACE_DESCRIPTOR_DATA) * schedulerIDIndex); - - addLriCmdWa(true); - - if (isProfilingEnabled()) { - addPipeControlCmdWa(); - auto pipeControl = slbCS.getSpaceForCmd(); - initPipeControl(pipeControl); - - } else { - auto noop = slbCS.getSpace(sizeof(PIPE_CONTROL)); - memset(noop, 0x0, sizeof(PIPE_CONTROL)); - addPipeControlCmdWa(true); - } - - auto gpgpuWalker = slbCS.getSpaceForCmd(); - *gpgpuWalker = GfxFamily::cmdInitGpgpuWalker; - gpgpuWalker->setSimdSize(GPGPU_WALKER::SIMD_SIZE::SIMD_SIZE_SIMD16); - gpgpuWalker->setThreadGroupIdXDimension(1); - gpgpuWalker->setThreadGroupIdYDimension(1); - gpgpuWalker->setThreadGroupIdZDimension(1); - gpgpuWalker->setRightExecutionMask(0xFFFFFFFF); - gpgpuWalker->setBottomExecutionMask(0xFFFFFFFF); - - mediaStateFlush = slbCS.getSpaceForCmd(); - *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush; - - addArbCheckCmdWa(); - - addPipeControlCmdWa(); - - auto pipeControl2 = slbCS.getSpaceForCmd(); - initPipeControl(pipeControl2); - - addLriCmdWa(false); - - auto prefetch = slbCS.getSpace(getCSPrefetchSize()); - memset(prefetch, 0x0, getCSPrefetchSize()); - } - - // always the same BBStart position (after 128 enqueues) - auto bbStartOffset = (commandsSize * 128) - slbCS.getUsed(); - slbCS.getSpace(bbStartOffset); - - auto bbStart = slbCS.getSpaceForCmd(); - *bbStart = GfxFamily::cmdInitBatchBufferStart; - auto slbPtr = reinterpret_cast(slbBuffer->getUnderlyingBuffer()); - bbStart->setBatchBufferStartAddressGraphicsaddress472(slbPtr); - - igilCmdQueue->m_controls.m_CleanupSectionSize = 0; - igilQueue->m_controls.m_CleanupSectionAddress = 0; -} - template void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint32_t taskCount) { // CleanUp Section @@ -271,88 +167,6 @@ IndirectHeap *DeviceQueueHw::getIndirectHeap(IndirectHeap::Type type) return heaps[type]; } -template -void DeviceQueueHw::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) { - using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; - void *pDSH = dynamicStateHeap.getCpuBase(); - - // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries. - auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer()); - igilCmdQueue->m_controls.m_IDTstart = colorCalcStateSize + sizeof(INTERFACE_DESCRIPTOR_DATA) * (interfaceDescriptorEntries - 2); - - // Parent's dsh is located after ColorCalcState and 2 ID tables - igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp((uint32_t)parentKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); - igilCmdQueue->m_controls.m_DynamicHeapSizeInBytes = (uint32_t)dshBuffer->getUnderlyingBufferSize(); - - igilCmdQueue->m_controls.m_CurrentDSHoffset = igilCmdQueue->m_controls.m_DynamicHeapStart; - igilCmdQueue->m_controls.m_ParentDSHOffset = offsetDsh; - - uint32_t blockIndex = parentIDCount; - - pDSH = ptrOffset(pDSH, colorCalcStateSize); - - INTERFACE_DESCRIPTOR_DATA *pIDDestination = static_cast(pDSH); - - BlockKernelManager *blockManager = parentKernel->getProgram()->getBlockKernelManager(); - uint32_t blockCount = static_cast(blockManager->getCount()); - - uint32_t maxBindingTableCount = 0; - uint32_t totalBlockSSHSize = 0; - - igilCmdQueue->m_controls.m_StartBlockID = blockIndex; - - for (uint32_t i = 0; i < blockCount; i++) { - const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i); - - auto blockAllocation = pBlockInfo->getGraphicsAllocation(); - DEBUG_BREAK_IF(!blockAllocation); - - auto gpuAddress = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu; - - auto bindingTableCount = pBlockInfo->patchInfo.bindingTableState->Count; - maxBindingTableCount = std::max(maxBindingTableCount, bindingTableCount); - - totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); - - auto btOffset = KernelCommandsHelper::pushBindingTableAndSurfaceStates(surfaceStateHeap, *pBlockInfo); - - parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast(btOffset)); - - // Determine SIMD size - uint32_t simd = pBlockInfo->getMaxSimdSize(); - DEBUG_BREAK_IF(pBlockInfo->patchInfo.interfaceDescriptorData == nullptr); - - uint32_t idOffset = pBlockInfo->patchInfo.interfaceDescriptorData->Offset; - const INTERFACE_DESCRIPTOR_DATA *pBlockID = static_cast(ptrOffset(pBlockInfo->heapInfo.pDsh, idOffset)); - - pIDDestination[blockIndex + i] = *pBlockID; - pIDDestination[blockIndex + i].setKernelStartPointerHigh(gpuAddress >> 32); - pIDDestination[blockIndex + i].setKernelStartPointer((uint32_t)gpuAddress); - pIDDestination[blockIndex + i].setBarrierEnable(pBlockInfo->patchInfo.executionEnvironment->HasBarriers > 0); - pIDDestination[blockIndex + i].setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL); - - // Set offset to sampler states, block's DHSOffset is added by scheduler - pIDDestination[blockIndex + i].setSamplerStatePointer(static_cast(pBlockInfo->getBorderColorStateSize())); - - auto threadPayload = pBlockInfo->patchInfo.threadPayload; - DEBUG_BREAK_IF(nullptr == threadPayload); - - auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); - auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels); - - auto numGrfPerThreadData = static_cast(sizePerThreadData / sizeof(GRF)); - - // HW requires a minimum of 1 GRF of perThreadData for each thread in a thread group - // when sizeCrossThreadData != 0 - numGrfPerThreadData = std::max(numGrfPerThreadData, 1u); - pIDDestination[blockIndex + i].setConstantIndirectUrbEntryReadLength(numGrfPerThreadData); - } - - igilCmdQueue->m_controls.m_BTmaxSize = alignUp(maxBindingTableCount * (uint32_t)sizeof(BINDING_TABLE_STATE), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE); - igilCmdQueue->m_controls.m_BTbaseOffset = alignUp((uint32_t)surfaceStateHeap.getUsed(), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE); - igilCmdQueue->m_controls.m_CurrentSSHoffset = igilCmdQueue->m_controls.m_BTbaseOffset; -} - template size_t DeviceQueueHw::setSchedulerCrossThreadData(SchedulerKernel &scheduler) { using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA; @@ -393,34 +207,6 @@ void DeviceQueueHw::addLriCmd(bool setArbCheck) { lri->setDataDword(0x0); } -template -void DeviceQueueHw::addMediaStateClearCmds() { - typedef typename GfxFamily::MEDIA_VFE_STATE MEDIA_VFE_STATE; - - addPipeControlCmdWa(); - - auto pipeControl = slbCS.getSpaceForCmd(); - *pipeControl = GfxFamily::cmdInitPipeControl; - pipeControl->setGenericMediaStateClear(true); - pipeControl->setCommandStreamerStallEnable(true); - - addDcFlushToPipeControlWa(pipeControl); - - PreambleHelper::programVFEState(&slbCS, device->getHardwareInfo(), 0, 0); -} - -template -size_t DeviceQueueHw::getMediaStateClearCmdsSize() { - using MEDIA_VFE_STATE = typename GfxFamily::MEDIA_VFE_STATE; - // PC with GenreicMediaStateClear + WA PC - size_t size = 2 * sizeof(PIPE_CONTROL); - - // VFE state cmds - size += sizeof(PIPE_CONTROL); - size += sizeof(MEDIA_VFE_STATE); - return size; -} - template size_t DeviceQueueHw::getExecutionModelCleanupSectionSize() { size_t totalSize = 0; diff --git a/runtime/device_queue/device_queue_hw_bdw_plus.inl b/runtime/device_queue/device_queue_hw_bdw_plus.inl new file mode 100644 index 0000000000..daaadc5025 --- /dev/null +++ b/runtime/device_queue/device_queue_hw_bdw_plus.inl @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2019 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "runtime/device_queue/device_queue_hw_base.inl" + +namespace NEO { + +template +size_t DeviceQueueHw::getMinimumSlbSize() { + using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; + using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD; + using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; + + return sizeof(MEDIA_STATE_FLUSH) + + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + + sizeof(PIPE_CONTROL) + + sizeof(GPGPU_WALKER) + + sizeof(MEDIA_STATE_FLUSH) + + sizeof(PIPE_CONTROL) + + DeviceQueueHw::getCSPrefetchSize(); +} + +template +void DeviceQueueHw::buildSlbDummyCommands() { + using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; + using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD; + using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; + + auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer()); + auto slbEndOffset = igilCmdQueue->m_controls.m_SLBENDoffsetInBytes; + size_t commandsSize = getMinimumSlbSize() + getWaCommandsSize(); + size_t numEnqueues = numberOfDeviceEnqueues; + + // buildSlbDummyCommands is called from resetDeviceQueue() - reset slbCS each time + slbCS.replaceBuffer(slbBuffer->getUnderlyingBuffer(), slbBuffer->getUnderlyingBufferSize()); + + if (slbEndOffset >= 0) { + DEBUG_BREAK_IF(slbEndOffset % commandsSize != 0); + //We always overwrite at most one enqueue space with BB_START command pointing to cleanup section + //if SLBENDoffset is the at the end then BB_START added after scheduler did not corrupt anything so no need to regenerate + numEnqueues = (slbEndOffset == static_cast(commandsSize)) ? 0 : 1; + slbCS.getSpace(slbEndOffset); + } + + for (size_t i = 0; i < numEnqueues; i++) { + auto mediaStateFlush = slbCS.getSpaceForCmd(); + *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush; + + addArbCheckCmdWa(); + + addMiAtomicCmdWa((uint64_t)&igilCmdQueue->m_controls.m_DummyAtomicOperationPlaceholder); + + auto mediaIdLoad = slbCS.getSpaceForCmd(); + *mediaIdLoad = GfxFamily::cmdInitMediaInterfaceDescriptorLoad; + mediaIdLoad->setInterfaceDescriptorTotalLength(2048); + + auto dataStartAddress = colorCalcStateSize; + + mediaIdLoad->setInterfaceDescriptorDataStartAddress(dataStartAddress + sizeof(INTERFACE_DESCRIPTOR_DATA) * schedulerIDIndex); + + addLriCmdWa(true); + + if (isProfilingEnabled()) { + addPipeControlCmdWa(); + auto pipeControl = slbCS.getSpaceForCmd(); + initPipeControl(pipeControl); + + } else { + auto noop = slbCS.getSpace(sizeof(PIPE_CONTROL)); + memset(noop, 0x0, sizeof(PIPE_CONTROL)); + addPipeControlCmdWa(true); + } + + auto gpgpuWalker = slbCS.getSpaceForCmd(); + *gpgpuWalker = GfxFamily::cmdInitGpgpuWalker; + gpgpuWalker->setSimdSize(GPGPU_WALKER::SIMD_SIZE::SIMD_SIZE_SIMD16); + gpgpuWalker->setThreadGroupIdXDimension(1); + gpgpuWalker->setThreadGroupIdYDimension(1); + gpgpuWalker->setThreadGroupIdZDimension(1); + gpgpuWalker->setRightExecutionMask(0xFFFFFFFF); + gpgpuWalker->setBottomExecutionMask(0xFFFFFFFF); + + mediaStateFlush = slbCS.getSpaceForCmd(); + *mediaStateFlush = GfxFamily::cmdInitMediaStateFlush; + + addArbCheckCmdWa(); + + addPipeControlCmdWa(); + + auto pipeControl2 = slbCS.getSpaceForCmd(); + initPipeControl(pipeControl2); + + addLriCmdWa(false); + + auto prefetch = slbCS.getSpace(getCSPrefetchSize()); + memset(prefetch, 0x0, getCSPrefetchSize()); + } + + // always the same BBStart position (after 128 enqueues) + auto bbStartOffset = (commandsSize * 128) - slbCS.getUsed(); + slbCS.getSpace(bbStartOffset); + + auto bbStart = slbCS.getSpaceForCmd(); + *bbStart = GfxFamily::cmdInitBatchBufferStart; + auto slbPtr = reinterpret_cast(slbBuffer->getUnderlyingBuffer()); + bbStart->setBatchBufferStartAddressGraphicsaddress472(slbPtr); + + igilCmdQueue->m_controls.m_CleanupSectionSize = 0; + igilQueue->m_controls.m_CleanupSectionAddress = 0; +} + +template +void DeviceQueueHw::addMediaStateClearCmds() { + typedef typename GfxFamily::MEDIA_VFE_STATE MEDIA_VFE_STATE; + + addPipeControlCmdWa(); + + auto pipeControl = slbCS.getSpaceForCmd(); + *pipeControl = GfxFamily::cmdInitPipeControl; + pipeControl->setGenericMediaStateClear(true); + pipeControl->setCommandStreamerStallEnable(true); + + addDcFlushToPipeControlWa(pipeControl); + + PreambleHelper::programVFEState(&slbCS, device->getHardwareInfo(), 0, 0); +} + +template +size_t DeviceQueueHw::getMediaStateClearCmdsSize() { + using MEDIA_VFE_STATE = typename GfxFamily::MEDIA_VFE_STATE; + // PC with GenreicMediaStateClear + WA PC + size_t size = 2 * sizeof(PIPE_CONTROL); + + // VFE state cmds + size += sizeof(PIPE_CONTROL); + size += sizeof(MEDIA_VFE_STATE); + return size; +} + +template +void DeviceQueueHw::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) { + using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; + void *pDSH = dynamicStateHeap.getCpuBase(); + + // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries. + auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer()); + igilCmdQueue->m_controls.m_IDTstart = colorCalcStateSize + sizeof(INTERFACE_DESCRIPTOR_DATA) * (interfaceDescriptorEntries - 2); + + // Parent's dsh is located after ColorCalcState and 2 ID tables + igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp((uint32_t)parentKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); + igilCmdQueue->m_controls.m_DynamicHeapSizeInBytes = (uint32_t)dshBuffer->getUnderlyingBufferSize(); + + igilCmdQueue->m_controls.m_CurrentDSHoffset = igilCmdQueue->m_controls.m_DynamicHeapStart; + igilCmdQueue->m_controls.m_ParentDSHOffset = offsetDsh; + + uint32_t blockIndex = parentIDCount; + + pDSH = ptrOffset(pDSH, colorCalcStateSize); + + INTERFACE_DESCRIPTOR_DATA *pIDDestination = static_cast(pDSH); + + BlockKernelManager *blockManager = parentKernel->getProgram()->getBlockKernelManager(); + uint32_t blockCount = static_cast(blockManager->getCount()); + + uint32_t maxBindingTableCount = 0; + uint32_t totalBlockSSHSize = 0; + + igilCmdQueue->m_controls.m_StartBlockID = blockIndex; + + for (uint32_t i = 0; i < blockCount; i++) { + const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i); + + auto blockAllocation = pBlockInfo->getGraphicsAllocation(); + DEBUG_BREAK_IF(!blockAllocation); + + auto gpuAddress = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu; + + auto bindingTableCount = pBlockInfo->patchInfo.bindingTableState->Count; + maxBindingTableCount = std::max(maxBindingTableCount, bindingTableCount); + + totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + + auto btOffset = KernelCommandsHelper::pushBindingTableAndSurfaceStates(surfaceStateHeap, *pBlockInfo); + + parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast(btOffset)); + + // Determine SIMD size + uint32_t simd = pBlockInfo->getMaxSimdSize(); + DEBUG_BREAK_IF(pBlockInfo->patchInfo.interfaceDescriptorData == nullptr); + + uint32_t idOffset = pBlockInfo->patchInfo.interfaceDescriptorData->Offset; + const INTERFACE_DESCRIPTOR_DATA *pBlockID = static_cast(ptrOffset(pBlockInfo->heapInfo.pDsh, idOffset)); + + pIDDestination[blockIndex + i] = *pBlockID; + pIDDestination[blockIndex + i].setKernelStartPointerHigh(gpuAddress >> 32); + pIDDestination[blockIndex + i].setKernelStartPointer((uint32_t)gpuAddress); + pIDDestination[blockIndex + i].setBarrierEnable(pBlockInfo->patchInfo.executionEnvironment->HasBarriers > 0); + pIDDestination[blockIndex + i].setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL); + + // Set offset to sampler states, block's DHSOffset is added by scheduler + pIDDestination[blockIndex + i].setSamplerStatePointer(static_cast(pBlockInfo->getBorderColorStateSize())); + + auto threadPayload = pBlockInfo->patchInfo.threadPayload; + DEBUG_BREAK_IF(nullptr == threadPayload); + + auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); + auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels); + + auto numGrfPerThreadData = static_cast(sizePerThreadData / sizeof(GRF)); + + // HW requires a minimum of 1 GRF of perThreadData for each thread in a thread group + // when sizeCrossThreadData != 0 + numGrfPerThreadData = std::max(numGrfPerThreadData, 1u); + pIDDestination[blockIndex + i].setConstantIndirectUrbEntryReadLength(numGrfPerThreadData); + } + + igilCmdQueue->m_controls.m_BTmaxSize = alignUp(maxBindingTableCount * (uint32_t)sizeof(BINDING_TABLE_STATE), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE); + igilCmdQueue->m_controls.m_BTbaseOffset = alignUp((uint32_t)surfaceStateHeap.getUsed(), INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER::BINDINGTABLEPOINTER_ALIGN_SIZE); + igilCmdQueue->m_controls.m_CurrentSSHoffset = igilCmdQueue->m_controls.m_BTbaseOffset; +} + +} // namespace NEO diff --git a/runtime/device_queue/device_queue_hw_profiling.inl b/runtime/device_queue/device_queue_hw_profiling.inl index 60182fc2a8..cedef66f69 100644 --- a/runtime/device_queue/device_queue_hw_profiling.inl +++ b/runtime/device_queue/device_queue_hw_profiling.inl @@ -5,6 +5,9 @@ * */ +#include "runtime/command_queue/gpgpu_walker.h" +#include "runtime/device_queue/device_queue_hw.h" + namespace NEO { template diff --git a/runtime/gen10/device_queue_gen10.cpp b/runtime/gen10/device_queue_gen10.cpp index 92e6455112..1414e4fb38 100644 --- a/runtime/gen10/device_queue_gen10.cpp +++ b/runtime/gen10/device_queue_gen10.cpp @@ -6,7 +6,7 @@ */ #include "runtime/device_queue/device_queue_hw.h" -#include "runtime/device_queue/device_queue_hw.inl" +#include "runtime/device_queue/device_queue_hw_bdw_plus.inl" #include "runtime/device_queue/device_queue_hw_profiling.inl" #include "runtime/gen10/hw_cmds.h" diff --git a/runtime/gen11/device_queue_gen11.cpp b/runtime/gen11/device_queue_gen11.cpp index 75973f3136..874e5b7f4e 100644 --- a/runtime/gen11/device_queue_gen11.cpp +++ b/runtime/gen11/device_queue_gen11.cpp @@ -6,7 +6,7 @@ */ #include "runtime/device_queue/device_queue_hw.h" -#include "runtime/device_queue/device_queue_hw.inl" +#include "runtime/device_queue/device_queue_hw_bdw_plus.inl" #include "runtime/device_queue/device_queue_hw_profiling.inl" #include "runtime/gen11/device_enqueue.h" #include "runtime/gen11/hw_cmds.h" diff --git a/runtime/gen8/device_queue_gen8.cpp b/runtime/gen8/device_queue_gen8.cpp index 2208451241..5392411156 100644 --- a/runtime/gen8/device_queue_gen8.cpp +++ b/runtime/gen8/device_queue_gen8.cpp @@ -6,7 +6,7 @@ */ #include "runtime/device_queue/device_queue_hw.h" -#include "runtime/device_queue/device_queue_hw.inl" +#include "runtime/device_queue/device_queue_hw_bdw_plus.inl" #include "runtime/gen8/hw_cmds.h" namespace NEO { diff --git a/runtime/gen9/device_queue_gen9.cpp b/runtime/gen9/device_queue_gen9.cpp index 9218117c3e..b50ae885b4 100644 --- a/runtime/gen9/device_queue_gen9.cpp +++ b/runtime/gen9/device_queue_gen9.cpp @@ -6,7 +6,7 @@ */ #include "runtime/device_queue/device_queue_hw.h" -#include "runtime/device_queue/device_queue_hw.inl" +#include "runtime/device_queue/device_queue_hw_bdw_plus.inl" #include "runtime/device_queue/device_queue_hw_profiling.inl" #include "runtime/gen9/hw_cmds.h"