/*
 * Copyright (c) 2017, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#pragma once
#include "runtime/context/context.h"
#include "runtime/gen9/gen9_cmd_def.h"
#include "runtime/command_queue/local_id_gen.h"
#include "runtime/command_queue/command_queue.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/device/device_info.h"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/event/perf_counter.h"
#include "runtime/event/user_event.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/debug_helpers.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "runtime/helpers/validators.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/kernel/kernel.h"
#include "runtime/mem_obj/mem_obj.h"
#include "runtime/memory_manager/graphics_allocation.h"
#include <algorithm>
#include <cstdint>

namespace OCLRT {

void computeWorkgroupSize1D(
    uint32_t maxWorkGroupSize,
    size_t workGroupSize[3],
    const size_t workItems[3],
    size_t simdSize);

void computeWorkgroupSizeND(
    WorkSizeInfo wsInfo,
    size_t workGroupSize[3],
    const size_t workItems[3],
    const uint32_t workDim);

void computeWorkgroupSize2D(
    uint32_t maxWorkGroupSize,
    size_t workGroupSize[3],
    const size_t workItems[3],
    size_t simdSize);

void computeWorkgroupSizeSquared(
    uint32_t maxWorkGroupSize,
    size_t workGroupSize[3],
    const size_t workItems[3],
    size_t simdSize,
    const uint32_t workDim);

Vec3<size_t> computeWorkgroupSize(
    const DispatchInfo &dispatchInfo);

Vec3<size_t> generateWorkgroupSize(
    const DispatchInfo &dispatchInfo);

Vec3<size_t> computeWorkgroupsNumber(
    const Vec3<size_t> gws,
    const Vec3<size_t> lws);

Vec3<size_t> generateWorkgroupsNumber(
    const Vec3<size_t> gws,
    const Vec3<size_t> lws);

Vec3<size_t> generateWorkgroupsNumber(
    const DispatchInfo &dispatchInfo);

Vec3<size_t> canonizeWorkgroup(
    Vec3<size_t> workgroup);

inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
    return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()));
}

template <typename GfxFamily>
inline size_t setGpgpuWalkerThreadData(
    typename GfxFamily::GPGPU_WALKER *pCmd,
    const size_t globalOffsets[3],
    const size_t startWorkGroups[3],
    const size_t numWorkGroups[3],
    const size_t localWorkSizesIn[3],
    uint32_t simd) {
    typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;

    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
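
    // localWorkSize is the flattened work-group size; each hardware thread covers `simd`
    // work items, so getThreadsPerWG() rounds up to the thread count per work group and the
    // right execution mask disables the unused lanes of the last, partially filled thread.
    // Illustrative values: simd = 16, localWorkSize = 50 gives
    //   threadsPerWorkGroup = 4, remainderSimdLanes = 50 & 15 = 2, executionMask = 0b11.
    // When the size is an exact multiple of simd the remainder is 0 and a full mask is used.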
    auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
    pCmd->setThreadWidthCounterMaximum((uint32_t)threadsPerWorkGroup);

    pCmd->setThreadGroupIdXDimension((uint32_t)numWorkGroups[0]);
    pCmd->setThreadGroupIdYDimension((uint32_t)numWorkGroups[1]);
    pCmd->setThreadGroupIdZDimension((uint32_t)numWorkGroups[2]);

    // compute RightExecutionMask
    auto remainderSimdLanes = localWorkSize & (simd - 1);
    uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
    if (!executionMask)
        executionMask = ~executionMask;

    pCmd->setRightExecutionMask((uint32_t)executionMask);
    pCmd->setBottomExecutionMask((uint32_t)0xffffffff);
    pCmd->setSimdSize((typename GPGPU_WALKER::SIMD_SIZE)(simd >> 4));

    pCmd->setThreadGroupIdStartingX((uint32_t)startWorkGroups[0]);
    pCmd->setThreadGroupIdStartingY((uint32_t)startWorkGroups[1]);
    pCmd->setThreadGroupIdStartingResumeZ((uint32_t)startWorkGroups[2]);

    return localWorkSize;
}

inline cl_uint computeDimensions(const size_t workItems[3]) {
    return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;
}

void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);

template <typename SizeAndAllocCalcT, typename... CalcArgsT>
IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
    size_t alignment = MemoryConstants::pageSize;
    size_t size = calc(std::forward<CalcArgsT>(args)...);
    return new IndirectHeap(alignedMalloc(size, alignment), size);
}

template <typename GfxFamily>
void dispatchProfilingCommandsStart(
    HwTimeStamps &hwTimeStamps,
    OCLRT::LinearStream *commandStream) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    // PIPE_CONTROL for global timestamp
    uint64_t TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.GlobalStartTS));
    auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
    *pPipeControlCmd = PIPE_CONTROL::sInit();
    pPipeControlCmd->setCommandStreamerStallEnable(true);
    pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
    pPipeControlCmd->setAddress(static_cast<uint32_t>(TimeStampAddress & 0x0000FFFFFFFFULL));
    pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(TimeStampAddress >> 32));

    //MI_STORE_REGISTER_MEM for context local timestamp
    TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextStartTS));

    //low part
    auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
    *pMICmdLow = MI_STORE_REGISTER_MEM::sInit();
    pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
    pMICmdLow->setMemoryAddress(TimeStampAddress);

    //hi part
    TimeStampAddress += sizeof(uint32_t);
    auto pMICmdHigh = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
    *pMICmdHigh = MI_STORE_REGISTER_MEM::sInit();
    pMICmdHigh->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH);
    pMICmdHigh->setMemoryAddress(TimeStampAddress);
}

template <typename GfxFamily>
void dispatchProfilingCommandsEnd(
    HwTimeStamps &hwTimeStamps,
    OCLRT::LinearStream *commandStream) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    // PIPE_CONTROL for global timestamp
    auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
    *pPipeControlCmd = PIPE_CONTROL::sInit();
    pPipeControlCmd->setCommandStreamerStallEnable(true);

    //MI_STORE_REGISTER_MEM for context local timestamp
    uint64_t TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextEndTS));
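
    // The context timestamp register is wider than a single MI_STORE_REGISTER_MEM payload,
    // so it is captured as two 32-bit stores: the low half at TimeStampAddress and the high
    // half at TimeStampAddress + sizeof(uint32_t), mirroring dispatchProfilingCommandsStart.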
    //low part
    auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
    *pMICmdLow = MI_STORE_REGISTER_MEM::sInit();
    pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
    pMICmdLow->setMemoryAddress(TimeStampAddress);

    //hi part
    TimeStampAddress += sizeof(uint32_t);
    auto pMICmdHi = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
    *pMICmdHi = MI_STORE_REGISTER_MEM::sInit();
    pMICmdHi->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH);
    pMICmdHi->setMemoryAddress(TimeStampAddress);
}

template <typename GfxFamily>
void dispatchPerfCountersNoopidRegisterCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    bool start) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    uint64_t address = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.dmaFenceIdBegin))
                             : reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.dmaFenceIdEnd));

    auto pNoopIdRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
    *pNoopIdRegister = MI_STORE_REGISTER_MEM::sInit();
    pNoopIdRegister->setRegisterAddress(OCLRT::INSTR_MMIO_NOOPID);
    pNoopIdRegister->setMemoryAddress(address);
}

template <typename GfxFamily>
void dispatchPerfCountersReadFreqRegisterCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    bool start) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    uint64_t address = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.coreFreqBegin))
                             : reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.coreFreqEnd));

    auto pCoreFreqRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
    *pCoreFreqRegister = MI_STORE_REGISTER_MEM::sInit();
    pCoreFreqRegister->setRegisterAddress(OCLRT::INSTR_MMIO_RPSTAT1);
    pCoreFreqRegister->setMemoryAddress(address);
}

template <typename GfxFamily>
void dispatchPerfCountersGeneralPurposeCounterCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    bool start) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    uint64_t address = 0;
    const uint64_t baseAddress = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportBegin.gp))
                                       : reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportEnd.gp));

    // Read General Purpose counters
    for (uint16_t i = 0; i < OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
        auto pGeneralPurposeRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
        *pGeneralPurposeRegister = MI_STORE_REGISTER_MEM::sInit();
        uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint);
        pGeneralPurposeRegister->setRegisterAddress(regAddr);
        //Gp field is 2*uint64 wide so it can hold 4 uint32
        address = baseAddress + i * sizeof(cl_uint);
        pGeneralPurposeRegister->setMemoryAddress(address);
    }
}
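
// Stores the user-selected counter registers described by the perf counters configuration
// (ReadRegs). Registers reported as wider than 32 bits get a second MI_STORE_REGISTER_MEM
// for their upper dword.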
template <typename GfxFamily>
void dispatchPerfCountersUserCounterCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    bool start) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    uint64_t address = 0;
    const uint64_t baseAddr = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportBegin.user))
                                    : reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportEnd.user));
    uint32_t cmdNum = 0;
    uint32_t regAddr = 0;
    auto configData = commandQueue.getPerfCountersConfigData();
    auto userRegs = &configData->ReadRegs;

    for (uint32_t i = 0; i < userRegs->RegsCount; i++) {
        auto pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
        *pRegister = MI_STORE_REGISTER_MEM::sInit();

        regAddr = userRegs->Reg[i].Offset;
        pRegister->setRegisterAddress(regAddr);
        //offset between base (low) registers is cl_ulong wide
        address = baseAddr + i * sizeof(cl_ulong);
        pRegister->setMemoryAddress(address);
        cmdNum++;

        if (userRegs->Reg[i].BitSize > 32) {
            pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
            *pRegister = MI_STORE_REGISTER_MEM::sInit();

            regAddr += sizeof(cl_uint);
            pRegister->setRegisterAddress(regAddr);
            address += sizeof(cl_uint);
            pRegister->setMemoryAddress(address);
            cmdNum++;
        }
    }
}

template <typename GfxFamily>
void dispatchPerfCountersOABufferStateCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    uint64_t address = 0;

    //OA Status
    auto pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
    *pOaRegister = MI_STORE_REGISTER_MEM::sInit();
    pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.oaStatus));
    pOaRegister->setMemoryAddress(address);

    //OA Head
    pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
    *pOaRegister = MI_STORE_REGISTER_MEM::sInit();
    pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.oaHead));
    pOaRegister->setMemoryAddress(address);

    //OA Tail
    pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
    *pOaRegister = MI_STORE_REGISTER_MEM::sInit();
    pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.oaTail));
    pOaRegister->setMemoryAddress(address);
}

template <typename GfxFamily>
void dispatchPerfCountersCommandsStart(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;

    auto perfCounters = commandQueue.getPerfCounters();
    uint32_t currentReportId = perfCounters->getCurrentReportId();
    uint64_t address = 0;

    //flush command streamer
    auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
    *pPipeControlCmd = PIPE_CONTROL::sInit();
    pPipeControlCmd->setCommandStreamerStallEnable(true);

    //Store value of NOOPID register
    dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);

    //Read Core Frequency
    dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);

    dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
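
    // MI_REPORT_PERF_COUNT asks the hardware to dump an OA counter report into
    // reportBegin.oa; the report id tags the snapshot so begin/end reports can be paired.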
    auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT));
    *pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
    pReportPerfCount->setReportId(currentReportId);
    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportBegin.oa));
    pReportPerfCount->setMemoryAddress(address);

    //Timestamp: Global Start
    pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
    *pPipeControlCmd = PIPE_CONTROL::sInit();
    pPipeControlCmd->setCommandStreamerStallEnable(true);
    pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalStartTS));
    pPipeControlCmd->setAddress(static_cast<uint32_t>(address & ((uint64_t)UINT32_MAX)));
    pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));

    dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);

    commandQueue.sendPerfCountersConfig();
}

template <typename GfxFamily>
void dispatchPerfCountersCommandsEnd(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;

    auto perfCounters = commandQueue.getPerfCounters();
    uint32_t currentReportId = perfCounters->getCurrentReportId();
    uint64_t address = 0;

    //flush command streamer
    auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
    *pPipeControlCmd = PIPE_CONTROL::sInit();
    pPipeControlCmd->setCommandStreamerStallEnable(true);

    dispatchPerfCountersOABufferStateCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream);

    //Timestamp: Global End
    pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
    *pPipeControlCmd = PIPE_CONTROL::sInit();
    pPipeControlCmd->setCommandStreamerStallEnable(true);
    pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalEndTS));
    pPipeControlCmd->setAddress(static_cast<uint32_t>(address & ((uint64_t)UINT32_MAX)));
    pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));

    auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT));
    *pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
    pReportPerfCount->setReportId(currentReportId);
    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportEnd.oa));
    pReportPerfCount->setMemoryAddress(address);

    dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);

    //Store value of NOOPID register
    dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);

    //Read Core Frequency
    dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);

    dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);

    perfCounters->setCpuTimestamp();
}

template <typename GfxFamily>
void dispatchWalker(
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    KernelOperation **blockedCommandsData,
    HwTimeStamps *hwTimeStamps,
    OCLRT::HwPerfCounter *hwPerfCounter,
    bool blockQueue = false,
    unsigned int commandType = 0) {

    OCLRT::LinearStream *commandStream = nullptr;
    OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;
    bool executionModelKernel = multiDispatchInfo.begin()->getKernel()->isParentKernel;

    // Allocate command stream and indirect heaps
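    // For a blocked enqueue nothing may touch the queue's live heaps yet, so a private
    // command stream and private indirect heaps are allocated and handed over to
    // KernelOperation, to be submitted once the wait-list dependencies resolve.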
    if (blockQueue) {
        using KCH = KernelCommandsHelper<GfxFamily>;
        commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize), MemoryConstants::pageSize);
        if (executionModelKernel) {
            uint32_t offsetDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;

            dsh = allocateIndirectHeap([&multiDispatchInfo, offsetDsh] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo) + KCH::getTotalSizeRequiredIOH(multiDispatchInfo) + offsetDsh; });
            dsh->getSpace(colorCalcSize);
            ioh = dsh;
        } else {
            dsh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo); });
            ioh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIOH(multiDispatchInfo); });
        }
        ish = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIH(multiDispatchInfo); });
        ssh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredSSH(multiDispatchInfo); });

        using UniqueIH = std::unique_ptr<IndirectHeap>;
        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ish), UniqueIH(ioh), UniqueIH(ssh));
        if (executionModelKernel)
            (*blockedCommandsData)->doNotFreeISH = true;
    } else {
        commandStream = &commandQueue.getCS(0);
        if (executionModelKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
            commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
        }
        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
        ish = &getIndirectHeap<GfxFamily, IndirectHeap::INSTRUCTION>(commandQueue, multiDispatchInfo);
        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
    }

    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);

    const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
    uint32_t interfaceDescriptorIndex = 0;
    size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
    size_t numDispatches = multiDispatchInfo.size();
    totalInterfaceDescriptorTableSize *= numDispatches;

    if (!executionModelKernel) {
        dsh->getSpace(totalInterfaceDescriptorTableSize);
    } else {
        dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
    }

    // Program media interface descriptor load
    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
        *commandStream,
        offsetInterfaceDescriptorTable,
        totalInterfaceDescriptorTableSize);

    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);

    for (auto &dispatchInfo : multiDispatchInfo) {
        auto &kernel = *dispatchInfo.getKernel();

        DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));

        // Determine SIMD size
        uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();

        // If we don't have a required WGS, compute one opportunistically
        auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
        if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
            provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
        }

        //Get dispatch geometry
        uint32_t dim = dispatchInfo.getDim();
        Vec3<size_t> gws = dispatchInfo.getGWS();
        Vec3<size_t> offset = dispatchInfo.getOffset();
        Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();
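
        // If the application did not pass a local work size, the runtime generates one
        // (generateWorkgroupSize); the enqueued local size then defaults to the chosen value.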
        // Compute local workgroup sizes
        Vec3<size_t> lws = (dispatchInfo.getLocalWorkgroupSize().x > 0) ? dispatchInfo.getLocalWorkgroupSize() : generateWorkgroupSize(dispatchInfo);
        Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;

        // Compute number of work groups
        Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws);
        Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;

        // Patch our kernel constants
        *kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
        *kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
        *kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);

        *kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
        *kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
        *kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);

        if ((&dispatchInfo == &*multiDispatchInfo.begin()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
            *kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
            *kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
            *kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
        }

        *kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
        *kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
        *kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);

        *kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
        *kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
        *kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);

        if (&dispatchInfo == &*multiDispatchInfo.begin()) {
            *kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);
            *kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);
            *kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);
        }

        *kernel.workDim = dim;

        // Send our indirect object data
        size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};

        auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
            *commandStream,
            *dsh,
            *ish,
            *ioh,
            *ssh,
            kernel,
            simd,
            localWorkSizes,
            offsetInterfaceDescriptorTable,
            interfaceDescriptorIndex);
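
        // sendIndirectState populated the heaps for this dispatch (interface descriptor,
        // cross-thread data, per-thread local IDs, surface states); the returned offset of the
        // cross-thread data becomes the walker's indirect data start address below.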
        if (&dispatchInfo == &*multiDispatchInfo.begin()) {
            // If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
            if (hwTimeStamps != nullptr) {
                dispatchProfilingCommandsStart<GfxFamily>(*hwTimeStamps, commandStream);
            }
            if (hwPerfCounter != nullptr) {
                dispatchPerfCountersCommandsStart<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
            }
        }

        PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());

        // Implement enabling special WA DisableLSQCROPERFforOCL if needed
        applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, true);

        // Program the walker. Invokes execution so all state should already be programmed
        typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
        auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
        *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;

        size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
        size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
        size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
        auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);

        pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
        DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
        pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);

        auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
        DEBUG_BREAK_IF(nullptr == threadPayload);

        auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
        auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
        localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));

        auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
        DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group

        auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
        auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
        pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);

        // Implement disabling special WA DisableLSQCROPERFforOCL if needed
        applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, false);

        PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
    }

    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
    if (hwTimeStamps != nullptr) {
        dispatchProfilingCommandsEnd<GfxFamily>(*hwTimeStamps, commandStream);
    }
    if (hwPerfCounter != nullptr) {
        dispatchPerfCountersCommandsEnd<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
    }
}

template <typename GfxFamily>
void dispatchWalker(
    CommandQueue &commandQueue,
    const Kernel &kernel,
    cl_uint workDim,
    const size_t globalOffsets[3],
    const size_t workItems[3],
    const size_t *localWorkSizesIn,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    KernelOperation **blockedCommandsData,
    HwTimeStamps *hwTimeStamps,
    HwPerfCounter *hwPerfCounter,
    bool blockQueue = false) {

    DispatchInfo dispatchInfo(const_cast<Kernel *>(&kernel), workDim, workItems, localWorkSizesIn, globalOffsets);
    dispatchWalker<GfxFamily>(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList,
                              blockedCommandsData, hwTimeStamps, hwPerfCounter, blockQueue);
}
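
// Dispatches the built-in scheduler kernel used by device-side enqueue (the execution model).
// The scheduler runs out of the device queue's dynamic state heap and may chain back into the
// queue's SLB through a second-level MI_BATCH_BUFFER_START, as programmed at the end of this
// function.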
template <typename GfxFamily>
void dispatchScheduler(
    CommandQueue &commandQueue,
    DeviceQueueHw<GfxFamily> &devQueueHw,
    SchedulerKernel &scheduler) {
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

    OCLRT::LinearStream *commandStream = nullptr;
    OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;

    commandStream = &commandQueue.getCS(0);
    // note : below code assumes that caller to dispatchScheduler "preallocated" memory
    //        required for execution model in below heap managers
    dsh = devQueueHw.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
    ish = &commandQueue.getIndirectHeap(IndirectHeap::INSTRUCTION);
    ssh = &commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE);

    bool dcFlush = false;
    commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);

    uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
    const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
    const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
    const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);

    // Program media interface descriptor load
    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
        *commandStream,
        offsetInterfaceDescriptor,
        totalInterfaceDescriptorTableSize);

    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);

    // Determine SIMD size
    uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
    DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);

    // Patch our kernel constants
    *scheduler.globalWorkOffsetX = 0;
    *scheduler.globalWorkOffsetY = 0;
    *scheduler.globalWorkOffsetZ = 0;

    *scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
    *scheduler.globalWorkSizeY = 1;
    *scheduler.globalWorkSizeZ = 1;

    *scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
    *scheduler.localWorkSizeY = 1;
    *scheduler.localWorkSizeZ = 1;

    *scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
    *scheduler.localWorkSizeY2 = 1;
    *scheduler.localWorkSizeZ2 = 1;

    *scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
    *scheduler.enqueuedLocalWorkSizeY = 1;
    *scheduler.enqueuedLocalWorkSizeZ = 1;

    *scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
    *scheduler.numWorkGroupsY = 0;
    *scheduler.numWorkGroupsZ = 0;

    *scheduler.workDim = 1;

    // Send our indirect object data
    size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};

    // Create indirectHeap for IOH that is located at the end of device enqueue DSH
    size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
    IndirectHeap indirectObjectHeap(dsh->getBase(), dsh->getMaxAvailableSpace());
    indirectObjectHeap.getSpace(curbeOffset);
    ioh = &indirectObjectHeap;

    auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
        *commandStream,
        *dsh,
        *ish,
        *ioh,
        *ssh,
        scheduler,
        simd,
        localWorkSizes,
        offsetInterfaceDescriptorTable,
        interfaceDescriptorIndex);

    // Implement enabling special WA DisableLSQCROPERFforOCL if needed
    applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, true);
    // Program the walker. Invokes execution so all state should already be programmed
    auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
    *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
    auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);

    pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
    DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
    pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex);

    auto threadPayload = scheduler.getKernelInfo().patchInfo.threadPayload;
    DEBUG_BREAK_IF(nullptr == threadPayload);

    auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
    auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
    localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));

    auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
    DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group

    auto sizeCrossThreadData = scheduler.getCrossThreadDataSize();
    auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
    pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);

    // Implement disabling special WA DisableLSQCROPERFforOCL if needed
    applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, false);

    // Do not put BB_START only when returning in first Scheduler run
    if (devQueueHw.getSchedulerReturnInstance() != 1) {
        commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, true);

        // Add BB Start Cmd to the SLB in the Primary Batch Buffer
        auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace(sizeof(MI_BATCH_BUFFER_START));
        *bbStart = MI_BATCH_BUFFER_START::sInit();
        bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
        uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
        bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
    }
}

template <typename GfxFamily, unsigned int eventType>
struct EnqueueOperation {
    static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
    static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
    static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");
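
    // Worst-case command-stream space estimators: getCS() reserves this much room before any
    // commands are recorded. The profiling and perf-counter terms mirror the commands emitted
    // by dispatchProfilingCommands* and dispatchPerfCounters* above.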
    static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
        size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
                      sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
        if (reserveProfilingCmdsSpace) {
            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
        }
        if (reservePerfCounters) {
            //start cmds
            //P_C: flush CS & TimeStamp BEGIN
            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
            //SRM NOOPID & Frequency
            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //gp registers
            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //report perf count
            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
            //user registers
            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);

            //end cmds
            //P_C: flush CS & TimeStamp END
            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
            //OA buffer (status head, tail)
            size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //report perf count
            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
            //gp registers
            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //SRM NOOPID & Frequency
            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //user registers
            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
        }
        Device &device = commandQueue.getDevice();
        for (auto &dispatchInfo : multiDispatchInfo) {
            auto &kernel = *dispatchInfo.getKernel();
            size += sizeof(typename GfxFamily::GPGPU_WALKER);
            size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(&kernel);
            size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(device);
        }
        return size;
    }
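
    // Per-kernel variant of the estimate above; its profiling and perf-counter reservations
    // intentionally match getTotalSizeRequiredCS.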
    static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
        size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
                      sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
        size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
        if (reserveProfilingCmdsSpace) {
            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
        }
        if (reservePerfCounters) {
            //start cmds
            //P_C: flush CS & TimeStamp BEGIN
            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
            //SRM NOOPID & Frequency
            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //gp registers
            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //report perf count
            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
            //user registers
            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);

            //end cmds
            //P_C: flush CS & TimeStamp END
            size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
            //OA buffer (status head, tail)
            size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //report perf count
            size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
            //gp registers
            size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //SRM NOOPID & Frequency
            size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
            //user registers
            size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
        }
        size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
        return size;
    }
};

template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
    auto expectedSizeCS = EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel);
    return commandQueue.getCS(expectedSizeCS);
}
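
// Multi-dispatch variant: sums the per-kernel estimate over all dispatch infos and, for parent
// kernels (device-side enqueue), additionally reserves space for the scheduler kernel.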
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
    size_t expectedSizeCS = 0;
    Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;
    for (auto &dispatchInfo : multiDispatchInfo) {
        expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
    }
    if (parentKernel && parentKernel->isParentKernel) {
        SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(parentKernel->getContext());
        expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler);
    }
    return commandQueue.getCS(expectedSizeCS);
}

template <typename GfxFamily, IndirectHeap::Type heapType>
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
    size_t expectedSize = 0;
    IndirectHeap *ih = nullptr;

    // clang-format off
    switch(heapType) {
    case IndirectHeap::DYNAMIC_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
    case IndirectHeap::INSTRUCTION:     expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIH(multiDispatchInfo);  break;
    case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
    case IndirectHeap::SURFACE_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
    }
    // clang-format on

    if (multiDispatchInfo.begin()->getKernel()->isParentKernel) {
        if (heapType == IndirectHeap::INSTRUCTION || heapType == IndirectHeap::SURFACE_STATE) {
            expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
        } else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
        {
            DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
            DEBUG_BREAK_IF(pDevQueue == nullptr);
            ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        }
    }

    if (ih == nullptr)
        ih = &commandQueue.getIndirectHeap(heapType, expectedSize);

    return *ih;
}

} // namespace OCLRT