/* * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include "runtime/command_stream/linear_stream.h" #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/command_queue/command_queue.h" #include "runtime/command_queue/enqueue_common.h" #include "runtime/device/device.h" #include "runtime/device_queue/device_queue.h" #include "runtime/gtpin/gtpin_notify.h" #include "runtime/mem_obj/mem_obj.h" #include "runtime/memory_manager/surface.h" #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/string.h" #include "runtime/helpers/task_information.h" namespace OCLRT { KernelOperation::~KernelOperation() { alignedFree(dsh->getCpuBase()); alignedFree(ish->getCpuBase()); if (doNotFreeISH) { ioh.release(); } else { alignedFree(ioh->getCpuBase()); } alignedFree(ssh->getCpuBase()); alignedFree(commandStream->getCpuBase()); } CommandMapUnmap::CommandMapUnmap(MapOperationType op, MemObj &memObj, MemObjSizeArray ©Size, MemObjOffsetArray ©Offset, bool readOnly, CommandStreamReceiver &csr, CommandQueue &cmdQ) : memObj(memObj), copySize(copySize), copyOffset(copyOffset), readOnly(readOnly), csr(csr), cmdQ(cmdQ), op(op) { memObj.incRefInternal(); } CommandMapUnmap::~CommandMapUnmap() { memObj.decRefInternal(); } CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) { if (terminated) { return completionStamp; } bool blocking = true; TakeOwnershipWrapper deviceOwnership(cmdQ.getDevice()); auto &queueCommandStream = cmdQ.getCS(0); size_t offset = queueCommandStream.getUsed(); DispatchFlags dispatchFlags; dispatchFlags.blocking = blocking; dispatchFlags.dcFlush = true; dispatchFlags.useSLM = true; dispatchFlags.guardCommandBufferWithPipeControl = true; dispatchFlags.lowPriority = cmdQ.getPriority() == QueuePriority::LOW; dispatchFlags.throttle = cmdQ.getThrottle(); dispatchFlags.preemptionMode = PreemptionHelper::taskPreemptionMode(cmdQ.getDevice(), nullptr); DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); gtpinNotifyPreFlushTask(&cmdQ); completionStamp = csr.flushTask(queueCommandStream, offset, cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE), cmdQ.getIndirectHeap(IndirectHeap::INSTRUCTION), cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT), cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE), taskLevel, dispatchFlags); cmdQ.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp); if (!memObj.isMemObjZeroCopy()) { if (op == MAP) { memObj.transferDataToHostPtr(copySize, copyOffset); } else if (!readOnly) { DEBUG_BREAK_IF(op != UNMAP); memObj.transferDataFromHostPtr(copySize, copyOffset); } } return completionStamp; } CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, CommandStreamReceiver &commandStreamReceiver, std::unique_ptr kernelOperation, std::vector &surfaces, bool flushDC, bool usesSLM, bool ndRangeKernel, std::unique_ptr printfHandler, PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount) : commandQueue(commandQueue), commandStreamReceiver(commandStreamReceiver), kernelOperation(std::move(kernelOperation)), flushDC(flushDC), slmUsed(usesSLM), NDRangeKernel(ndRangeKernel), printfHandler(std::move(printfHandler)), kernel(nullptr), kernelCount(0) { for (auto surface : surfaces) { this->surfaces.push_back(surface); } this->kernel = kernel; if (kernel) { kernel->incRefInternal(); } this->kernelCount = kernelCount; this->preemptionMode = preemptionMode; } CommandComputeKernel::~CommandComputeKernel() { for (auto surface : surfaces) { delete surface; } surfaces.clear(); if (kernelOperation->ioh.get() == kernelOperation->dsh.get()) { kernelOperation->doNotFreeISH = true; } if (kernel) { kernel->decRefInternal(); } } CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminated) { if (terminated) { return completionStamp; } bool executionModelKernel = kernel != nullptr ? kernel->isParentKernel : false; auto devQueue = commandQueue.getContext().getDefaultDeviceQueue(); TakeOwnershipWrapper deviceOwnership(commandQueue.getDevice()); if (executionModelKernel) { while (!devQueue->isEMCriticalSectionFree()) ; devQueue->resetDeviceQueue(); devQueue->acquireEMCriticalSection(); } auto &commandStream = *kernelOperation->commandStream; size_t commandsSize = commandStream.getUsed(); auto &queueCommandStream = commandQueue.getCS(commandStream.getUsed()); size_t offset = queueCommandStream.getUsed(); void *pDst = queueCommandStream.getSpace(commandsSize); //transfer the memory to commandStream of the queue. memcpy_s(pDst, commandsSize, commandStream.getCpuBase(), commandsSize); size_t requestedDshSize = kernelOperation->dsh->getUsed(); size_t requestedIshSize = kernelOperation->ish->getUsed() + kernelOperation->instructionHeapSizeEM; size_t requestedIohSize = kernelOperation->ioh->getUsed(); size_t requestedSshSize = kernelOperation->ssh->getUsed() + kernelOperation->surfaceStateHeapSizeEM; IndirectHeap *dsh = nullptr; IndirectHeap *ioh = nullptr; IndirectHeap::Type trackedHeaps[] = {IndirectHeap::SURFACE_STATE, IndirectHeap::INDIRECT_OBJECT, IndirectHeap::DYNAMIC_STATE}; for (auto trackedHeap = 0u; trackedHeap < ARRAY_COUNT(trackedHeaps); trackedHeap++) { if (commandQueue.getIndirectHeap(trackedHeaps[trackedHeap], 0).getUsed() > 0) { commandQueue.releaseIndirectHeap(trackedHeaps[trackedHeap]); } } if (commandQueue.getIndirectHeap(IndirectHeap::INSTRUCTION, 0).getUsed() > commandQueue.getInstructionHeapReservedBlockSize()) { commandQueue.releaseIndirectHeap(IndirectHeap::INSTRUCTION); } if (executionModelKernel) { dsh = devQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE); // In ExecutionModel IOH is the same as DSH to eliminate StateBaseAddress reprogramming for scheduler kernel and blocks. ioh = dsh; memcpy_s(dsh->getSpace(0), dsh->getAvailableSpace(), ptrOffset(kernelOperation->dsh->getCpuBase(), devQueue->colorCalcStateSize), kernelOperation->dsh->getUsed() - devQueue->colorCalcStateSize); dsh->getSpace(kernelOperation->dsh->getUsed() - devQueue->colorCalcStateSize); } else { dsh = &commandQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, requestedDshSize); ioh = &commandQueue.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, requestedIohSize); memcpy_s(dsh->getCpuBase(), requestedDshSize, kernelOperation->dsh->getCpuBase(), kernelOperation->dsh->getUsed()); dsh->getSpace(requestedDshSize); memcpy_s(ioh->getCpuBase(), requestedIohSize, kernelOperation->ioh->getCpuBase(), kernelOperation->ioh->getUsed()); ioh->getSpace(requestedIohSize); } IndirectHeap &ish = commandQueue.getIndirectHeap(IndirectHeap::INSTRUCTION, requestedIshSize); IndirectHeap &ssh = commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, requestedSshSize); memcpy_s(ptrOffset(ish.getCpuBase(), commandQueue.getInstructionHeapReservedBlockSize()), requestedIshSize, kernelOperation->ish->getCpuBase(), kernelOperation->ish->getUsed()); ish.getSpace(kernelOperation->ish->getUsed()); memcpy_s(ssh.getCpuBase(), requestedSshSize, kernelOperation->ssh->getCpuBase(), kernelOperation->ssh->getUsed()); ssh.getSpace(kernelOperation->ssh->getUsed()); auto requiresCoherency = false; for (auto &surface : surfaces) { DEBUG_BREAK_IF(!surface); surface->makeResident(commandStreamReceiver); requiresCoherency |= surface->IsCoherent; } if (printfHandler) { printfHandler.get()->makeResident(commandStreamReceiver); } if (executionModelKernel) { uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1; devQueue->setupExecutionModelDispatch(ish, ssh, kernel, kernelCount, taskCount, timestamp); BuiltIns &builtIns = BuiltIns::getInstance(); SchedulerKernel &scheduler = builtIns.getSchedulerKernel(commandQueue.getContext()); scheduler.setArgs(devQueue->getQueueBuffer(), devQueue->getStackBuffer(), devQueue->getEventPoolBuffer(), devQueue->getSlbBuffer(), devQueue->getDshBuffer(), kernel->getKernelReflectionSurface(), devQueue->getQueueStorageBuffer(), ssh.getGraphicsAllocation(), devQueue->getDebugQueue()); devQueue->dispatchScheduler( commandQueue, scheduler, preemptionMode); scheduler.makeResident(commandStreamReceiver); // Update SLM usage slmUsed |= scheduler.slmTotalSize > 0; } DispatchFlags dispatchFlags; dispatchFlags.blocking = true; dispatchFlags.dcFlush = flushDC; dispatchFlags.useSLM = slmUsed; dispatchFlags.guardCommandBufferWithPipeControl = true; dispatchFlags.GSBA32BitRequired = NDRangeKernel; dispatchFlags.requiresCoherency = requiresCoherency; dispatchFlags.lowPriority = commandQueue.getPriority() == QueuePriority::LOW; dispatchFlags.throttle = commandQueue.getThrottle(); dispatchFlags.preemptionMode = preemptionMode; DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); gtpinNotifyPreFlushTask(&commandQueue); completionStamp = commandStreamReceiver.flushTask(queueCommandStream, offset, *dsh, ish, *ioh, ssh, taskLevel, dispatchFlags); for (auto &surface : surfaces) { surface->setCompletionStamp(completionStamp, nullptr, nullptr); } commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp); if (printfHandler) { printfHandler.get()->printEnqueueOutput(); } return completionStamp; } CompletionStamp &CommandMarker::submit(uint32_t taskLevel, bool terminated) { if (terminated) { return completionStamp; } bool blocking = true; TakeOwnershipWrapper deviceOwnership(cmdQ.getDevice()); auto &queueCommandStream = cmdQ.getCS(this->commandSize); size_t offset = queueCommandStream.getUsed(); DispatchFlags dispatchFlags; dispatchFlags.blocking = blocking; dispatchFlags.dcFlush = shouldFlushDC(clCommandType, nullptr); dispatchFlags.lowPriority = cmdQ.getPriority() == QueuePriority::LOW; dispatchFlags.throttle = cmdQ.getThrottle(); dispatchFlags.preemptionMode = PreemptionHelper::taskPreemptionMode(cmdQ.getDevice(), nullptr); DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); gtpinNotifyPreFlushTask(&cmdQ); completionStamp = csr.flushTask(queueCommandStream, offset, cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE), cmdQ.getIndirectHeap(IndirectHeap::INSTRUCTION), cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT), cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE), taskLevel, dispatchFlags); cmdQ.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp); return completionStamp; } } // namespace OCLRT