/* * Copyright (C) 2017-2019 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "core/memory_manager/unified_memory_manager.h" #include "runtime/built_ins/built_ins.h" #include "runtime/command_queue/command_queue_hw.h" #include "runtime/command_queue/enqueue_common.h" #include "runtime/event/event.h" #include "runtime/memory_manager/surface.h" #include namespace NEO { using SvmFreeClbT = void(CL_CALLBACK *)(cl_command_queue queue, cl_uint numSvmPointers, void *svmPointers[], void *userData); struct SvmFreeUserData { cl_uint numSvmPointers; void **svmPointers; SvmFreeClbT clb; void *userData; bool ownsEventDeletion; SvmFreeUserData(cl_uint numSvmPointers, void **svmPointers, SvmFreeClbT clb, void *userData, bool ownsEventDeletion) : numSvmPointers(numSvmPointers), svmPointers(svmPointers), clb(clb), userData(userData), ownsEventDeletion(ownsEventDeletion){}; }; inline void CL_CALLBACK freeSvmEventClb(cl_event event, cl_int commandExecCallbackType, void *usrData) { auto freeDt = reinterpret_cast(usrData); auto eventObject = castToObjectOrAbort(event); if (freeDt->clb == nullptr) { auto ctx = eventObject->getContext(); for (cl_uint i = 0; i < freeDt->numSvmPointers; i++) { castToObjectOrAbort(ctx)->getSVMAllocsManager()->freeSVMAlloc(freeDt->svmPointers[i]); } } else { freeDt->clb(eventObject->getCommandQueue(), freeDt->numSvmPointers, freeDt->svmPointers, freeDt->userData); } if (freeDt->ownsEventDeletion) { castToObjectOrAbort(event)->release(); } delete freeDt; } template cl_int CommandQueueHw::enqueueSVMMap(cl_bool blockingMap, cl_map_flags mapFlags, void *svmPtr, size_t size, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool externalAppCall) { auto svmData = context->getSVMAllocsManager()->getSVMAlloc(svmPtr); if (svmData == nullptr) { return CL_INVALID_VALUE; } bool blocking = blockingMap == CL_TRUE; if (svmData->gpuAllocation->getAllocationType() == GraphicsAllocation::AllocationType::SVM_ZERO_COPY) { NullSurface s; Surface *surfaces[] = {&s}; if (context->isProvidingPerformanceHints()) { context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL, CL_ENQUEUE_SVM_MAP_DOESNT_REQUIRE_COPY_DATA, svmPtr); } enqueueHandler(surfaces, blocking, MultiDispatchInfo(), numEventsInWaitList, eventWaitList, event); return CL_SUCCESS; } else { auto svmOperation = context->getSVMAllocsManager()->getSvmMapOperation(svmPtr); if (svmOperation) { NullSurface s; Surface *surfaces[] = {&s}; enqueueHandler(surfaces, blocking, MultiDispatchInfo(), numEventsInWaitList, eventWaitList, event); return CL_SUCCESS; } MultiDispatchInfo dispatchInfo; auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer, this->getContext(), this->getDevice()); BuiltInOwnershipWrapper builtInLock(builder, this->context); GeneralSurface dstSurface(svmData->cpuAllocation); GeneralSurface srcSurface(svmData->gpuAllocation); Surface *surfaces[] = {&dstSurface, &srcSurface}; void *svmBasePtr = svmData->cpuAllocation->getUnderlyingBuffer(); size_t svmOffset = ptrDiff(svmPtr, svmBasePtr); BuiltinOpParams dc; dc.dstPtr = reinterpret_cast(svmData->cpuAllocation->getGpuAddressToPatch()); dc.dstSvmAlloc = svmData->cpuAllocation; dc.dstOffset = {svmOffset, 0, 0}; dc.srcPtr = reinterpret_cast(svmData->gpuAllocation->getGpuAddressToPatch()); dc.srcSvmAlloc = svmData->gpuAllocation; dc.srcOffset = {svmOffset, 0, 0}; dc.size = {size, 0, 0}; dc.unifiedMemoryArgsRequireMemSync = externalAppCall; builder.buildDispatchInfos(dispatchInfo, dc); enqueueHandler( surfaces, blocking, dispatchInfo, numEventsInWaitList, eventWaitList, event); if (event) { castToObjectOrAbort(*event)->setCmdType(CL_COMMAND_SVM_MAP); } bool readOnlyMap = isValueSet(mapFlags, CL_MAP_READ); context->getSVMAllocsManager()->insertSvmMapOperation(svmPtr, size, svmBasePtr, svmOffset, readOnlyMap); dispatchInfo.backupUnifiedMemorySyncRequirement(); return CL_SUCCESS; } } template cl_int CommandQueueHw::enqueueSVMUnmap(void *svmPtr, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool externalAppCall) { auto svmData = context->getSVMAllocsManager()->getSVMAlloc(svmPtr); if (svmData == nullptr) { return CL_INVALID_VALUE; } if (svmData->gpuAllocation->getAllocationType() == GraphicsAllocation::AllocationType::SVM_ZERO_COPY) { NullSurface s; Surface *surfaces[] = {&s}; enqueueHandler(surfaces, false, MultiDispatchInfo(), numEventsInWaitList, eventWaitList, event); return CL_SUCCESS; } else { auto svmOperation = context->getSVMAllocsManager()->getSvmMapOperation(svmPtr); if (!svmOperation) { NullSurface s; Surface *surfaces[] = {&s}; enqueueHandler(surfaces, false, MultiDispatchInfo(), numEventsInWaitList, eventWaitList, event); return CL_SUCCESS; } if (svmOperation->readOnlyMap) { NullSurface s; Surface *surfaces[] = {&s}; enqueueHandler(surfaces, false, MultiDispatchInfo(), numEventsInWaitList, eventWaitList, event); context->getSVMAllocsManager()->removeSvmMapOperation(svmPtr); return CL_SUCCESS; } MultiDispatchInfo dispatchInfo; auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer, this->getContext(), this->getDevice()); BuiltInOwnershipWrapper builtInLock(builder, this->context); GeneralSurface dstSurface(svmData->gpuAllocation); GeneralSurface srcSurface(svmData->cpuAllocation); Surface *surfaces[] = {&dstSurface, &srcSurface}; BuiltinOpParams dc; dc.dstPtr = reinterpret_cast(svmData->gpuAllocation->getGpuAddressToPatch()); dc.dstSvmAlloc = svmData->gpuAllocation; dc.dstOffset = {svmOperation->offset, 0, 0}; dc.srcPtr = reinterpret_cast(svmData->cpuAllocation->getGpuAddressToPatch()); dc.srcSvmAlloc = svmData->cpuAllocation; dc.srcOffset = {svmOperation->offset, 0, 0}; dc.size = {svmOperation->regionSize, 0, 0}; dc.unifiedMemoryArgsRequireMemSync = externalAppCall; builder.buildDispatchInfos(dispatchInfo, dc); enqueueHandler( surfaces, false, dispatchInfo, numEventsInWaitList, eventWaitList, event); if (event) { castToObjectOrAbort(*event)->setCmdType(CL_COMMAND_SVM_UNMAP); } context->getSVMAllocsManager()->removeSvmMapOperation(svmPtr); dispatchInfo.backupUnifiedMemorySyncRequirement(); return CL_SUCCESS; } } template cl_int CommandQueueHw::enqueueSVMFree(cl_uint numSvmPointers, void *svmPointers[], SvmFreeClbT clb, void *userData, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *retEvent) { cl_event event = nullptr; bool ownsEventDeletion = false; if (retEvent == nullptr) { ownsEventDeletion = true; retEvent = &event; } SvmFreeUserData *pFreeData = new SvmFreeUserData(numSvmPointers, svmPointers, clb, userData, ownsEventDeletion); NullSurface s; Surface *surfaces[] = {&s}; enqueueHandler(surfaces, false, MultiDispatchInfo(), numEventsInWaitList, eventWaitList, retEvent); auto eventObject = castToObjectOrAbort(*retEvent); eventObject->addCallback(freeSvmEventClb, CL_COMPLETE, pFreeData); return CL_SUCCESS; } inline void setOperationParams(BuiltinOpParams &operationParams, size_t size, const void *srcPtr, GraphicsAllocation *srcSvmAlloc, void *dstPtr, GraphicsAllocation *dstSvmAlloc) { operationParams.size = {size, 0, 0}; operationParams.srcPtr = const_cast(alignDown(srcPtr, 4)); operationParams.srcSvmAlloc = srcSvmAlloc; operationParams.srcOffset = {ptrDiff(srcPtr, operationParams.srcPtr), 0, 0}; operationParams.dstPtr = alignDown(dstPtr, 4); operationParams.dstSvmAlloc = dstSvmAlloc; operationParams.dstOffset = {ptrDiff(dstPtr, operationParams.dstPtr), 0, 0}; } template cl_int CommandQueueHw::enqueueSVMMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { if ((dstPtr == nullptr) || (srcPtr == nullptr)) { return CL_INVALID_VALUE; } auto dstSvmData = context->getSVMAllocsManager()->getSVMAlloc(dstPtr); auto srcSvmData = context->getSVMAllocsManager()->getSVMAlloc(srcPtr); enum CopyType { HostToHost, SvmToHost, HostToSvm, SvmToSvm }; CopyType copyType = HostToHost; if ((srcSvmData != nullptr) && (dstSvmData != nullptr)) { copyType = SvmToSvm; } else if ((srcSvmData == nullptr) && (dstSvmData != nullptr)) { copyType = HostToSvm; } else if (srcSvmData != nullptr) { copyType = SvmToHost; } auto pageFaultManager = context->getMemoryManager()->getPageFaultManager(); if (dstSvmData && pageFaultManager) { pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast(dstSvmData->gpuAllocation->getGpuAddress())); } if (srcSvmData && pageFaultManager) { pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast(srcSvmData->gpuAllocation->getGpuAddress())); } auto isStatelessRequired = false; if (srcSvmData != nullptr) { isStatelessRequired = forceStateless(srcSvmData->size); } if (dstSvmData != nullptr) { isStatelessRequired |= forceStateless(dstSvmData->size); } auto builtInType = EBuiltInOps::CopyBufferToBuffer; if (isStatelessRequired) { builtInType = EBuiltInOps::CopyBufferToBufferStateless; } auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(builtInType, this->getContext(), this->getDevice()); BuiltInOwnershipWrapper builtInLock(builder, this->context); MultiDispatchInfo dispatchInfo; BuiltinOpParams operationParams; Surface *surfaces[2]; if (copyType == SvmToHost) { GeneralSurface srcSvmSurf(srcSvmData->gpuAllocation); HostPtrSurface dstHostPtrSurf(dstPtr, size); if (size != 0) { bool status = getGpgpuCommandStreamReceiver().createAllocationForHostSurface(dstHostPtrSurf, true); if (!status) { return CL_OUT_OF_RESOURCES; } dstPtr = reinterpret_cast(dstHostPtrSurf.getAllocation()->getGpuAddress()); } setOperationParams(operationParams, size, srcPtr, srcSvmData->gpuAllocation, dstPtr, dstHostPtrSurf.getAllocation()); surfaces[0] = &srcSvmSurf; surfaces[1] = &dstHostPtrSurf; builder.buildDispatchInfos(dispatchInfo, operationParams); enqueueHandler( surfaces, blockingCopy == CL_TRUE, dispatchInfo, numEventsInWaitList, eventWaitList, event); } else if (copyType == HostToSvm) { HostPtrSurface srcHostPtrSurf(const_cast(srcPtr), size); GeneralSurface dstSvmSurf(dstSvmData->gpuAllocation); if (size != 0) { bool status = getGpgpuCommandStreamReceiver().createAllocationForHostSurface(srcHostPtrSurf, false); if (!status) { return CL_OUT_OF_RESOURCES; } srcPtr = reinterpret_cast(srcHostPtrSurf.getAllocation()->getGpuAddress()); } setOperationParams(operationParams, size, srcPtr, srcHostPtrSurf.getAllocation(), dstPtr, dstSvmData->gpuAllocation); surfaces[0] = &dstSvmSurf; surfaces[1] = &srcHostPtrSurf; builder.buildDispatchInfos(dispatchInfo, operationParams); enqueueHandler( surfaces, blockingCopy == CL_TRUE, dispatchInfo, numEventsInWaitList, eventWaitList, event); } else if (copyType == SvmToSvm) { GeneralSurface srcSvmSurf(srcSvmData->gpuAllocation); GeneralSurface dstSvmSurf(dstSvmData->gpuAllocation); setOperationParams(operationParams, size, srcPtr, srcSvmData->gpuAllocation, dstPtr, dstSvmData->gpuAllocation); surfaces[0] = &srcSvmSurf; surfaces[1] = &dstSvmSurf; builder.buildDispatchInfos(dispatchInfo, operationParams); enqueueHandler( surfaces, blockingCopy ? true : false, dispatchInfo, numEventsInWaitList, eventWaitList, event); } else { HostPtrSurface srcHostPtrSurf(const_cast(srcPtr), size); HostPtrSurface dstHostPtrSurf(dstPtr, size); if (size != 0) { bool status = getGpgpuCommandStreamReceiver().createAllocationForHostSurface(srcHostPtrSurf, false); status &= getGpgpuCommandStreamReceiver().createAllocationForHostSurface(dstHostPtrSurf, true); if (!status) { return CL_OUT_OF_RESOURCES; } srcPtr = reinterpret_cast(srcHostPtrSurf.getAllocation()->getGpuAddress()); dstPtr = reinterpret_cast(dstHostPtrSurf.getAllocation()->getGpuAddress()); } setOperationParams(operationParams, size, srcPtr, srcHostPtrSurf.getAllocation(), dstPtr, dstHostPtrSurf.getAllocation()); surfaces[0] = &srcHostPtrSurf; surfaces[1] = &dstHostPtrSurf; builder.buildDispatchInfos(dispatchInfo, operationParams); enqueueHandler( surfaces, blockingCopy ? true : false, dispatchInfo, numEventsInWaitList, eventWaitList, event); } if (event) { auto pEvent = castToObjectOrAbort(*event); pEvent->setCmdType(CL_COMMAND_SVM_MEMCPY); } return CL_SUCCESS; } template cl_int CommandQueueHw::enqueueSVMMemFill(void *svmPtr, const void *pattern, size_t patternSize, size_t size, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { auto svmData = context->getSVMAllocsManager()->getSVMAlloc(svmPtr); if (svmData == nullptr) { return CL_INVALID_VALUE; } auto memoryManager = context->getMemoryManager(); DEBUG_BREAK_IF(nullptr == memoryManager); auto pageFaultManager = memoryManager->getPageFaultManager(); if (pageFaultManager) { pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast(svmData->gpuAllocation->getGpuAddress())); } auto commandStreamReceieverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership(); auto storageWithAllocations = getGpgpuCommandStreamReceiver().getInternalAllocationStorage(); auto allocationType = GraphicsAllocation::AllocationType::FILL_PATTERN; auto patternAllocation = storageWithAllocations->obtainReusableAllocation(patternSize, allocationType).release(); commandStreamReceieverOwnership.unlock(); if (!patternAllocation) { patternAllocation = memoryManager->allocateGraphicsMemoryWithProperties({getDevice().getRootDeviceIndex(), patternSize, allocationType}); } if (patternSize == 1) { int patternInt = (uint32_t)((*(uint8_t *)pattern << 24) | (*(uint8_t *)pattern << 16) | (*(uint8_t *)pattern << 8) | *(uint8_t *)pattern); memcpy_s(patternAllocation->getUnderlyingBuffer(), sizeof(int), &patternInt, sizeof(int)); } else if (patternSize == 2) { int patternInt = (uint32_t)((*(uint16_t *)pattern << 16) | *(uint16_t *)pattern); memcpy_s(patternAllocation->getUnderlyingBuffer(), sizeof(int), &patternInt, sizeof(int)); } else { memcpy_s(patternAllocation->getUnderlyingBuffer(), patternSize, pattern, patternSize); } auto builtInType = EBuiltInOps::FillBuffer; if (forceStateless(svmData->size)) { builtInType = EBuiltInOps::FillBufferStateless; } auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(builtInType, this->getContext(), this->getDevice()); BuiltInOwnershipWrapper builtInLock(builder, this->context); BuiltinOpParams operationParams; MemObj patternMemObj(this->context, 0, {}, 0, 0, alignUp(patternSize, 4), patternAllocation->getUnderlyingBuffer(), patternAllocation->getUnderlyingBuffer(), patternAllocation, false, false, true); void *alignedDstPtr = alignDown(svmPtr, 4); size_t dstPtrOffset = ptrDiff(svmPtr, alignedDstPtr); operationParams.srcMemObj = &patternMemObj; operationParams.dstPtr = alignedDstPtr; operationParams.dstSvmAlloc = svmData->gpuAllocation; operationParams.dstOffset = {dstPtrOffset, 0, 0}; operationParams.size = {size, 0, 0}; MultiDispatchInfo dispatchInfo; builder.buildDispatchInfos(dispatchInfo, operationParams); GeneralSurface s1(svmData->gpuAllocation); GeneralSurface s2(patternAllocation); Surface *surfaces[] = {&s1, &s2}; enqueueHandler( surfaces, false, dispatchInfo, numEventsInWaitList, eventWaitList, event); storageWithAllocations->storeAllocationWithTaskCount(std::unique_ptr(patternAllocation), REUSABLE_ALLOCATION, taskCount); return CL_SUCCESS; } template cl_int CommandQueueHw::enqueueSVMMigrateMem(cl_uint numSvmPointers, const void **svmPointers, const size_t *sizes, const cl_mem_migration_flags flags, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { NullSurface s; Surface *surfaces[] = {&s}; enqueueHandler(surfaces, false, MultiDispatchInfo(), numEventsInWaitList, eventWaitList, event); return CL_SUCCESS; } } // namespace NEO