From 35ff2849449a9ed137f6a5c7d60fa731001a8e45 Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Tue, 23 Mar 2021 17:11:41 +0000 Subject: [PATCH] Cleanup Kernel class move deviceVector to MultiDeviceKernel class remove Device arg from Kernel's methods Related-To: NEO-5001 Signed-off-by: Mateusz Jablonski --- level_zero/core/source/kernel/kernel.h | 2 +- level_zero/core/source/kernel/kernel_imp.cpp | 2 +- level_zero/core/source/kernel/kernel_imp.h | 2 +- opencl/source/api/api.cpp | 7 +-- opencl/source/gtpin/gtpin_callbacks.cpp | 4 +- opencl/source/kernel/kernel.cpp | 58 +++++++++---------- opencl/source/kernel/kernel.h | 21 +++---- opencl/source/kernel/multi_device_kernel.cpp | 5 +- opencl/source/kernel/multi_device_kernel.h | 1 + .../sync_buffer_handler_tests.cpp | 2 +- opencl/test/unit_test/gtpin/gtpin_tests.cpp | 2 +- .../unit_test/kernel/kernel_arg_svm_tests.cpp | 4 +- opencl/test/unit_test/kernel/kernel_tests.cpp | 17 +++--- .../kernel/substitute_kernel_heap_tests.cpp | 10 ++-- opencl/test/unit_test/mocks/mock_program.h | 2 +- shared/source/program/sync_buffer_handler.inl | 2 +- 16 files changed, 63 insertions(+), 78 deletions(-) diff --git a/level_zero/core/source/kernel/kernel.h b/level_zero/core/source/kernel/kernel.h index e96c30fd19..e006c67fc2 100644 --- a/level_zero/core/source/kernel/kernel.h +++ b/level_zero/core/source/kernel/kernel.h @@ -133,7 +133,7 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI { virtual void printPrintfOutput() = 0; virtual bool usesSyncBuffer() = 0; - virtual void patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0; + virtual void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0; Kernel() = default; Kernel(const Kernel &) = delete; diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 841478cc27..a82d417375 100644 --- 
a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -778,7 +778,7 @@ bool KernelImp::usesSyncBuffer() { return this->kernelImmData->getDescriptor().kernelAttributes.flags.usesSyncBuffer; } -void KernelImp::patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) { +void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) { this->residencyContainer.push_back(gfxAllocation); NEO::patchPointer(ArrayRef(crossThreadData.get(), crossThreadDataSize), this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.syncBufferAddress, diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index 0a71e3ea8a..721dd7bd9e 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -81,7 +81,7 @@ struct KernelImp : Kernel { void printPrintfOutput() override; bool usesSyncBuffer() override; - void patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override; + void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override; const uint8_t *getSurfaceStateHeapData() const override { return surfaceStateHeapData.get(); } uint32_t getSurfaceStateHeapDataSize() const override { return surfaceStateHeapDataSize; } diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index 03190a9825..ddc92e8dce 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -1951,7 +1951,6 @@ cl_int CL_API_CALL clGetKernelWorkGroupInfo(cl_kernel kernel, if (CL_SUCCESS == retVal) { auto pKernel = pMultiDeviceKernel->getKernel(pClDevice->getRootDeviceIndex()); retVal = pKernel->getWorkGroupInfo( - *pClDevice, paramName, paramValueSize, paramValue, @@ -5312,7 +5311,7 @@ cl_int CL_API_CALL clGetKernelSubGroupInfoKHR(cl_kernel kernel, case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE: case 
CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE: case CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: - return pKernel->getSubGroupInfo(*pClDevice, paramName, + return pKernel->getSubGroupInfo(paramName, inputValueSize, inputValue, paramValueSize, paramValue, paramValueSizeRet); @@ -5418,7 +5417,7 @@ cl_int CL_API_CALL clGetKernelSubGroupInfo(cl_kernel kernel, } auto pKernel = pMultiDeviceKernel->getKernel(pClDevice->getRootDeviceIndex()); - retVal = pKernel->getSubGroupInfo(*pClDevice, paramName, + retVal = pKernel->getSubGroupInfo(paramName, inputValueSize, inputValue, paramValueSize, paramValue, paramValueSizeRet); @@ -5796,7 +5795,7 @@ cl_int CL_API_CALL clGetKernelSuggestedLocalWorkSizeINTEL(cl_command_queue comma return retVal; } - pKernel->getSuggestedLocalWorkSize(workDim, globalWorkSize, globalWorkOffset, suggestedLocalWorkSize, pCommandQueue->getClDevice()); + pKernel->getSuggestedLocalWorkSize(workDim, globalWorkSize, globalWorkOffset, suggestedLocalWorkSize); return retVal; } diff --git a/opencl/source/gtpin/gtpin_callbacks.cpp b/opencl/source/gtpin/gtpin_callbacks.cpp index e20d17b481..a1107268bc 100644 --- a/opencl/source/gtpin/gtpin_callbacks.cpp +++ b/opencl/source/gtpin/gtpin_callbacks.cpp @@ -65,7 +65,7 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) { if (isGTPinInitialized) { auto pMultiDeviceKernel = castToObjectOrAbort(kernel); auto pKernel = pMultiDeviceKernel->getDefaultKernel(); - auto &device = pKernel->getDevices()[0]->getDevice(); + auto &device = pMultiDeviceKernel->getDevices()[0]->getDevice(); size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(); // Enlarge local copy of SSH by 1 SS GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily; @@ -98,7 +98,7 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) { instrument_params_out_t paramsOut = {0}; (*GTPinCallbacks.onKernelCreate)((context_handle_t)(cl_context)context, &paramsIn, &paramsOut); // Substitute ISA of created kernel with instrumented code - 
pKernel->substituteKernelHeap(device, paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size); + pKernel->substituteKernelHeap(paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size); pKernel->setKernelId(paramsOut.kernel_id); } } diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 6582abb21e..be0854fb6d 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -70,9 +70,7 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &c executionEnvironment(programArg->getExecutionEnvironment()), program(programArg), clDevice(clDeviceArg), - deviceVector(programArg->getDevices()), - kernelInfo(kernelInfoArg), - defaultRootDeviceIndex(clDeviceArg.getRootDeviceIndex()) { + kernelInfo(kernelInfoArg) { program->retain(); program->retainForKernel(); imageTransformer.reset(new ImageTransformer); @@ -131,7 +129,7 @@ inline void patch(const SrcT &src, void *dst, uint32_t dstOffsetBytes) { *patchLocation = static_cast(src); } -void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const ArgDescPointer &arg) { +void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg) { if ((nullptr != crossThreadData) && isValidOffset(arg.stateless)) { auto pp = ptrOffset(crossThreadData, arg.stateless); uintptr_t addressToPatch = reinterpret_cast(ptrToPatchInCrossThreadData); @@ -147,13 +145,13 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic auto surfaceState = ptrOffset(ssh, arg.bindful); void *addressToPatch = reinterpret_cast(allocation.getGpuAddressToPatch()); size_t sizeToPatch = allocation.getUnderlyingBufferSize(); - Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0, + Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, 
addressToPatch, 0, &allocation, 0, 0, kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, getTotalNumDevicesInContext()); } } template -void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch) { +void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const PatchTokenT &patch) { uint32_t pointerSize = patch.DataParamSize; if (crossThreadData != nullptr) { @@ -173,16 +171,16 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic auto surfaceState = ptrOffset(ssh, sshOffset); void *addressToPatch = reinterpret_cast(allocation.getGpuAddressToPatch()); size_t sizeToPatch = allocation.getUnderlyingBufferSize(); - Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0, + Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0, kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, getTotalNumDevicesInContext()); } } -template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &patch); +template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &patch); -template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessPrivateSurface &patch); +template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessPrivateSurface &patch); -template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, 
GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &patch); +template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &patch); cl_int Kernel::initialize() { this->kernelHasIndirectAccess = false; @@ -326,14 +324,14 @@ cl_int Kernel::initialize() { return CL_OUT_OF_RESOURCES; } const auto &patch = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress; - patchWithImplicitSurface(reinterpret_cast(privateSurface->getGpuAddressToPatch()), *privateSurface, pClDevice->getDevice(), patch); + patchWithImplicitSurface(reinterpret_cast(privateSurface->getGpuAddressToPatch()), *privateSurface, patch); } if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) { DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr); uintptr_t constMemory = isBuiltIn ? (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch(); const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress; - patchWithImplicitSurface(reinterpret_cast(constMemory), *program->getConstantSurface(rootDeviceIndex), pClDevice->getDevice(), arg); + patchWithImplicitSurface(reinterpret_cast(constMemory), *program->getConstantSurface(rootDeviceIndex), arg); } if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) { @@ -341,7 +339,7 @@ cl_int Kernel::initialize() { uintptr_t globalMemory = isBuiltIn ? 
(uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch(); const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress; - patchWithImplicitSurface(reinterpret_cast(globalMemory), *program->getGlobalSurface(rootDeviceIndex), pClDevice->getDevice(), arg); + patchWithImplicitSurface(reinterpret_cast(globalMemory), *program->getGlobalSurface(rootDeviceIndex), arg); } bool useGlobalAtomics = kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics; @@ -392,7 +390,6 @@ cl_int Kernel::initialize() { // double check this assumption bool usingBuffers = false; bool usingImages = false; - auto &defaultKernelInfo = kernelInfo; kernelArguments.resize(numArgs); kernelArgHandlers.resize(numArgs); kernelArgRequiresCacheFlush.resize(numArgs); @@ -401,7 +398,7 @@ cl_int Kernel::initialize() { storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0); // set the argument handler - auto &argInfo = defaultKernelInfo.kernelArgInfo[i]; + auto &argInfo = kernelInfo.kernelArgInfo[i]; if (argInfo.metadata.addressQualifier == KernelArgMetadata::AddrLocal) { kernelArgHandlers[i] = &Kernel::setArgLocal; } else if (argInfo.isAccelerator) { @@ -551,9 +548,8 @@ cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t cl_int retVal; const void *pSrc = nullptr; size_t srcSize = GetInfo::invalidSourceSize; - auto &defaultKernelInfo = kernelInfo; - auto numArgs = static_cast(defaultKernelInfo.kernelArgInfo.size()); - const auto &argInfo = defaultKernelInfo.kernelArgInfo[argIndx]; + auto numArgs = static_cast(kernelInfo.kernelArgInfo.size()); + const auto &argInfo = kernelInfo.kernelArgInfo[argIndx]; if (argIndx >= numArgs) { retVal = CL_INVALID_ARG_INDEX; @@ -604,7 +600,7 @@ cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t return retVal; } -cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info 
paramName, +cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { cl_int retVal = CL_INVALID_VALUE; @@ -619,7 +615,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para cl_ulong scratchSize; cl_ulong privateMemSize; size_t maxWorkgroupSize; - const auto &hwInfo = device.getHardwareInfo(); + const auto &hwInfo = clDevice.getHardwareInfo(); auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily); GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet); @@ -680,7 +676,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para return retVal; } -cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info paramName, +cl_int Kernel::getSubGroupInfo(cl_kernel_sub_group_info paramName, size_t inputValueSize, const void *inputValue, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { @@ -791,7 +787,7 @@ size_t Kernel::getKernelHeapSize() const { return kernelInfo.heapInfo.KernelHeapSize; } -void Kernel::substituteKernelHeap(const Device &device, void *newKernelHeap, size_t newKernelHeapSize) { +void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) { KernelInfo *pKernelInfo = const_cast(&kernelInfo); void **pKernelHeap = const_cast(&pKernelInfo->heapInfo.pKernelHeap); *pKernelHeap = newKernelHeap; @@ -807,7 +803,7 @@ void Kernel::substituteKernelHeap(const Device &device, void *newKernelHeap, siz } else { memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->kernelAllocation); pKernelInfo->kernelAllocation = nullptr; - status = pKernelInfo->createKernelAllocation(device, isBuiltIn); + status = pKernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn); } UNRECOVERABLE_IF(!status); } @@ -864,9 +860,8 @@ cl_int Kernel::setArg(uint32_t argIndex, 
size_t argSize, const void *argVal) { cl_int retVal = CL_SUCCESS; bool updateExposedKernel = true; auto argWasUncacheable = false; - auto &defaultKernelInfo = kernelInfo; - if (defaultKernelInfo.builtinDispatchBuilder != nullptr) { - updateExposedKernel = defaultKernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal); + if (kernelInfo.builtinDispatchBuilder != nullptr) { + updateExposedKernel = kernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal); } if (updateExposedKernel) { if (argIndex >= kernelArgHandlers.size()) { @@ -1068,7 +1063,7 @@ cl_int Kernel::setKernelExecutionType(cl_execution_info_kernel_type_intel execut } void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset, - size_t *localWorkSize, ClDevice &clDevice) { + size_t *localWorkSize) { UNRECOVERABLE_IF((workDim == 0) || (workDim > 3)); UNRECOVERABLE_IF(globalWorkSize == nullptr); Vec3 elws{0, 0, 0}; @@ -2431,7 +2426,7 @@ bool Kernel::usesSyncBuffer() { return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesSyncBuffer; } -void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) { +void Kernel::patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset) { const auto &syncBuffer = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.syncBufferAddress; auto bufferPatchAddress = ptrOffset(crossThreadData, syncBuffer.stateless); patchWithRequiredSize(bufferPatchAddress, syncBuffer.pointerSize, @@ -2441,7 +2436,7 @@ void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap()), syncBuffer.bindful); auto addressToPatch = gfxAllocation->getUnderlyingBuffer(); auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize(); - Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 
0, + Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0, kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, getTotalNumDevicesInContext()); } } @@ -2454,13 +2449,12 @@ bool Kernel::isPatched() const { cl_int Kernel::checkCorrectImageAccessQualifier(cl_uint argIndex, size_t argSize, const void *argValue) const { - auto &defaultKernelInfo = kernelInfo; - if (defaultKernelInfo.kernelArgInfo[argIndex].isImage) { + if (kernelInfo.kernelArgInfo[argIndex].isImage) { cl_mem mem = *(static_cast(argValue)); MemObj *pMemObj = nullptr; WithCastToInternal(mem, &pMemObj); if (pMemObj) { - auto accessQualifier = defaultKernelInfo.kernelArgInfo[argIndex].metadata.accessQualifier; + auto accessQualifier = kernelInfo.kernelArgInfo[argIndex].metadata.accessQualifier; cl_mem_flags flags = pMemObj->getFlags(); if ((accessQualifier == KernelArgMetadata::AccessReadOnly && ((flags | CL_MEM_WRITE_ONLY) == flags)) || (accessQualifier == KernelArgMetadata::AccessWriteOnly && ((flags | CL_MEM_READ_ONLY) == flags))) { diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h index 1186d49e4f..0381ccc680 100644 --- a/opencl/source/kernel/kernel.h +++ b/opencl/source/kernel/kernel.h @@ -41,8 +41,6 @@ class Surface; class PrintfHandler; class MultiDeviceKernel; -using KernelInfoContainer = StackVec; - class Kernel : public ReferenceTrackedObject { public: static const uint32_t kernelBinaryAlignement = 64; @@ -157,10 +155,10 @@ class Kernel : public ReferenceTrackedObject { cl_int getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const; - cl_int getWorkGroupInfo(ClDevice &clDevice, cl_kernel_work_group_info paramName, + cl_int getWorkGroupInfo(cl_kernel_work_group_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const; - cl_int getSubGroupInfo(ClDevice &device, 
cl_kernel_sub_group_info paramName, + cl_int getSubGroupInfo(cl_kernel_sub_group_info paramName, size_t inputValueSize, const void *inputValue, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const; @@ -179,7 +177,7 @@ class Kernel : public ReferenceTrackedObject { void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset); - void substituteKernelHeap(const Device &device, void *newKernelHeap, size_t newKernelHeapSize); + void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize); bool isKernelHeapSubstituted() const; uint64_t getKernelId() const; void setKernelId(uint64_t newKernelId); @@ -224,7 +222,7 @@ class Kernel : public ReferenceTrackedObject { void patchEventPool(DeviceQueue *devQueue); void patchBlocksSimdSize(); bool usesSyncBuffer(); - void patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset); + void patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset); void patchBindlessSurfaceStateOffsets(const Device &device, const size_t sshOffset); GraphicsAllocation *getKernelReflectionSurface() const { @@ -368,7 +366,7 @@ class Kernel : public ReferenceTrackedObject { this->threadArbitrationPolicy = policy; } void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset, - size_t *localWorkSize, ClDevice &clDevice); + size_t *localWorkSize); uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const; uint64_t getKernelStartOffset( @@ -383,9 +381,6 @@ class Kernel : public ReferenceTrackedObject { void setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo); uint32_t getAdditionalKernelExecInfo() const; MOCKABLE_VIRTUAL bool requiresWaDisableRccRhwoOptimization() const; - const ClDeviceVector &getDevices() const { - return program->getDevices(); - } void setGlobalWorkOffsetValues(uint32_t 
globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ); void setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ); @@ -479,10 +474,10 @@ class Kernel : public ReferenceTrackedObject { void *patchBufferOffset(const KernelArgInfo &argInfo, void *svmPtr, GraphicsAllocation *svmAlloc); - void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const ArgDescPointer &arg); + void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg); // Sets-up both crossThreadData and ssh for given implicit (private/constant, etc.) allocation template - void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch); + void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const PatchTokenT &patch); void getParentObjectCounts(ObjectCounts &objectCount); Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice, bool schedulerKernel = false); @@ -508,7 +503,6 @@ class Kernel : public ReferenceTrackedObject { const ExecutionEnvironment &executionEnvironment; Program *program; ClDevice &clDevice; - const ClDeviceVector &deviceVector; const KernelInfo &kernelInfo; std::vector kernelArguments; @@ -585,7 +579,6 @@ class Kernel : public ReferenceTrackedObject { GraphicsAllocation *privateSurface = nullptr; uint64_t privateSurfaceSize = 0u; - const uint32_t defaultRootDeviceIndex; struct KernelConfig { Vec3 gws; diff --git a/opencl/source/kernel/multi_device_kernel.cpp b/opencl/source/kernel/multi_device_kernel.cpp index af60e4a695..0a88225a48 100644 --- a/opencl/source/kernel/multi_device_kernel.cpp +++ b/opencl/source/kernel/multi_device_kernel.cpp @@ -19,9 +19,10 @@ MultiDeviceKernel::~MultiDeviceKernel() { Kernel 
*MultiDeviceKernel::determineDefaultKernel(KernelVectorType &kernelVector) { for (auto &pKernel : kernelVector) { if (pKernel) { - return kernelVector[(*pKernel->getDevices().begin())->getRootDeviceIndex()]; + return pKernel; } } + UNRECOVERABLE_IF(true); return nullptr; } MultiDeviceKernel::MultiDeviceKernel(KernelVectorType kernelVector, const KernelInfoContainer kernelInfosArg) : kernels(std::move(kernelVector)), @@ -39,7 +40,7 @@ MultiDeviceKernel::MultiDeviceKernel(KernelVectorType kernelVector, const Kernel const std::vector &MultiDeviceKernel::getKernelArguments() const { return defaultKernel->getKernelArguments(); } cl_int MultiDeviceKernel::getInfo(cl_kernel_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { return defaultKernel->getInfo(paramName, paramValueSize, paramValue, paramValueSizeRet); } cl_int MultiDeviceKernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { return defaultKernel->getArgInfo(argIndx, paramName, paramValueSize, paramValue, paramValueSizeRet); } -const ClDeviceVector &MultiDeviceKernel::getDevices() const { return defaultKernel->getDevices(); } +const ClDeviceVector &MultiDeviceKernel::getDevices() const { return program->getDevices(); } size_t MultiDeviceKernel::getKernelArgsNumber() const { return defaultKernel->getKernelArgsNumber(); } Context &MultiDeviceKernel::getContext() const { return defaultKernel->getContext(); } bool MultiDeviceKernel::getHasIndirectAccess() const { return defaultKernel->getHasIndirectAccess(); } diff --git a/opencl/source/kernel/multi_device_kernel.h b/opencl/source/kernel/multi_device_kernel.h index f2469abc60..8bf6ab9d23 100644 --- a/opencl/source/kernel/multi_device_kernel.h +++ b/opencl/source/kernel/multi_device_kernel.h @@ -15,6 +15,7 @@ struct OpenCLObjectMapper<_cl_kernel> { }; using KernelVectorType = StackVec; +using KernelInfoContainer = StackVec; class 
MultiDeviceKernel : public BaseObject<_cl_kernel> { public: diff --git a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp index e2796738e6..0de8b5f645 100644 --- a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp +++ b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp @@ -190,7 +190,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBuffer auto surfaceAddress = surfaceState->getSurfaceBaseAddress(); EXPECT_NE(bufferAddress, surfaceAddress); - kernel->patchSyncBuffer(commandQueue->getDevice(), syncBufferHandler->graphicsAllocation, syncBufferHandler->usedBufferSize); + kernel->patchSyncBuffer(syncBufferHandler->graphicsAllocation, syncBufferHandler->usedBufferSize); surfaceAddress = surfaceState->getSurfaceBaseAddress(); EXPECT_EQ(bufferAddress, surfaceAddress); } diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp index 2a82096169..94a986505d 100644 --- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp +++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp @@ -2344,7 +2344,7 @@ TEST_F(GTPinTests, givenKernelThenVerifyThatKernelCodeSubstitutionWorksWell) { // Substitute new kernel code constexpr size_t newCodeSize = 64; uint8_t newCode[newCodeSize] = {0x0, 0x1, 0x2, 0x3, 0x4}; - pKernel->substituteKernelHeap(pDevice->getDevice(), &newCode[0], newCodeSize); + pKernel->substituteKernelHeap(&newCode[0], newCodeSize); // Verify that substitution went properly isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted(); diff --git a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp index 2b0f279bbb..445c5f4e81 100644 --- a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp @@ -259,7 +259,7 @@ HWTEST_F(KernelArgSvmTest, 
WhenPatchingWithImplicitSurfaceThenPatchIsApplied) { RENDER_SURFACE_STATE *surfState = reinterpret_cast(pKernel->getSurfaceStateHeap()); memset(surfState, 0, rendSurfSize); - pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, *pDevice, patch); + pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch); // verify cross thread data was properly patched EXPECT_EQ(ptrToPatch, *reinterpret_cast(pKernel->getCrossThreadData())); @@ -280,7 +280,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) { // when cross thread and ssh data is not available then should not do anything pKernel->setCrossThreadData(nullptr, 0); pKernel->setSshLocal(nullptr, 0); - pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, *pDevice, patch); + pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch); } } diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index ef0aa46980..d1fae7c68e 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -285,7 +285,6 @@ TEST_F(KernelTests, GivenKernelWorkGroupSizeWhenGettingWorkGroupInfoThenWorkGrou pKernel->maxKernelWorkGroupSize = static_cast(kernelMaxWorkGroupSize); retVal = pKernel->getWorkGroupInfo( - *pClDevice, paramName, paramValueSize, &paramValue, @@ -303,7 +302,6 @@ TEST_F(KernelTests, GivenKernelCompileWorkGroupSizeWhenGettingWorkGroupInfoThenC size_t paramValueSizeRet = 0; retVal = pKernel->getWorkGroupInfo( - *pClDevice, paramName, paramValueSize, &paramValue, @@ -317,7 +315,6 @@ TEST_F(KernelTests, GivenInvalidParamNameWhenGettingWorkGroupInfoThenInvalidValu size_t paramValueSizeRet = 0x1234u; retVal = pKernel->getWorkGroupInfo( - *pClDevice, 0, 0, nullptr, @@ -2818,15 +2815,15 @@ HWTEST_F(KernelTest, givenKernelWhenDebugFlagToUseMaxSimdForCalculationsIsUsedTh size_t maxKernelWkgSize; kernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 32; - 
kernel.mockKernel->getWorkGroupInfo(*device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr); + kernel.mockKernel->getWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr); EXPECT_EQ(1024u, maxKernelWkgSize); kernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 16; - kernel.mockKernel->getWorkGroupInfo(*device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr); + kernel.mockKernel->getWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr); EXPECT_EQ(512u, maxKernelWkgSize); kernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 8; - kernel.mockKernel->getWorkGroupInfo(*device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr); + kernel.mockKernel->getWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr); EXPECT_EQ(256u, maxKernelWkgSize); } @@ -3166,7 +3163,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionEnabledWhenPatchWithImplicitS SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization patchToken{}; uint64_t crossThreadData = 0; EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size()); - kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, device->getDevice(), patchToken); + kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, patchToken); EXPECT_EQ(1u, kernel.mockKernel->getPatchInfoDataList().size()); } @@ -3176,7 +3173,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollecitonEnabledAndArgumentWithInvalid MockGraphicsAllocation mockAllocation; ArgDescPointer arg; uint64_t ptr = 0; - kernel.mockKernel->patchWithImplicitSurface(&ptr, mockAllocation, device->getDevice(), arg); + kernel.mockKernel->patchWithImplicitSurface(&ptr, mockAllocation, arg); EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size()); } @@ -3191,7 +3188,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionEnabledAndValidArgumentWhenPa 
arg.stateless = 0; uint64_t crossThreadData = 0; EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size()); - kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, device->getDevice(), arg); + kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, arg); EXPECT_EQ(1u, kernel.mockKernel->getPatchInfoDataList().size()); } @@ -3202,7 +3199,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionDisabledWhenPatchWithImplicit SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization patchToken{}; uint64_t crossThreadData = 0; EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size()); - kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, device->getDevice(), patchToken); + kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, patchToken); EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size()); } diff --git a/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp b/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp index 510dd1024e..d38ceb70d3 100644 --- a/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp +++ b/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -34,7 +34,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithGreaterSizeT const size_t newHeapSize = initialHeapSize + 1; char newHeap[newHeapSize]; - kernel.mockKernel->substituteKernelHeap(*pDevice, newHeap, newHeapSize); + kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize); auto secondAllocation = kernel.kernelInfo.kernelAllocation; EXPECT_NE(nullptr, secondAllocation); auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize(); @@ -64,7 +64,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithSameSizeThen const size_t newHeapSize = 
initialHeapSize; char newHeap[newHeapSize]; - kernel.mockKernel->substituteKernelHeap(*pDevice, newHeap, newHeapSize); + kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize); auto secondAllocation = kernel.kernelInfo.kernelAllocation; EXPECT_NE(nullptr, secondAllocation); auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize(); @@ -93,7 +93,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithSmallerSizeT const size_t newHeapSize = initialHeapSize - 1; char newHeap[newHeapSize]; - kernel.mockKernel->substituteKernelHeap(*pDevice, newHeap, newHeapSize); + kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize); auto secondAllocation = kernel.kernelInfo.kernelAllocation; EXPECT_NE(nullptr, secondAllocation); auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize(); @@ -125,7 +125,7 @@ TEST_F(KernelSubstituteTest, givenKernelWithUsedKernelAllocationWhenSubstituteKe EXPECT_TRUE(commandStreamReceiver.getTemporaryAllocations().peekIsEmpty()); - kernel.mockKernel->substituteKernelHeap(*pDevice, newHeap, newHeapSize); + kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize); auto secondAllocation = kernel.kernelInfo.kernelAllocation; EXPECT_FALSE(commandStreamReceiver.getTemporaryAllocations().peekIsEmpty()); diff --git a/opencl/test/unit_test/mocks/mock_program.h b/opencl/test/unit_test/mocks/mock_program.h index 558e9e3be4..a970cd1576 100644 --- a/opencl/test/unit_test/mocks/mock_program.h +++ b/opencl/test/unit_test/mocks/mock_program.h @@ -11,7 +11,7 @@ #include "shared/source/helpers/string.h" #include "opencl/source/cl_device/cl_device.h" -#include "opencl/source/kernel/kernel.h" +#include "opencl/source/kernel/multi_device_kernel.h" #include "opencl/source/program/kernel_info.h" #include "opencl/source/program/program.h" diff --git a/shared/source/program/sync_buffer_handler.inl b/shared/source/program/sync_buffer_handler.inl index 1e7d5d7dba..351b6b40d9 100644 --- 
a/shared/source/program/sync_buffer_handler.inl +++ b/shared/source/program/sync_buffer_handler.inl @@ -19,7 +19,7 @@ void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT & usedBufferSize = 0; } - kernel.patchSyncBuffer(device, graphicsAllocation, usedBufferSize); + kernel.patchSyncBuffer(graphicsAllocation, usedBufferSize); usedBufferSize += requiredSize; }