From 350ec9f16b523bcd56e05550c80cdd571640ec80 Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Tue, 8 Dec 2020 12:11:40 +0000 Subject: [PATCH] Pass root device index to get proper kernel info Related-To: NEO-5001 Signed-off-by: Mateusz Jablonski --- opencl/source/api/api.cpp | 10 +- opencl/source/command_queue/command_queue.cpp | 2 +- opencl/source/command_queue/enqueue_common.h | 10 +- opencl/source/context/context.cpp | 2 +- .../device_queue/device_queue_hw_bdw_plus.inl | 4 +- opencl/source/gtpin/gtpin_callbacks.cpp | 17 +- opencl/source/helpers/dispatch_info.cpp | 4 +- .../helpers/hardware_commands_helper_base.inl | 6 +- .../kernel/get_additional_kernel_info.cpp | 2 +- opencl/source/kernel/kernel.cpp | 151 ++++++++++-------- opencl/source/kernel/kernel.h | 54 +++---- opencl/source/program/kernel_info.cpp | 9 +- opencl/source/program/kernel_info.h | 2 +- opencl/source/program/printf_handler.cpp | 5 +- opencl/source/scheduler/scheduler_kernel.h | 5 +- .../api/cl_enqueue_nd_range_kernel_tests.inl | 2 +- .../cl_get_kernel_work_group_info_tests.inl | 2 +- .../enqueue_kernel_aub_tests.cpp | 8 +- .../enqueue_copy_buffer_rect_tests.cpp | 2 +- .../enqueue_copy_buffer_tests.cpp | 8 +- .../enqueue_copy_buffer_to_image_tests.cpp | 2 +- .../enqueue_copy_image_tests.cpp | 2 +- .../enqueue_copy_image_to_buffer_tests.cpp | 2 +- .../enqueue_fill_buffer_tests.cpp | 8 +- .../enqueue_fill_image_tests.cpp | 2 +- .../command_queue/enqueue_kernel_1_tests.cpp | 4 +- .../enqueue_read_buffer_rect_tests.cpp | 2 +- .../enqueue_read_buffer_tests.cpp | 4 +- .../enqueue_read_image_tests.cpp | 2 +- .../enqueue_write_buffer_rect_tests.cpp | 2 +- .../enqueue_write_buffer_tests.cpp | 4 +- .../enqueue_write_image_tests.cpp | 2 +- .../context/driver_diagnostics_tests.cpp | 2 +- .../device_queue/device_queue_hw_tests.cpp | 4 +- .../parent_kernel_dispatch_tests.cpp | 2 +- opencl/test/unit_test/gtpin/gtpin_tests.cpp | 24 +-- .../test/unit_test/helpers/unit_test_helper.h | 2 +- .../unit_test/helpers/unit_test_helper.inl | 2 +- .../unit_test/kernel/debug_kernel_tests.cpp | 8 +- opencl/test/unit_test/kernel/kernel_tests.cpp | 14 +- .../kernel/kernel_transformable_tests.cpp | 25 +-- .../kernel/substitute_kernel_heap_tests.cpp | 8 +- .../sampler/sampler_set_arg_tests.cpp | 10 +- shared/test/unit_test/cmd_parse/hw_parse.h | 2 +- shared/test/unit_test/cmd_parse/hw_parse.inl | 4 +- .../test/unit_test/gen11/cmd_parse_gen11.cpp | 2 +- .../unit_test/gen12lp/cmd_parse_gen12lp.cpp | 2 +- shared/test/unit_test/gen8/cmd_parse_gen8.cpp | 2 +- shared/test/unit_test/gen9/cmd_parse_gen9.cpp | 2 +- 49 files changed, 239 insertions(+), 217 deletions(-) diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index 7fae08722d..8d193cf980 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -3410,7 +3410,7 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue, } if ((pKernel->getExecutionType() != KernelExecutionType::Default) || - pKernel->isUsingSyncBuffer()) { + pKernel->isUsingSyncBuffer(pCommandQueue->getDevice().getRootDeviceIndex())) { retVal = CL_INVALID_KERNEL; TRACING_EXIT(clEnqueueNDRangeKernel, &retVal); return retVal; @@ -5818,7 +5818,9 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue, return retVal; } - auto &hardwareInfo = pKernel->getDevices()[0]->getHardwareInfo(); + auto &device = pCommandQueue->getClDevice(); + auto rootDeviceIndex = device.getRootDeviceIndex(); + auto &hardwareInfo = device.getHardwareInfo(); auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); if (!hwHelper.isCooperativeDispatchSupported(pCommandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) { retVal = CL_INVALID_COMMAND_QUEUE; @@ -5842,13 +5844,13 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue, } } - if (pKernel->isUsingSyncBuffer()) { + if (pKernel->isUsingSyncBuffer(rootDeviceIndex)) { if (pKernel->getExecutionType() != KernelExecutionType::Concurrent) { retVal = CL_INVALID_KERNEL; return retVal; } - pCommandQueue->getDevice().getSpecializedDevice()->allocateSyncBufferHandler(); + device.allocateSyncBufferHandler(); } if (!pCommandQueue->validateCapabilityForOperation(CL_QUEUE_CAPABILITY_KERNEL_INTEL, eventWaitList, event)) { diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index a69df40d47..395aea644c 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -529,7 +529,7 @@ bool CommandQueue::setupDebugSurface(Kernel *kernel) { auto debugSurface = getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation(); auto rootDeviceIndex = device->getRootDeviceIndex(); - DEBUG_BREAK_IF(!kernel->requiresSshForBuffers()); + DEBUG_BREAK_IF(!kernel->requiresSshForBuffers(rootDeviceIndex)); auto surfaceState = ptrOffset(reinterpret_cast(kernel->getSurfaceStateHeap(rootDeviceIndex)), kernel->getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface->Offset); void *addressToPatch = reinterpret_cast(debugSurface->getGpuAddress()); diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index e2b0d0689b..80e12167f5 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -64,16 +64,16 @@ void CommandQueueHw::enqueueHandler(Surface *(&surfaces)[surfaceCount if (DebugManager.flags.ForceDispatchScheduler.get()) { forceDispatchScheduler(multiDispatchInfo); } else { + auto rootDeviceIndex = device->getRootDeviceIndex(); if (kernel->isAuxTranslationRequired()) { auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getClDevice()); builtInLock.takeOwnership(builder); - kernel->fillWithBuffersForAuxTranslation(memObjsForAuxTranslation); + kernel->fillWithBuffersForAuxTranslation(memObjsForAuxTranslation, rootDeviceIndex); multiDispatchInfo.setMemObjsForAuxTranslation(memObjsForAuxTranslation); if (!memObjsForAuxTranslation.empty()) { dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::AuxToNonAux); } } - auto rootDeviceIndex = device->getRootDeviceIndex(); if (kernel->getKernelInfo(rootDeviceIndex).builtinDispatchBuilder == nullptr) { DispatchInfoBuilder builder(getClDevice()); builder.setDispatchGeometry(workDim, workItems, enqueuedWorkSizes, globalOffsets, Vec3{0, 0, 0}, localWorkSizesIn); @@ -373,7 +373,7 @@ void CommandQueueHw::processDispatchForKernels(const MultiDispatchInf printfHandler->prepareDispatch(multiDispatchInfo); } - if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) { + if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer(device->getRootDeviceIndex())) { auto &gws = multiDispatchInfo.begin()->getGWS(); auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize(); size_t workGroupsCount = (gws.x * gws.y * gws.z) / @@ -649,7 +649,8 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( printfHandler->makeResident(getGpgpuCommandStreamReceiver()); } - if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) { + auto rootDeviceIndex = device->getRootDeviceIndex(); + if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer(rootDeviceIndex)) { device->syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver()); } @@ -674,7 +675,6 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( auto specialPipelineSelectMode = false; Kernel *kernel = nullptr; bool usePerDssBackedBuffer = false; - auto rootDeviceIndex = device->getRootDeviceIndex(); for (auto &dispatchInfo : multiDispatchInfo) { if (kernel != dispatchInfo.getKernel()) { diff --git a/opencl/source/context/context.cpp b/opencl/source/context/context.cpp index b058cb7e1b..464ecef245 100644 --- a/opencl/source/context/context.cpp +++ b/opencl/source/context/context.cpp @@ -412,7 +412,7 @@ SchedulerKernel &Context::getSchedulerKernel() { kernelInfos, &retVal); - UNRECOVERABLE_IF(schedulerBuiltIn->pKernel->getScratchSize() != 0); + UNRECOVERABLE_IF(schedulerBuiltIn->pKernel->getScratchSize(clDevice->getRootDeviceIndex()) != 0); DEBUG_BREAK_IF(retVal != CL_SUCCESS); }; diff --git a/opencl/source/device_queue/device_queue_hw_bdw_plus.inl b/opencl/source/device_queue/device_queue_hw_bdw_plus.inl index 20a97984fd..773a642237 100644 --- a/opencl/source/device_queue/device_queue_hw_bdw_plus.inl +++ b/opencl/source/device_queue/device_queue_hw_bdw_plus.inl @@ -151,13 +151,13 @@ template void DeviceQueueHw::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) { using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; void *pDSH = dynamicStateHeap.getCpuBase(); - + auto rootDeviceIndex = device->getRootDeviceIndex(); // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries. auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer()); igilCmdQueue->m_controls.m_IDTstart = colorCalcStateSize + sizeof(INTERFACE_DESCRIPTOR_DATA) * (interfaceDescriptorEntries - 2); // Parent's dsh is located after ColorCalcState and 2 ID tables - igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp((uint32_t)parentKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); + igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp(static_cast(parentKernel->getDynamicStateHeapSize(rootDeviceIndex)), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); igilCmdQueue->m_controls.m_DynamicHeapSizeInBytes = (uint32_t)dshBuffer->getUnderlyingBufferSize(); igilCmdQueue->m_controls.m_CurrentDSHoffset = igilCmdQueue->m_controls.m_DynamicHeapStart; diff --git a/opencl/source/gtpin/gtpin_callbacks.cpp b/opencl/source/gtpin/gtpin_callbacks.cpp index 4f44b0279f..5dc1055969 100644 --- a/opencl/source/gtpin/gtpin_callbacks.cpp +++ b/opencl/source/gtpin/gtpin_callbacks.cpp @@ -73,7 +73,7 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) { // Kernel with no SSH or Kernel EM, not supported return; } - if (pKernel->isKernelHeapSubstituted()) { + if (pKernel->isKernelHeapSubstituted(rootDeviceIndex)) { // ISA for this kernel was already substituted return; } @@ -85,8 +85,8 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) { paramsIn.kernel_type = GTPIN_KERNEL_TYPE_CS; paramsIn.simd = (GTPIN_SIMD_WIDTH)kernelInfo.getMaxSimdSize(); - paramsIn.orig_kernel_binary = (uint8_t *)pKernel->getKernelHeap(); - paramsIn.orig_kernel_size = static_cast(pKernel->getKernelHeapSize()); + paramsIn.orig_kernel_binary = (uint8_t *)pKernel->getKernelHeap(rootDeviceIndex); + paramsIn.orig_kernel_size = static_cast(pKernel->getKernelHeapSize(rootDeviceIndex)); paramsIn.buffer_type = GTPIN_BUFFER_BINDFULL; paramsIn.buffer_desc.BTI = static_cast(gtpinBTI); paramsIn.igc_hash_id = kernelInfo.shaderHashCode; @@ -97,8 +97,8 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) { instrument_params_out_t paramsOut = {0}; (*GTPinCallbacks.onKernelCreate)((context_handle_t)(cl_context)context, ¶msIn, ¶msOut); // Substitute ISA of created kernel with instrumented code - pKernel->substituteKernelHeap(paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size); - pKernel->setKernelId(paramsOut.kernel_id); + pKernel->substituteKernelHeap(rootDeviceIndex, paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size); + pKernel->setKernelId(rootDeviceIndex, paramsOut.kernel_id); } } @@ -106,14 +106,15 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) { if (isGTPinInitialized) { auto pCmdQ = reinterpret_cast(pCmdQueue); auto &device = pCmdQ->getDevice(); + auto rootDeviceIndex = device.getRootDeviceIndex(); auto pKernel = castToObjectOrAbort(kernel); - if (pKernel->isParentKernel || pKernel->getSurfaceStateHeapSize(device.getRootDeviceIndex()) == 0) { + if (pKernel->isParentKernel || pKernel->getSurfaceStateHeapSize(rootDeviceIndex) == 0) { // Kernel with no SSH, not supported return; } Context *pContext = &(pKernel->getContext()); cl_context context = (cl_context)pContext; - uint64_t kernelId = pKernel->getKernelId(); + uint64_t kernelId = pKernel->getKernelId(rootDeviceIndex); command_buffer_handle_t commandBuffer = (command_buffer_handle_t)((uintptr_t)(sequenceCount++)); uint32_t kernelOffset = 0; resource_handle_t resource = 0; @@ -138,7 +139,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) { GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily; GTPinHwHelper >pinHelper = GTPinHwHelper::get(genFamily); size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1; - void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, device.getRootDeviceIndex()); + void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, rootDeviceIndex); cl_mem buffer = (cl_mem)resource; auto pBuffer = castToObjectOrAbort(buffer); pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device); diff --git a/opencl/source/helpers/dispatch_info.cpp b/opencl/source/helpers/dispatch_info.cpp index 44df3ab4ce..b07271c7bb 100644 --- a/opencl/source/helpers/dispatch_info.cpp +++ b/opencl/source/helpers/dispatch_info.cpp @@ -19,11 +19,11 @@ bool DispatchInfo::usesStatelessPrintfSurface() const { } uint32_t DispatchInfo::getRequiredScratchSize() const { - return (kernel == nullptr) ? 0 : kernel->getScratchSize(); + return (kernel == nullptr) ? 0 : kernel->getScratchSize(pClDevice->getRootDeviceIndex()); } uint32_t DispatchInfo::getRequiredPrivateScratchSize() const { - return (kernel == nullptr) ? 0 : kernel->getPrivateScratchSize(); + return (kernel == nullptr) ? 0 : kernel->getPrivateScratchSize(pClDevice->getRootDeviceIndex()); } Kernel *MultiDispatchInfo::peekMainKernel() const { diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 106fde7221..45982ce1c1 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -52,7 +52,7 @@ size_t HardwareCommandsHelper::getSizeRequiredDSH( totalSize += borderColorSize + additionalSizeRequiredDsh(); - DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.getKernelInfo(rootDeviceIndex).isVmeWorkload)); + DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize(rootDeviceIndex) || kernel.getKernelInfo(rootDeviceIndex).isVmeWorkload)); return alignUp(totalSize, EncodeStates::alignInterfaceDescriptorData); } @@ -245,7 +245,9 @@ size_t HardwareCommandsHelper::sendIndirectState( uint32_t samplerCount = 0; if (patchInfo.samplerStateArray) { samplerCount = patchInfo.samplerStateArray->Count; - samplerStateOffset = EncodeStates::copySamplerState(&dsh, patchInfo.samplerStateArray->Offset, samplerCount, patchInfo.samplerStateArray->BorderColorOffset, kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper()); + samplerStateOffset = EncodeStates::copySamplerState(&dsh, patchInfo.samplerStateArray->Offset, + samplerCount, patchInfo.samplerStateArray->BorderColorOffset, + kernel.getDynamicStateHeap(rootDeviceIndex), device.getBindlessHeapsHelper()); } auto threadPayload = kernelInfo.patchInfo.threadPayload; diff --git a/opencl/source/kernel/get_additional_kernel_info.cpp b/opencl/source/kernel/get_additional_kernel_info.cpp index 7f8b732547..c6a8753e11 100644 --- a/opencl/source/kernel/get_additional_kernel_info.cpp +++ b/opencl/source/kernel/get_additional_kernel_info.cpp @@ -11,6 +11,6 @@ namespace NEO { void Kernel::getAdditionalInfo(cl_kernel_info paramName, const void *¶mValue, size_t ¶mValueSizeRet) const { } -void Kernel::getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *¶mValue, size_t ¶mValueSizeRet) const { +void Kernel::getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *¶mValue, size_t ¶mValueSizeRet, uint32_t rootDeviceIndex) const { } } // namespace NEO diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 0648cc092a..ffe84464cb 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -290,7 +290,7 @@ cl_int Kernel::initialize() { } if (patchInfo.pAllocateStatelessEventPoolSurface) { - if (requiresSshForBuffers()) { + if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap(rootDeviceIndex)), patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset); Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0); @@ -299,7 +299,7 @@ cl_int Kernel::initialize() { if (patchInfo.pAllocateStatelessDefaultDeviceQueueSurface) { - if (requiresSshForBuffers()) { + if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap(rootDeviceIndex)), patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset); Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0); @@ -442,16 +442,18 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize, const _cl_context *ctxt; cl_uint refCount = 0; uint64_t nonCannonizedGpuAddress = 0llu; + auto defaultRootDeviceIndex = getDevices()[0]->getRootDeviceIndex(); + auto &defaultKernelInfo = getKernelInfo(defaultRootDeviceIndex); switch (paramName) { case CL_KERNEL_FUNCTION_NAME: - pSrc = getDefaultKernelInfo().kernelDescriptor.kernelMetadata.kernelName.c_str(); - srcSize = getDefaultKernelInfo().kernelDescriptor.kernelMetadata.kernelName.length() + 1; + pSrc = defaultKernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(); + srcSize = defaultKernelInfo.kernelDescriptor.kernelMetadata.kernelName.length() + 1; break; case CL_KERNEL_NUM_ARGS: srcSize = sizeof(cl_uint); - numArgs = (cl_uint)getDefaultKernelInfo().kernelArgInfo.size(); + numArgs = static_cast(defaultKernelInfo.kernelArgInfo.size()); pSrc = &numArgs; break; @@ -474,16 +476,16 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize, break; case CL_KERNEL_ATTRIBUTES: - pSrc = getDefaultKernelInfo().attributes.c_str(); - srcSize = getDefaultKernelInfo().attributes.length() + 1; + pSrc = defaultKernelInfo.attributes.c_str(); + srcSize = defaultKernelInfo.attributes.length() + 1; break; case CL_KERNEL_BINARY_PROGRAM_INTEL: - pSrc = getKernelHeap(); - srcSize = getKernelHeapSize(); + pSrc = getKernelHeap(defaultRootDeviceIndex); + srcSize = getKernelHeapSize(defaultRootDeviceIndex); break; case CL_KERNEL_BINARY_GPU_ADDRESS_INTEL: - nonCannonizedGpuAddress = GmmHelper::decanonize(getDefaultKernelInfo().kernelAllocation->getGpuAddress()); + nonCannonizedGpuAddress = GmmHelper::decanonize(defaultKernelInfo.kernelAllocation->getGpuAddress()); pSrc = &nonCannonizedGpuAddress; srcSize = sizeof(nonCannonizedGpuAddress); break; @@ -504,8 +506,9 @@ cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t cl_int retVal; const void *pSrc = nullptr; size_t srcSize = GetInfo::invalidSourceSize; - auto numArgs = (cl_uint)getDefaultKernelInfo().kernelArgInfo.size(); - const auto &argInfo = getDefaultKernelInfo().kernelArgInfo[argIndx]; + auto &defaultKernelInfo = getDefaultKernelInfo(); + auto numArgs = static_cast(defaultKernelInfo.kernelArgInfo.size()); + const auto &argInfo = defaultKernelInfo.kernelArgInfo[argIndx]; if (argIndx >= numArgs) { retVal = CL_INVALID_ARG_INDEX; @@ -626,7 +629,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para pSrc = &privateMemSize; break; default: - getAdditionalWorkGroupInfo(paramName, pSrc, srcSize); + getAdditionalWorkGroupInfo(paramName, pSrc, srcSize, rootDeviceIndex); break; } @@ -742,16 +745,16 @@ cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info para } } -const void *Kernel::getKernelHeap() const { - return getDefaultKernelInfo().heapInfo.pKernelHeap; +const void *Kernel::getKernelHeap(uint32_t rootDeviceIndex) const { + return getKernelInfo(rootDeviceIndex).heapInfo.pKernelHeap; } -size_t Kernel::getKernelHeapSize() const { - return getDefaultKernelInfo().heapInfo.KernelHeapSize; +size_t Kernel::getKernelHeapSize(uint32_t rootDeviceIndex) const { + return getKernelInfo(rootDeviceIndex).heapInfo.KernelHeapSize; } -void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) { - KernelInfo *pKernelInfo = const_cast(&getDefaultKernelInfo()); +void Kernel::substituteKernelHeap(uint32_t rootDeviceIndex, void *newKernelHeap, size_t newKernelHeapSize) { + KernelInfo *pKernelInfo = const_cast(&getKernelInfo(rootDeviceIndex)); void **pKernelHeap = const_cast(&pKernelInfo->heapInfo.pKernelHeap); *pKernelHeap = newKernelHeap; auto &heapInfo = pKernelInfo->heapInfo; @@ -771,16 +774,16 @@ void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) UNRECOVERABLE_IF(!status); } -bool Kernel::isKernelHeapSubstituted() const { - return getDefaultKernelInfo().isKernelHeapSubstituted; +bool Kernel::isKernelHeapSubstituted(uint32_t rootDeviceIndex) const { + return getKernelInfo(rootDeviceIndex).isKernelHeapSubstituted; } -uint64_t Kernel::getKernelId() const { - return getDefaultKernelInfo().kernelId; +uint64_t Kernel::getKernelId(uint32_t rootDeviceIndex) const { + return getKernelInfo(rootDeviceIndex).kernelId; } -void Kernel::setKernelId(uint64_t newKernelId) { - KernelInfo *pKernelInfo = const_cast(&getDefaultKernelInfo()); +void Kernel::setKernelId(uint32_t rootDeviceIndex, uint64_t newKernelId) { + KernelInfo *pKernelInfo = const_cast(&getKernelInfo(rootDeviceIndex)); pKernelInfo->kernelId = newKernelId; } uint32_t Kernel::getStartOffset() const { @@ -794,12 +797,12 @@ void *Kernel::getSurfaceStateHeap(uint32_t rootDeviceIndex) const { return kernelInfos[rootDeviceIndex]->usesSsh ? kernelDeviceInfos[rootDeviceIndex].pSshLocal.get() : nullptr; } -size_t Kernel::getDynamicStateHeapSize() const { - return getDefaultKernelInfo().heapInfo.DynamicStateHeapSize; +size_t Kernel::getDynamicStateHeapSize(uint32_t rootDeviceIndex) const { + return getKernelInfo(rootDeviceIndex).heapInfo.DynamicStateHeapSize; } -const void *Kernel::getDynamicStateHeap() const { - return getDefaultKernelInfo().heapInfo.pDsh; +const void *Kernel::getDynamicStateHeap(uint32_t rootDeviceIndex) const { + return getKernelInfo(rootDeviceIndex).heapInfo.pDsh; } size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const { @@ -823,8 +826,9 @@ cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) { cl_int retVal = CL_SUCCESS; bool updateExposedKernel = true; auto argWasUncacheable = false; - if (getDefaultKernelInfo().builtinDispatchBuilder != nullptr) { - updateExposedKernel = getDefaultKernelInfo().builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal); + auto &defaultKernelInfo = getDefaultKernelInfo(); + if (defaultKernelInfo.builtinDispatchBuilder != nullptr) { + updateExposedKernel = defaultKernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal); } if (updateExposedKernel) { if (argIndex >= kernelArgHandlers.size()) { @@ -882,13 +886,14 @@ void *Kernel::patchBufferOffset(const KernelArgInfo &argInfo, void *svmPtr, Grap cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, GraphicsAllocation *svmAlloc, cl_mem_flags svmFlags) { auto rootDeviceIndex = getDevice().getRootDeviceIndex(); - void *ptrToPatch = patchBufferOffset(getDefaultKernelInfo().kernelArgInfo[argIndex], svmPtr, svmAlloc, rootDeviceIndex); + auto &kernelInfo = getKernelInfo(rootDeviceIndex); + void *ptrToPatch = patchBufferOffset(kernelInfo.kernelArgInfo[argIndex], svmPtr, svmAlloc, rootDeviceIndex); setArgImmediate(argIndex, sizeof(void *), &svmPtr); storeKernelArg(argIndex, SVM_OBJ, nullptr, svmPtr, sizeof(void *), svmAlloc, svmFlags); - if (requiresSshForBuffers()) { - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; + if (requiresSshForBuffers(rootDeviceIndex)) { + const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex]; auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap); Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0); } @@ -905,7 +910,8 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio DBG_LOG_INPUTS("setArgBuffer svm_alloc", svmAlloc); auto rootDeviceIndex = getDevice().getRootDeviceIndex(); - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; + auto &kernelInfo = getKernelInfo(rootDeviceIndex); + const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex]; storeKernelArg(argIndex, SVM_ALLOC_OBJ, svmAlloc, svmPtr, sizeof(uintptr_t)); @@ -918,8 +924,8 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio patchWithRequiredSize(patchLocation, patchSize, reinterpret_cast(svmPtr)); - if (requiresSshForBuffers()) { - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; + if (requiresSshForBuffers(rootDeviceIndex)) { + const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex]; auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap); size_t allocSize = 0; size_t offset = 0; @@ -1049,7 +1055,7 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local return 0; } - auto executionEnvironment = getDefaultKernelInfo().patchInfo.executionEnvironment; + auto executionEnvironment = getKernelInfo(rootDeviceIndex).patchInfo.executionEnvironment; auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount; if (dssCount == 0) { dssCount = hardwareInfo.gtSystemInfo.SubSliceCount; @@ -1227,6 +1233,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex, const void *argVal) { auto rootDeviceIndex = getDevice().getRootDeviceIndex(); auto crossThreadData = reinterpret_cast(getCrossThreadData(rootDeviceIndex)); + auto &defaultKernelInfo = getDefaultKernelInfo(); storeKernelArg(argIndex, SLM_OBJ, nullptr, argVal, argSize); @@ -1234,7 +1241,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex, // Extract our current slmOffset auto slmOffset = *ptrOffset(crossThreadData, - getDefaultKernelInfo().kernelArgInfo[argIndex].kernelArgPatchInfoVector[0].crossthreadOffset); + defaultKernelInfo.kernelArgInfo[argIndex].kernelArgPatchInfoVector[0].crossthreadOffset); // Add our size slmOffset += static_cast(argSize); @@ -1242,7 +1249,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex, // Update all slm offsets after this argIndex ++argIndex; while (argIndex < slmSizes.size()) { - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; + const auto &kernelArgInfo = defaultKernelInfo.kernelArgInfo[argIndex]; auto slmAlignment = kernelArgInfo.slmAlignment; // If an local argument, alignment should be non-zero @@ -1260,7 +1267,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex, ++argIndex; } - slmTotalSize = getDefaultKernelInfo().workloadInfo.slmStaticSize + alignUp(slmOffset, KB); + slmTotalSize = defaultKernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB); return CL_SUCCESS; } @@ -1272,9 +1279,9 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex, if (argSize != sizeof(cl_mem *)) return CL_INVALID_ARG_SIZE; - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; auto clMem = reinterpret_cast(argVal); auto rootDeviceIndex = getDevice().getRootDeviceIndex(); + const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex]; patchBufferOffset(kernelArgInfo, nullptr, nullptr, rootDeviceIndex); if (clMem && *clMem) { @@ -1323,7 +1330,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex, forceNonAuxMode = true; } - if (requiresSshForBuffers()) { + if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap); buffer->setArgStateful(surfaceState, forceNonAuxMode, disableL3, isAuxTranslationKernel, kernelArgInfo.isReadOnly, getDevice().getDevice()); } @@ -1348,7 +1355,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex, storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize); - if (requiresSshForBuffers()) { + if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap); Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0); } @@ -1365,7 +1372,6 @@ cl_int Kernel::setArgPipe(uint32_t argIndex, return CL_INVALID_ARG_SIZE; } - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; auto clMem = reinterpret_cast(argVal); if (clMem && *clMem) { @@ -1388,6 +1394,7 @@ cl_int Kernel::setArgPipe(uint32_t argIndex, return CL_INVALID_MEM_OBJECT; } auto rootDeviceIndex = getDevice().getRootDeviceIndex(); + const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex]; auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex), kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset); @@ -1397,7 +1404,7 @@ cl_int Kernel::setArgPipe(uint32_t argIndex, auto graphicsAllocation = pipe->getGraphicsAllocation(rootDeviceIndex); - if (requiresSshForBuffers()) { + if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap); Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, pipe->getSize(), pipe->getCpuAddress(), 0, @@ -1421,7 +1428,8 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex, const void *argVal, uint32_t mipLevel) { auto retVal = CL_INVALID_ARG_VALUE; auto rootDeviceIndex = getDevice().getRootDeviceIndex(); - patchBufferOffset(getDefaultKernelInfo().kernelArgInfo[argIndex], nullptr, nullptr, rootDeviceIndex); + auto &kernelInfo = getKernelInfo(rootDeviceIndex); + patchBufferOffset(kernelInfo.kernelArgInfo[argIndex], nullptr, nullptr, rootDeviceIndex); auto clMemObj = *(static_cast(argVal)); auto pImage = castToObject(clMemObj); @@ -1430,7 +1438,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex, if (pImage->peekSharingHandler()) { usingSharedObjArgs = true; } - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; + const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex]; DBG_LOG_INPUTS("setArgImage cl_mem", clMemObj); @@ -1441,7 +1449,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex, // Sets SS structure if (kernelArgInfo.isMediaImage) { - DEBUG_BREAK_IF(!getDefaultKernelInfo().isVmeWorkload); + DEBUG_BREAK_IF(!kernelInfo.isVmeWorkload); pImage->setMediaImageArg(surfaceState, rootDeviceIndex); } else { pImage->setImageArg(surfaceState, kernelArgInfo.isMediaBlockImage, mipLevel, rootDeviceIndex); @@ -1487,7 +1495,7 @@ cl_int Kernel::setArgImmediate(uint32_t argIndex, auto rootDeviceIndex = getDevice().getRootDeviceIndex(); if (argVal) { - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; + const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex]; DEBUG_BREAK_IF(kernelArgInfo.kernelArgPatchInfoVector.size() <= 0); storeKernelArg(argIndex, NONE_OBJ, nullptr, nullptr, argSize); @@ -1540,11 +1548,11 @@ cl_int Kernel::setArgSampler(uint32_t argIndex, } if (pSampler && argSize == sizeof(cl_sampler *)) { - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; + const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex]; storeKernelArg(argIndex, SAMPLER_OBJ, clSamplerObj, argVal, argSize); - auto dsh = getDynamicStateHeap(); + auto dsh = getDynamicStateHeap(rootDeviceIndex); auto samplerState = ptrOffset(dsh, kernelArgInfo.offsetHeap); pSampler->setArg(const_cast(samplerState)); @@ -1583,7 +1591,7 @@ cl_int Kernel::setArgAccelerator(uint32_t argIndex, if (pAccelerator) { storeKernelArg(argIndex, ACCELERATOR_OBJ, clAcceleratorObj, argVal, argSize); - const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex]; + const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex]; if (kernelArgInfo.samplerArgumentType == iOpenCL::SAMPLER_OBJECT_VME) { auto crossThreadData = getCrossThreadData(rootDeviceIndex); @@ -2267,7 +2275,7 @@ void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) { patchWithRequiredSize(patchLocation, patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->DataParamSize, static_cast(devQueue->getQueueBuffer()->getGpuAddressToPatch())); } - if (requiresSshForBuffers()) { + if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap(rootDeviceIndex)), patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset); Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, devQueue->getQueueBuffer()->getUnderlyingBufferSize(), @@ -2289,7 +2297,7 @@ void Kernel::patchEventPool(DeviceQueue *devQueue) { static_cast(devQueue->getEventPoolBuffer()->getGpuAddressToPatch())); } - if (requiresSshForBuffers()) { + if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap(rootDeviceIndex)), patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset); Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, devQueue->getEventPoolBuffer()->getUnderlyingBufferSize(), @@ -2311,8 +2319,8 @@ void Kernel::patchBlocksSimdSize(uint32_t rootDeviceIndex) { } } -bool Kernel::usesSyncBuffer() { - return (getDefaultKernelInfo().patchInfo.pAllocateSyncBuffer != nullptr); +bool Kernel::usesSyncBuffer(uint32_t rootDeviceIndex) { + return (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSyncBuffer != nullptr); } void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) { @@ -2322,7 +2330,7 @@ void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, patchWithRequiredSize(bufferPatchAddress, patchInfo.pAllocateSyncBuffer->DataParamSize, ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)); - if (requiresSshForBuffers()) { + if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap(rootDeviceIndex)), patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset); auto addressToPatch = gfxAllocation->getUnderlyingBuffer(); @@ -2339,12 +2347,13 @@ bool Kernel::isPatched() const { cl_int Kernel::checkCorrectImageAccessQualifier(cl_uint argIndex, size_t argSize, const void *argValue) const { - if (getDefaultKernelInfo().kernelArgInfo[argIndex].isImage) { + auto &defaultKernelInfo = getDefaultKernelInfo(); + if (defaultKernelInfo.kernelArgInfo[argIndex].isImage) { cl_mem mem = *(static_cast(argValue)); MemObj *pMemObj = nullptr; WithCastToInternal(mem, &pMemObj); if (pMemObj) { - auto accessQualifier = getDefaultKernelInfo().kernelArgInfo[argIndex].metadata.accessQualifier; + auto accessQualifier = defaultKernelInfo.kernelArgInfo[argIndex].metadata.accessQualifier; cl_mem_flags flags = pMemObj->getFlags(); if ((accessQualifier == KernelArgMetadata::AccessReadOnly && ((flags | CL_MEM_WRITE_ONLY) == flags)) || (accessQualifier == KernelArgMetadata::AccessWriteOnly && ((flags | CL_MEM_READ_ONLY) == flags))) { @@ -2372,11 +2381,16 @@ void Kernel::resolveArgs() { } } } - auto rootDeviceIndex = getDevice().getRootDeviceIndex(); - if (canTransformImageTo2dArray) { - imageTransformer->transformImagesTo2dArray(getDefaultKernelInfo(), kernelArguments, getSurfaceStateHeap(rootDeviceIndex)); - } else if (imageTransformer->didTransform()) { - imageTransformer->transformImagesTo3d(getDefaultKernelInfo(), kernelArguments, getSurfaceStateHeap(rootDeviceIndex)); + for (auto rootDeviceIndex = 0u; rootDeviceIndex < kernelInfos.size(); rootDeviceIndex++) { + auto pKernelInfo = kernelInfos[rootDeviceIndex]; + if (!pKernelInfo) { + continue; + } + if (canTransformImageTo2dArray) { + imageTransformer->transformImagesTo2dArray(*pKernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex)); + } else if (imageTransformer->didTransform()) { + imageTransformer->transformImagesTo3d(*pKernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex)); + } } } @@ -2385,10 +2399,11 @@ bool Kernel::canTransformImages() const { return renderCoreFamily >= IGFX_GEN9_CORE && renderCoreFamily <= IGFX_GEN11LP_CORE; } -void Kernel::fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation) { +void Kernel::fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation, uint32_t rootDeviceIndex) { memObjsForAuxTranslation.reserve(getKernelArgsNumber()); + auto &kernelInfo = getKernelInfo(rootDeviceIndex); for (uint32_t i = 0; i < getKernelArgsNumber(); i++) { - if (BUFFER_OBJ == kernelArguments.at(i).type && !getDefaultKernelInfo().kernelArgInfo.at(i).pureStatefulBufferAccess) { + if (BUFFER_OBJ == kernelArguments.at(i).type && !kernelInfo.kernelArgInfo.at(i).pureStatefulBufferAccess) { auto buffer = castToObject(getKernelArg(i)); if (buffer && buffer->getMultiGraphicsAllocation().getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_COMPRESSED) { memObjsForAuxTranslation.insert(buffer); @@ -2396,7 +2411,7 @@ void Kernel::fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsF auto &context = this->program->getContext(); if (context.isProvidingPerformanceHints()) { context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ARGUMENT_AUX_TRANSLATION, - getDefaultKernelInfo().kernelDescriptor.kernelMetadata.kernelName.c_str(), i, getDefaultKernelInfo().kernelArgInfo.at(i).metadataExtended->argName.c_str()); + kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), i, kernelInfo.kernelArgInfo.at(i).metadataExtended->argName.c_str()); } } } diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h index f9274f4828..7e806df63e 100644 --- a/opencl/source/kernel/kernel.h +++ b/opencl/source/kernel/kernel.h @@ -141,7 +141,7 @@ class Kernel : public BaseObject<_cl_kernel> { cl_int getInfo(cl_kernel_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const; void getAdditionalInfo(cl_kernel_info paramName, const void *¶mValue, size_t ¶mValueSizeRet) const; - void getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *¶mValue, size_t ¶mValueSizeRet) const; + void getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *¶mValue, size_t ¶mValueSizeRet, uint32_t rootDeviceIndex) const; cl_int getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const; @@ -154,13 +154,13 @@ class Kernel : public BaseObject<_cl_kernel> { size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const; - const void *getKernelHeap() const; + const void *getKernelHeap(uint32_t rootDeviceIndex) const; void *getSurfaceStateHeap(uint32_t rootDeviceIndex) const; - const void *getDynamicStateHeap() const; + const void *getDynamicStateHeap(uint32_t rootDeviceIndex) const; - size_t getKernelHeapSize() const; + size_t getKernelHeapSize(uint32_t rootDeviceIndex) const; size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const; - size_t getDynamicStateHeapSize() const; + size_t getDynamicStateHeapSize(uint32_t rootDeviceIndex) const; size_t getNumberOfBindingTableStates() const; size_t getBindingTableOffset() const { return localBindingTableOffset; @@ -168,10 +168,10 @@ class Kernel : public BaseObject<_cl_kernel> { void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset); - void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize); - bool isKernelHeapSubstituted() const; - uint64_t getKernelId() const; - void setKernelId(uint64_t newKernelId); + void substituteKernelHeap(uint32_t rootDeviceIndex, void *newKernelHeap, size_t newKernelHeapSize); + bool isKernelHeapSubstituted(uint32_t rootDeviceIndex) const; + uint64_t getKernelId(uint32_t rootDeviceIndex) const; + void setKernelId(uint32_t rootDeviceIndex, uint64_t newKernelId); uint32_t getStartOffset() const; void setStartOffset(uint32_t offset); @@ -183,8 +183,8 @@ class Kernel : public BaseObject<_cl_kernel> { return getDefaultKernelInfo().kernelArgInfo.size(); } - bool requiresSshForBuffers() const { - return getDefaultKernelInfo().requiresSshForBuffers; + bool requiresSshForBuffers(uint32_t rootDeviceIndex) const { + return getKernelInfo(rootDeviceIndex).requiresSshForBuffers; } const KernelInfo &getKernelInfo(uint32_t rootDeviceIndex) const { @@ -200,12 +200,12 @@ class Kernel : public BaseObject<_cl_kernel> { Program *getProgram() const { return program; } - uint32_t getScratchSize() { - return getDefaultKernelInfo().patchInfo.mediavfestate ? getDefaultKernelInfo().patchInfo.mediavfestate->PerThreadScratchSpace : 0; + uint32_t getScratchSize(uint32_t rootDeviceIndex) { + return getKernelInfo(rootDeviceIndex).patchInfo.mediavfestate ? getKernelInfo(rootDeviceIndex).patchInfo.mediavfestate->PerThreadScratchSpace : 0; } - uint32_t getPrivateScratchSize() { - return getDefaultKernelInfo().patchInfo.mediaVfeStateSlot1 ? getDefaultKernelInfo().patchInfo.mediaVfeStateSlot1->PerThreadScratchSpace : 0; + uint32_t getPrivateScratchSize(uint32_t rootDeviceIndex) { + return getKernelInfo(rootDeviceIndex).patchInfo.mediaVfeStateSlot1 ? getKernelInfo(rootDeviceIndex).patchInfo.mediaVfeStateSlot1->PerThreadScratchSpace : 0; } void createReflectionSurface(); @@ -215,7 +215,7 @@ class Kernel : public BaseObject<_cl_kernel> { void patchDefaultDeviceQueue(DeviceQueue *devQueue); void patchEventPool(DeviceQueue *devQueue); void patchBlocksSimdSize(uint32_t rootDeviceIndex); - bool usesSyncBuffer(); + bool usesSyncBuffer(uint32_t rootDeviceIndex); void patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset); void patchBindlessSurfaceStateOffsets(const Device &device, const size_t sshOffset); @@ -352,26 +352,26 @@ class Kernel : public BaseObject<_cl_kernel> { KernelExecutionType getExecutionType() const { return executionType; } - bool isUsingSyncBuffer() const { - return (getDefaultKernelInfo().patchInfo.pAllocateSyncBuffer != nullptr); + bool isUsingSyncBuffer(uint32_t rootDeviceIndex) const { + return (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSyncBuffer != nullptr); } bool checkIfIsParentKernelAndBlocksUsesPrintf(); - bool is32Bit() const { - return getDefaultKernelInfo().gpuPointerSize == 4; + bool is32Bit(uint32_t rootDeviceIndex) const { + return getKernelInfo(rootDeviceIndex).gpuPointerSize == 4; } - int32_t getDebugSurfaceBti() const { - if (getDefaultKernelInfo().patchInfo.pAllocateSystemThreadSurface) { - return getDefaultKernelInfo().patchInfo.pAllocateSystemThreadSurface->BTI; + int32_t getDebugSurfaceBti(uint32_t rootDeviceIndex) const { + if (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface) { + return getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface->BTI; } return -1; } - size_t getPerThreadSystemThreadSurfaceSize() const { - if (getDefaultKernelInfo().patchInfo.pAllocateSystemThreadSurface) { - return getDefaultKernelInfo().patchInfo.pAllocateSystemThreadSurface->PerThreadSystemThreadSurfaceSize; + size_t getPerThreadSystemThreadSurfaceSize(uint32_t rootDeviceIndex) const { + if (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface) { + return getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface->PerThreadSystemThreadSurfaceSize; } return 0; } @@ -381,7 +381,7 @@ class Kernel : public BaseObject<_cl_kernel> { return usingImagesOnly; } - void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation); + void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation, uint32_t rootDeviceIndex); MOCKABLE_VIRTUAL bool requiresCacheFlushCommand(const CommandQueue &commandQueue) const; diff --git a/opencl/source/program/kernel_info.cpp b/opencl/source/program/kernel_info.cpp index 1c9ab99dd0..c27fc7269c 100644 --- a/opencl/source/program/kernel_info.cpp +++ b/opencl/source/program/kernel_info.cpp @@ -143,13 +143,12 @@ WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) { this->numThreadsPerSubSlice = static_cast(device.getSharedDeviceInfo().maxNumEUsPerSubSlice) * device.getSharedDeviceInfo().numThreadsPerEU; this->localMemSize = static_cast(device.getSharedDeviceInfo().localMemSize); - setIfUseImg(dispatchInfo.getKernel()); + setIfUseImg(kernelInfo); setMinWorkGroupSize(); } -void WorkSizeInfo::setIfUseImg(Kernel *pKernel) { - auto ParamsCount = pKernel->getKernelArgsNumber(); - for (auto i = 0u; i < ParamsCount; i++) { - if (pKernel->getDefaultKernelInfo().kernelArgInfo[i].isImage) { +void WorkSizeInfo::setIfUseImg(const KernelInfo &kernelInfo) { + for (auto i = 0u; i < kernelInfo.kernelArgInfo.size(); i++) { + if (kernelInfo.kernelArgInfo[i].isImage) { imgUsed = true; yTiledSurfaces = true; } diff --git a/opencl/source/program/kernel_info.h b/opencl/source/program/kernel_info.h index 44f8e06c8c..1093d9906c 100644 --- a/opencl/source/program/kernel_info.h +++ b/opencl/source/program/kernel_info.h @@ -84,7 +84,7 @@ struct WorkSizeInfo { WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, GFXCORE_FAMILY coreFamily, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface); WorkSizeInfo(const DispatchInfo &dispatchInfo); - void setIfUseImg(Kernel *pKernel); + void setIfUseImg(const KernelInfo &kernelInfo); void setMinWorkGroupSize(); void checkRatio(const size_t workItems[3]); }; diff --git a/opencl/source/program/printf_handler.cpp b/opencl/source/program/printf_handler.cpp index 2e98b619ea..2cfbca55d2 100644 --- a/opencl/source/program/printf_handler.cpp +++ b/opencl/source/program/printf_handler.cpp @@ -61,7 +61,7 @@ void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo) kernel->getKernelInfo(rootDeviceIndex).patchInfo.pAllocateStatelessPrintfSurface->DataParamOffset); patchWithRequiredSize(printfPatchAddress, kernel->getKernelInfo(rootDeviceIndex).patchInfo.pAllocateStatelessPrintfSurface->DataParamSize, (uintptr_t)printfSurface->getGpuAddressToPatch()); - if (kernel->requiresSshForBuffers()) { + if (kernel->requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(reinterpret_cast(kernel->getSurfaceStateHeap(rootDeviceIndex)), kernel->getKernelInfo(rootDeviceIndex).patchInfo.pAllocateStatelessPrintfSurface->SurfaceStateHeapOffset); void *addressToPatch = printfSurface->getUnderlyingBuffer(); @@ -75,8 +75,9 @@ void PrintfHandler::makeResident(CommandStreamReceiver &commandStreamReceiver) { } void PrintfHandler::printEnqueueOutput() { + auto rootDeviceIndex = device.getRootDeviceIndex(); PrintFormatter printFormatter(reinterpret_cast(printfSurface->getUnderlyingBuffer()), static_cast(printfSurface->getUnderlyingBufferSize()), - kernel->is32Bit(), kernel->getKernelInfo(device.getRootDeviceIndex()).patchInfo.stringDataMap); + kernel->is32Bit(rootDeviceIndex), kernel->getKernelInfo(rootDeviceIndex).patchInfo.stringDataMap); printFormatter.printKernelOutput(); } } // namespace NEO diff --git a/opencl/source/scheduler/scheduler_kernel.h b/opencl/source/scheduler/scheduler_kernel.h index 1b7ecc8538..c1ba93e8ef 100644 --- a/opencl/source/scheduler/scheduler_kernel.h +++ b/opencl/source/scheduler/scheduler_kernel.h @@ -34,8 +34,9 @@ class SchedulerKernel : public Kernel { } size_t getCurbeSize() { - size_t crossTrheadDataSize = getDefaultKernelInfo().patchInfo.dataParameterStream ? getDefaultKernelInfo().patchInfo.dataParameterStream->DataParameterStreamSize : 0; - size_t dshSize = getDefaultKernelInfo().heapInfo.DynamicStateHeapSize; + auto &defaultKernelInfo = getDefaultKernelInfo(); + size_t crossTrheadDataSize = defaultKernelInfo.patchInfo.dataParameterStream ? defaultKernelInfo.patchInfo.dataParameterStream->DataParameterStreamSize : 0; + size_t dshSize = defaultKernelInfo.heapInfo.DynamicStateHeapSize; crossTrheadDataSize = alignUp(crossTrheadDataSize, 64); dshSize = alignUp(dshSize, 64); diff --git a/opencl/test/unit_test/api/cl_enqueue_nd_range_kernel_tests.inl b/opencl/test/unit_test/api/cl_enqueue_nd_range_kernel_tests.inl index d7b2c93864..70fdfab4e6 100644 --- a/opencl/test/unit_test/api/cl_enqueue_nd_range_kernel_tests.inl +++ b/opencl/test/unit_test/api/cl_enqueue_nd_range_kernel_tests.inl @@ -137,7 +137,7 @@ TEST_F(clEnqueueNDRangeKernelTests, GivenKernelWithAllocateSyncBufferPatchWhenEx SPatchAllocateSyncBuffer patchAllocateSyncBuffer; pProgram->mockKernelInfo.patchInfo.pAllocateSyncBuffer = &patchAllocateSyncBuffer; - EXPECT_TRUE(pKernel->isUsingSyncBuffer()); + EXPECT_TRUE(pKernel->isUsingSyncBuffer(testedRootDeviceIndex)); retVal = clEnqueueNDRangeKernel( pCommandQueue, diff --git a/opencl/test/unit_test/api/cl_get_kernel_work_group_info_tests.inl b/opencl/test/unit_test/api/cl_get_kernel_work_group_info_tests.inl index 3be843e6a8..660835c51c 100644 --- a/opencl/test/unit_test/api/cl_get_kernel_work_group_info_tests.inl +++ b/opencl/test/unit_test/api/cl_get_kernel_work_group_info_tests.inl @@ -156,7 +156,7 @@ TEST_F(clGetKernelWorkGroupInfoTests, GivenKernelRequiringScratchSpaceWhenGettin mediaVFEstate.PerThreadScratchSpace = 1024; //whatever greater than 0 mockKernel.kernelInfo.patchInfo.mediavfestate = &mediaVFEstate; - cl_ulong scratchSpaceSize = static_cast(mockKernel.mockKernel->getScratchSize()); + cl_ulong scratchSpaceSize = static_cast(mockKernel.mockKernel->getScratchSize(testedRootDeviceIndex)); EXPECT_EQ(scratchSpaceSize, 1024u); retVal = clGetKernelWorkGroupInfo( diff --git a/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp b/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp index a2e4f9ec07..ed7fdb653f 100644 --- a/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp +++ b/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp @@ -109,8 +109,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, AUBHelloWorld, simple) { EXPECT_EQ(0u, addrIDD % alignmentIDD); // Check kernel start pointer matches hard-coded kernel. - auto pExpectedISA = pKernel->getKernelHeap(); - auto expectedSize = pKernel->getKernelHeapSize(); + auto pExpectedISA = pKernel->getKernelHeap(rootDeviceIndex); + auto expectedSize = pKernel->getKernelHeapSize(rootDeviceIndex); auto pSBA = reinterpret_cast(cmdStateBaseAddress); ASSERT_NE(nullptr, pSBA); @@ -268,8 +268,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, AUBSimpleArg, simple) { EXPECT_EQ(0u, addrIDD % alignmentIDD); // Check kernel start pointer matches hard-coded kernel. - auto pExpectedISA = pKernel->getKernelHeap(); - auto expectedSize = pKernel->getKernelHeapSize(); + auto pExpectedISA = pKernel->getKernelHeap(rootDeviceIndex); + auto expectedSize = pKernel->getKernelHeapSize(rootDeviceIndex); auto pSBA = reinterpret_cast(cmdStateBaseAddress); ASSERT_NE(nullptr, pSBA); diff --git a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_rect_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_rect_tests.cpp index 1963ee4563..31e5afc120 100644 --- a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_rect_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_rect_tests.cpp @@ -183,7 +183,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueCopyBufferRectTest, WhenCopyingBufferRect2DTh EXPECT_NE(dshBefore, pDSH->getUsed()); EXPECT_NE(iohBefore, pIOH->getUsed()); - if (kernel->requiresSshForBuffers()) { + if (kernel->requiresSshForBuffers(rootDeviceIndex)) { EXPECT_NE(sshBefore, pSSH->getUsed()); } } diff --git a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_tests.cpp index 396791d100..69e39bbf01 100644 --- a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_tests.cpp @@ -202,9 +202,9 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenIndirectDataGetsAdded) { auto kernel = multiDispatchInfo.begin()->getKernel(); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); - if (kernel->requiresSshForBuffers()) { + if (kernel->requiresSshForBuffers(rootDeviceIndex)) { EXPECT_NE(sshBefore, pSSH->getUsed()); } } @@ -329,7 +329,7 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenArgumentZeroMatchesSourceAd ASSERT_NE(nullptr, kernel); // Determine where the argument is - auto pArgument = (void **)getStatelessArgumentPointer(*kernel, 0u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0)); + auto pArgument = (void **)getStatelessArgumentPointer(*kernel, 0u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0), rootDeviceIndex); EXPECT_EQ(reinterpret_cast(srcBuffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddress()), *pArgument); } @@ -357,7 +357,7 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenArgumentOneMatchesDestinati ASSERT_NE(nullptr, kernel); // Determine where the argument is - auto pArgument = (void **)getStatelessArgumentPointer(*kernel, 1u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0)); + auto pArgument = (void **)getStatelessArgumentPointer(*kernel, 1u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0), rootDeviceIndex); EXPECT_EQ(reinterpret_cast(dstBuffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddress()), *pArgument); } diff --git a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp index 915fa25120..a63ad9a4c1 100644 --- a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp @@ -85,7 +85,7 @@ HWTEST_F(EnqueueCopyBufferToImageTest, WhenCopyingBufferToImageThenIndirectDataG auto sshBefore = pSSH->getUsed(); EnqueueCopyBufferToImageHelper<>::enqueueCopyBufferToImage(pCmdQ, srcBuffer, dstImage); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); EXPECT_NE(sshBefore, pSSH->getUsed()); } diff --git a/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp index 8c37586c37..176011aa0c 100644 --- a/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp @@ -85,7 +85,7 @@ HWTEST_F(EnqueueCopyImageTest, WhenCopyingImageThenIndirectDataGetsAdded) { auto sshBefore = pSSH->getUsed(); EnqueueCopyImageHelper<>::enqueueCopyImage(pCmdQ, srcImage, dstImage); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); EXPECT_NE(sshBefore, pSSH->getUsed()); } diff --git a/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp index 7f21b3c75c..be50ea6c19 100644 --- a/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp @@ -84,7 +84,7 @@ HWTEST_F(EnqueueCopyImageToBufferTest, WhenCopyingImageToBufferThenIndirectDataG auto sshBefore = pSSH->getUsed(); enqueueCopyImageToBuffer(); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); EXPECT_NE(sshBefore, pSSH->getUsed()); } diff --git a/opencl/test/unit_test/command_queue/enqueue_fill_buffer_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_fill_buffer_tests.cpp index d66b76f030..09d052a442 100644 --- a/opencl/test/unit_test/command_queue/enqueue_fill_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_fill_buffer_tests.cpp @@ -115,9 +115,9 @@ HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenIndirectDataGetsAdded) auto kernel = multiDispatchInfo.begin()->getKernel(); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); - if (kernel->requiresSshForBuffers()) { + if (kernel->requiresSshForBuffers(rootDeviceIndex)) { EXPECT_NE(sshBefore, pSSH->getUsed()); } @@ -302,7 +302,7 @@ HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenArgumentZeroShouldMatch ASSERT_NE(nullptr, kernel); // Determine where the argument is - auto pArgument = (void **)getStatelessArgumentPointer(*kernel, 0u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0)); + auto pArgument = (void **)getStatelessArgumentPointer(*kernel, 0u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0), rootDeviceIndex); EXPECT_EQ((void *)((uintptr_t)buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddress()), *pArgument); @@ -335,7 +335,7 @@ HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenArgumentTwoShouldMatchP ASSERT_NE(nullptr, kernel); // Determine where the argument is - auto pArgument = (void **)getStatelessArgumentPointer(*kernel, 2u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0)); + auto pArgument = (void **)getStatelessArgumentPointer(*kernel, 2u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0), rootDeviceIndex); EXPECT_NE(nullptr, *pArgument); context.getMemoryManager()->freeGraphicsMemory(patternAllocation); diff --git a/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp index f42e8dbeee..39fa4e06fa 100644 --- a/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp @@ -92,7 +92,7 @@ HWTEST_F(EnqueueFillImageTest, WhenFillingImageThenIndirectDataGetsAdded) { auto sshBefore = pSSH->getUsed(); EnqueueFillImageHelper<>::enqueueFillImage(pCmdQ, image); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); EXPECT_NE(sshBefore, pSSH->getUsed()); } diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp index 1827b206c4..42ec21014a 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp @@ -415,9 +415,9 @@ HWTEST_F(EnqueueKernelTest, addsIndirectData) { auto sshBefore = pSSH->getUsed(); callOneWorkItemNDRKernel(); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), pKernel)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), pKernel, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); - if (pKernel->requiresSshForBuffers() || (pKernel->getKernelInfo(rootDeviceIndex).patchInfo.imageMemObjKernelArgs.size() > 0)) { + if (pKernel->requiresSshForBuffers(rootDeviceIndex) || (pKernel->getKernelInfo(rootDeviceIndex).patchInfo.imageMemObjKernelArgs.size() > 0)) { EXPECT_NE(sshBefore, pSSH->getUsed()); } } diff --git a/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp index 5558b7c60d..a36502d7f0 100644 --- a/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp @@ -196,7 +196,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueReadBufferRectTest, WhenReadingBufferThenIndi EXPECT_NE(dshBefore, pDSH->getUsed()); EXPECT_NE(iohBefore, pIOH->getUsed()); - if (kernel->requiresSshForBuffers()) { + if (kernel->requiresSshForBuffers(rootDeviceIndex)) { EXPECT_NE(sshBefore, pSSH->getUsed()); } } diff --git a/opencl/test/unit_test/command_queue/enqueue_read_buffer_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_read_buffer_tests.cpp index edd04fbfa0..efedabcd5f 100644 --- a/opencl/test/unit_test/command_queue/enqueue_read_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_read_buffer_tests.cpp @@ -156,9 +156,9 @@ HWTEST_F(EnqueueReadBufferTypeTest, addsIndirectData) { auto kernel = multiDispatchInfo.begin()->getKernel(); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); - if (kernel->requiresSshForBuffers()) { + if (kernel->requiresSshForBuffers(rootDeviceIndex)) { EXPECT_NE(sshBefore, pSSH->getUsed()); } } diff --git a/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp index 36a919d740..7fccdb93b7 100644 --- a/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp @@ -95,7 +95,7 @@ HWTEST_F(EnqueueReadImageTest, WhenReadingImageThenIndirectDataIsAdded) { auto sshBefore = pSSH->getUsed(); EnqueueReadImageHelper<>::enqueueReadImage(pCmdQ, srcImage, EnqueueReadImageTraits::blocking); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); EXPECT_NE(sshBefore, pSSH->getUsed()); } diff --git a/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp index 64bc0b0adc..5e12b2bb84 100644 --- a/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp @@ -170,7 +170,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueWriteBufferRectTest, WhenWritingBufferThenInd EXPECT_NE(dshBefore, pDSH->getUsed()); EXPECT_NE(iohBefore, pIOH->getUsed()); - if (kernel->requiresSshForBuffers()) { + if (kernel->requiresSshForBuffers(rootDeviceIndex)) { EXPECT_NE(sshBefore, pSSH->getUsed()); } } diff --git a/opencl/test/unit_test/command_queue/enqueue_write_buffer_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_buffer_tests.cpp index 86903de36f..543c1698f9 100644 --- a/opencl/test/unit_test/command_queue/enqueue_write_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_write_buffer_tests.cpp @@ -154,9 +154,9 @@ HWTEST_F(EnqueueWriteBufferTypeTest, WhenWritingBufferThenIndirectDataIsAdded) { auto kernel = multiDispatchInfo.begin()->getKernel(); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); - if (kernel->requiresSshForBuffers()) { + if (kernel->requiresSshForBuffers(rootDeviceIndex)) { EXPECT_NE(sshBefore, pSSH->getUsed()); } } diff --git a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp index 12c7680dd1..e04dce3981 100644 --- a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp @@ -97,7 +97,7 @@ HWTEST_F(EnqueueWriteImageTest, WhenWritingImageThenIndirectDataIsAdded) { auto sshBefore = pSSH->getUsed(); EnqueueWriteImageHelper<>::enqueueWriteImage(pCmdQ, dstImage, EnqueueWriteImageTraits::blocking); - EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr)); + EXPECT_TRUE(UnitTestHelper::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex)); EXPECT_NE(iohBefore, pIOH->getUsed()); EXPECT_NE(sshBefore, pSSH->getUsed()); } diff --git a/opencl/test/unit_test/context/driver_diagnostics_tests.cpp b/opencl/test/unit_test/context/driver_diagnostics_tests.cpp index aa10f64670..5c06ecc61b 100644 --- a/opencl/test/unit_test/context/driver_diagnostics_tests.cpp +++ b/opencl/test/unit_test/context/driver_diagnostics_tests.cpp @@ -452,7 +452,7 @@ TEST_F(PerformanceHintTest, givenPrintDriverDiagnosticsDebugModeEnabledWhenCallF testing::internal::CaptureStdout(); MemObjsForAuxTranslation memObjects; - mockKernel.mockKernel->fillWithBuffersForAuxTranslation(memObjects); + mockKernel.mockKernel->fillWithBuffersForAuxTranslation(memObjects, rootDeviceIndex); snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[KERNEL_ARGUMENT_AUX_TRANSLATION], mockKernel.mockKernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(), 0, mockKernel.mockKernel->getKernelInfo(rootDeviceIndex).kernelArgInfo.at(0).metadataExtended->argName.c_str()); diff --git a/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp b/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp index 8f50bcef9a..f07d878922 100644 --- a/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp +++ b/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp @@ -603,9 +603,9 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, WhenSettingUpIndirectStateT devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount, false); auto *igilQueue = reinterpret_cast(devQueueHw->getQueueBuffer()->getUnderlyingBuffer()); - EXPECT_EQ(igilQueue->m_controls.m_DynamicHeapStart, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)); + EXPECT_EQ(igilQueue->m_controls.m_DynamicHeapStart, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(rootDeviceIndex), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)); EXPECT_EQ(igilQueue->m_controls.m_DynamicHeapSizeInBytes, (uint32_t)devQueueHw->getDshBuffer()->getUnderlyingBufferSize()); - EXPECT_EQ(igilQueue->m_controls.m_CurrentDSHoffset, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)); + EXPECT_EQ(igilQueue->m_controls.m_CurrentDSHoffset, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(rootDeviceIndex), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)); EXPECT_EQ(igilQueue->m_controls.m_ParentDSHOffset, devQueueHw->offsetDsh); alignedFree(ssh->getCpuBase()); diff --git a/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp b/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp index 25be22b78b..76ad01e403 100644 --- a/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp +++ b/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp @@ -170,7 +170,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue pKernel->getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE) + UnitTestHelper::getDefaultSshUsage(); - if ((pKernel->requiresSshForBuffers()) || (pKernel->getKernelInfo(rootDeviceIndex).patchInfo.imageMemObjKernelArgs.size() > 0)) { + if ((pKernel->requiresSshForBuffers(rootDeviceIndex)) || (pKernel->getKernelInfo(rootDeviceIndex).patchInfo.imageMemObjKernelArgs.size() > 0)) { EXPECT_EQ(expectedSizeSSH, sshUsed); } diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp index 5f41c77659..cfe3883827 100644 --- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp +++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp @@ -763,7 +763,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelIsExecutedThenGTPinCa Kernel *pKernel1 = (Kernel *)kernel1; const KernelInfo &kInfo1 = pKernel1->getKernelInfo(rootDeviceIndex); - uint64_t gtpinKernelId1 = pKernel1->getKernelId(); + uint64_t gtpinKernelId1 = pKernel1->getKernelId(rootDeviceIndex); EXPECT_EQ(kInfo1.shaderHashCode, gtpinKernelId1); constexpr size_t n = 256; @@ -797,7 +797,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelIsExecutedThenGTPinCa Kernel *pKernel2 = (Kernel *)kernel2; const KernelInfo &kInfo2 = pKernel2->getKernelInfo(rootDeviceIndex); - uint64_t gtpinKernelId2 = pKernel2->getKernelId(); + uint64_t gtpinKernelId2 = pKernel2->getKernelId(rootDeviceIndex); EXPECT_EQ(kInfo2.shaderHashCode, gtpinKernelId2); auto buff20 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); @@ -911,7 +911,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGT Kernel *pKernel1 = (Kernel *)kernel1; const KernelInfo &kInfo1 = pKernel1->getKernelInfo(rootDeviceIndex); - uint64_t gtpinKernelId1 = pKernel1->getKernelId(); + uint64_t gtpinKernelId1 = pKernel1->getKernelId(rootDeviceIndex); EXPECT_EQ(kInfo1.shaderHashCode, gtpinKernelId1); cl_uint workDim = 1; @@ -951,7 +951,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGT Kernel *pKernel2 = (Kernel *)kernel2; const KernelInfo &kInfo2 = pKernel2->getKernelInfo(rootDeviceIndex); - uint64_t gtpinKernelId2 = pKernel2->getKernelId(); + uint64_t gtpinKernelId2 = pKernel2->getKernelId(rootDeviceIndex); EXPECT_EQ(kInfo2.shaderHashCode, gtpinKernelId2); auto buff20 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); @@ -1267,7 +1267,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelWithoutSSHIsUsedThenG Kernel *pKernel = (Kernel *)kernel; const KernelInfo &kInfo = pKernel->getKernelInfo(rootDeviceIndex); - uint64_t gtpinKernelId = pKernel->getKernelId(); + uint64_t gtpinKernelId = pKernel->getKernelId(rootDeviceIndex); EXPECT_EQ(kInfo.shaderHashCode, gtpinKernelId); constexpr size_t n = 256; @@ -1380,7 +1380,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenBlockedKernelWithoutSSHIsUs Kernel *pKernel = (Kernel *)kernel; const KernelInfo &kInfo = pKernel->getKernelInfo(rootDeviceIndex); - uint64_t gtpinKernelId = pKernel->getKernelId(); + uint64_t gtpinKernelId = pKernel->getKernelId(rootDeviceIndex); EXPECT_EQ(kInfo.shaderHashCode, gtpinKernelId); constexpr size_t n = 256; @@ -1504,7 +1504,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenTheSameKerneIsExecutedTwice Kernel *pKernel1 = (Kernel *)kernel1; const KernelInfo &kInfo1 = pKernel1->getKernelInfo(rootDeviceIndex); - uint64_t gtpinKernelId1 = pKernel1->getKernelId(); + uint64_t gtpinKernelId1 = pKernel1->getKernelId(rootDeviceIndex); EXPECT_EQ(kInfo1.shaderHashCode, gtpinKernelId1); constexpr size_t n = 256; @@ -1542,7 +1542,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenTheSameKerneIsExecutedTwice Kernel *pKernel2 = (Kernel *)kernel2; const KernelInfo &kInfo2 = pKernel2->getKernelInfo(rootDeviceIndex); - uint64_t gtpinKernelId2 = pKernel2->getKernelId(); + uint64_t gtpinKernelId2 = pKernel2->getKernelId(rootDeviceIndex); EXPECT_EQ(kInfo2.shaderHashCode, gtpinKernelId2); auto buff20 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr); @@ -2325,18 +2325,18 @@ TEST_F(GTPinTests, givenKernelThenVerifyThatKernelCodeSubstitutionWorksWell) { Kernel *pKernel = castToObject(kernel); ASSERT_NE(nullptr, pKernel); - bool isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted(); + bool isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted(rootDeviceIndex); EXPECT_FALSE(isKernelCodeSubstituted); // Substitute new kernel code constexpr size_t newCodeSize = 64; uint8_t newCode[newCodeSize] = {0x0, 0x1, 0x2, 0x3, 0x4}; - pKernel->substituteKernelHeap(&newCode[0], newCodeSize); + pKernel->substituteKernelHeap(rootDeviceIndex, &newCode[0], newCodeSize); // Verify that substitution went properly - isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted(); + isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted(rootDeviceIndex); EXPECT_TRUE(isKernelCodeSubstituted); - uint8_t *pBin2 = reinterpret_cast(const_cast(pKernel->getKernelHeap())); + uint8_t *pBin2 = reinterpret_cast(const_cast(pKernel->getKernelHeap(rootDeviceIndex))); EXPECT_EQ(pBin2, &newCode[0]); auto kernelIsa = pKernel->getKernelInfo(rootDeviceIndex).kernelAllocation->getUnderlyingBuffer(); diff --git a/opencl/test/unit_test/helpers/unit_test_helper.h b/opencl/test/unit_test/helpers/unit_test_helper.h index c579cf4b9d..d83fa64696 100644 --- a/opencl/test/unit_test/helpers/unit_test_helper.h +++ b/opencl/test/unit_test/helpers/unit_test_helper.h @@ -21,7 +21,7 @@ template struct UnitTestHelper { static bool isL3ConfigProgrammable(); - static bool evaluateDshUsage(size_t sizeBeforeEnqueue, size_t sizeAfterEnqueue, Kernel *kernel); + static bool evaluateDshUsage(size_t sizeBeforeEnqueue, size_t sizeAfterEnqueue, Kernel *kernel, uint32_t rootDeviceIndex); static bool isPageTableManagerSupported(const HardwareInfo &hwInfo); diff --git a/opencl/test/unit_test/helpers/unit_test_helper.inl b/opencl/test/unit_test/helpers/unit_test_helper.inl index 2925d00233..d09e12a2d6 100644 --- a/opencl/test/unit_test/helpers/unit_test_helper.inl +++ b/opencl/test/unit_test/helpers/unit_test_helper.inl @@ -13,7 +13,7 @@ bool UnitTestHelper::isL3ConfigProgrammable() { }; template -bool UnitTestHelper::evaluateDshUsage(size_t sizeBeforeEnqueue, size_t sizeAfterEnqueue, Kernel *kernel) { +bool UnitTestHelper::evaluateDshUsage(size_t sizeBeforeEnqueue, size_t sizeAfterEnqueue, Kernel *kernel, uint32_t rootDeviceIndex) { if (sizeBeforeEnqueue != sizeAfterEnqueue) { return true; } diff --git a/opencl/test/unit_test/kernel/debug_kernel_tests.cpp b/opencl/test/unit_test/kernel/debug_kernel_tests.cpp index 0c6de1dcf3..19a3726efe 100644 --- a/opencl/test/unit_test/kernel/debug_kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/debug_kernel_tests.cpp @@ -22,7 +22,7 @@ TEST(DebugKernelTest, givenKernelCompiledForDebuggingWhenGetDebugSurfaceBtiIsCal program.enableKernelDebug(); std::unique_ptr kernel(MockKernel::create(device->getDevice(), &program)); - EXPECT_EQ(0, kernel->getDebugSurfaceBti()); + EXPECT_EQ(0, kernel->getDebugSurfaceBti(device->getRootDeviceIndex())); } TEST(DebugKernelTest, givenKernelCompiledForDebuggingWhenGetPerThreadSystemThreadSurfaceSizeIsCalledThenCorrectValueIsReturned) { @@ -31,7 +31,7 @@ TEST(DebugKernelTest, givenKernelCompiledForDebuggingWhenGetPerThreadSystemThrea program.enableKernelDebug(); std::unique_ptr kernel(MockKernel::create(device->getDevice(), &program)); - EXPECT_EQ(MockDebugKernel::perThreadSystemThreadSurfaceSize, kernel->getPerThreadSystemThreadSurfaceSize()); + EXPECT_EQ(MockDebugKernel::perThreadSystemThreadSurfaceSize, kernel->getPerThreadSystemThreadSurfaceSize(device->getRootDeviceIndex())); } TEST(DebugKernelTest, givenKernelCompiledForDebuggingWhenQueryingIsKernelDebugEnabledThenTrueIsReturned) { @@ -60,7 +60,7 @@ TEST(DebugKernelTest, givenKernelWithoutDebugFlagWhenGetDebugSurfaceBtiIsCalledT program.enableKernelDebug(); std::unique_ptr kernel(MockKernel::create(device->getDevice(), &program)); - EXPECT_EQ(-1, kernel->getDebugSurfaceBti()); + EXPECT_EQ(-1, kernel->getDebugSurfaceBti(device->getRootDeviceIndex())); } TEST(DebugKernelTest, givenKernelWithoutDebugFlagWhenGetPerThreadSystemThreadSurfaceSizeIsCalledThenZeroIsReturned) { @@ -69,5 +69,5 @@ TEST(DebugKernelTest, givenKernelWithoutDebugFlagWhenGetPerThreadSystemThreadSur program.enableKernelDebug(); std::unique_ptr kernel(MockKernel::create(device->getDevice(), &program)); - EXPECT_EQ(0u, kernel->getPerThreadSystemThreadSurfaceSize()); + EXPECT_EQ(0u, kernel->getPerThreadSystemThreadSurfaceSize(device->getRootDeviceIndex())); } diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index dbf1770e8e..e26d0ec5ac 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -97,8 +97,8 @@ TEST(KernelTest, WhenKernelIsCreatedThenCorrectMembersAreMemObjects) { } TEST_F(KernelTests, WhenKernelIsCreatedThenKernelHeapIsCorrect) { - EXPECT_EQ(pKernel->getKernelInfo(rootDeviceIndex).heapInfo.pKernelHeap, pKernel->getKernelHeap()); - EXPECT_EQ(pKernel->getKernelInfo(rootDeviceIndex).heapInfo.KernelHeapSize, pKernel->getKernelHeapSize()); + EXPECT_EQ(pKernel->getKernelInfo(rootDeviceIndex).heapInfo.pKernelHeap, pKernel->getKernelHeap(rootDeviceIndex)); + EXPECT_EQ(pKernel->getKernelInfo(rootDeviceIndex).heapInfo.KernelHeapSize, pKernel->getKernelHeapSize(rootDeviceIndex)); } TEST_F(KernelTests, GivenInvalidParamNameWhenGettingInfoThenInvalidValueErrorIsReturned) { @@ -167,7 +167,7 @@ TEST_F(KernelTests, GivenKernelBinaryProgramIntelWhenGettingInfoThenKernelBinary size_t paramValueSize = 0; char *paramValue = nullptr; size_t paramValueSizeRet = 0; - const char *pKernelData = reinterpret_cast(pKernel->getKernelHeap()); + const char *pKernelData = reinterpret_cast(pKernel->getKernelHeap(rootDeviceIndex)); EXPECT_NE(nullptr, pKernelData); // get size of kernel binary @@ -2670,7 +2670,7 @@ TEST(KernelTest, givenKernelWithKernelInfoWith32bitPointerSizeThenReport32bit) { MockProgram program(&context, false, toClDeviceVector(*device)); std::unique_ptr kernel(new MockKernel(&program, MockKernel::toKernelInfoContainer(info, rootDeviceIndex))); - EXPECT_TRUE(kernel->is32Bit()); + EXPECT_TRUE(kernel->is32Bit(rootDeviceIndex)); } TEST(KernelTest, givenKernelWithKernelInfoWith64bitPointerSizeThenReport64bit) { @@ -2683,7 +2683,7 @@ TEST(KernelTest, givenKernelWithKernelInfoWith64bitPointerSizeThenReport64bit) { MockProgram program(&context, false, toClDeviceVector(*device)); std::unique_ptr kernel(new MockKernel(&program, MockKernel::toKernelInfoContainer(info, rootDeviceIndex))); - EXPECT_FALSE(kernel->is32Bit()); + EXPECT_FALSE(kernel->is32Bit(rootDeviceIndex)); } TEST(KernelTest, givenFtrRenderCompressedBuffersWhenInitializingArgsWithNonStatefulAccessThenMarkKernelForAuxTranslation) { @@ -2978,7 +2978,7 @@ TEST(KernelTest, givenKernelRequiringPrivateScratchSpaceWhenGettingSizeForPrivat mockKernel.kernelInfo.patchInfo.mediavfestate = &mediaVFEstate; mockKernel.kernelInfo.patchInfo.mediaVfeStateSlot1 = &mediaVFEstateSlot1; - EXPECT_EQ(1024u, mockKernel.mockKernel->getPrivateScratchSize()); + EXPECT_EQ(1024u, mockKernel.mockKernel->getPrivateScratchSize(device->getRootDeviceIndex())); } TEST(KernelTest, givenKernelWithoutMediaVfeStateSlot1WhenGettingSizeForPrivateScratchSpaceThenCorrectSizeIsReturned) { @@ -2987,7 +2987,7 @@ TEST(KernelTest, givenKernelWithoutMediaVfeStateSlot1WhenGettingSizeForPrivateSc MockKernelWithInternals mockKernel(*device); mockKernel.kernelInfo.patchInfo.mediaVfeStateSlot1 = nullptr; - EXPECT_EQ(0u, mockKernel.mockKernel->getPrivateScratchSize()); + EXPECT_EQ(0u, mockKernel.mockKernel->getPrivateScratchSize(device->getRootDeviceIndex())); } TEST(KernelTest, givenKernelWithPatchInfoCollectionEnabledWhenPatchWithImplicitSurfaceCalledThenPatchInfoDataIsCollected) { diff --git a/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp b/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp index a37c9c9049..61f61adb49 100644 --- a/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp @@ -21,7 +21,7 @@ using namespace NEO; class KernelTransformableTest : public ::testing::Test { public: void SetUp() override { - rootDeviceIndex = context.getDevice(0)->getRootDeviceIndex(); + context = std::make_unique(deviceFactory.rootDevices[rootDeviceIndex]); pKernelInfo = std::make_unique(); KernelArgPatchInfo kernelArgPatchInfo; @@ -45,7 +45,7 @@ class KernelTransformableTest : public ::testing::Test { pKernelInfo->kernelArgInfo[3].isImage = true; pKernelInfo->argumentsToPatchNum = 4; - program = std::make_unique(toClDeviceVector(*context.getDevice(0))); + program = std::make_unique(context.get(), false, toClDeviceVector(*context->getDevice(0))); pKernel.reset(new MockKernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex))); ASSERT_EQ(CL_SUCCESS, pKernel->initialize()); @@ -66,7 +66,8 @@ class KernelTransformableTest : public ::testing::Test { const int secondImageOffset = 0x40; cl_int retVal = CL_SUCCESS; - MockContext context; + UltClDeviceFactory deviceFactory{2, 0}; + std::unique_ptr context; std::unique_ptr program; std::unique_ptr sampler; std::unique_ptr pKernelInfo; @@ -75,14 +76,14 @@ class KernelTransformableTest : public ::testing::Test { std::unique_ptr image; SKernelBinaryHeaderCommon kernelHeader; char surfaceStateHeap[0x80]; - uint32_t rootDeviceIndex = std::numeric_limits::max(); + const uint32_t rootDeviceIndex = 1; }; HWTEST_F(KernelTransformableTest, givenKernelThatCannotTranformImagesWithTwoTransformableImagesAndTwoTransformableSamplersWhenAllArgsAreSetThenImagesAreNotTransformed) { using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; - image.reset(Image3dHelper<>::create(&context)); + image.reset(Image3dHelper<>::create(context.get())); sampler.reset(createTransformableSampler()); cl_mem clImage = image.get(); cl_sampler clSampler = sampler.get(); @@ -110,7 +111,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; - image.reset(Image3dHelper<>::create(&context)); + image.reset(Image3dHelper<>::create(context.get())); sampler.reset(createTransformableSampler()); cl_mem clImage = image.get(); cl_sampler clSampler = sampler.get(); @@ -137,7 +138,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; - image.reset(Image3dHelper<>::create(&context)); + image.reset(Image3dHelper<>::create(context.get())); sampler.reset(createTransformableSampler()); cl_mem clImage = image.get(); cl_sampler clSampler = sampler.get(); @@ -169,7 +170,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithOneTransformableImageAndTwoTran using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; - image.reset(Image3dHelper<>::create(&context)); + image.reset(Image3dHelper<>::create(context.get())); sampler.reset(createTransformableSampler()); cl_mem clImage = image.get(); cl_sampler clSampler = sampler.get(); @@ -196,7 +197,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithImages2dAndTwoTransformableSamp using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; - image.reset(Image2dHelper<>::create(&context)); + image.reset(Image2dHelper<>::create(context.get())); sampler.reset(createTransformableSampler()); cl_mem clImage = image.get(); cl_sampler clSampler = sampler.get(); @@ -223,7 +224,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; - image.reset(Image3dHelper<>::create(&context)); + image.reset(Image3dHelper<>::create(context.get())); sampler.reset(createTransformableSampler()); cl_mem clImage = image.get(); cl_sampler clSampler = sampler.get(); @@ -255,7 +256,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithNonTransformableSamplersWhenRes using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; - image.reset(Image3dHelper<>::create(&context)); + image.reset(Image3dHelper<>::create(context.get())); sampler.reset(createNonTransformableSampler()); cl_mem clImage = image.get(); cl_sampler clSampler = sampler.get(); @@ -287,7 +288,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithoutSamplersAndTransformableImag using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE; - image.reset(Image3dHelper<>::create(&context)); + image.reset(Image3dHelper<>::create(context.get())); cl_mem clImage = image.get(); pKernelInfo->kernelArgInfo[0].isSampler = false; diff --git a/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp b/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp index a1381a9a15..92d58afccc 100644 --- a/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp +++ b/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp @@ -34,7 +34,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithGreaterSizeT const size_t newHeapSize = initialHeapSize + 1; char newHeap[newHeapSize]; - kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize); + kernel.mockKernel->substituteKernelHeap(rootDeviceIndex, newHeap, newHeapSize); auto secondAllocation = kernel.kernelInfo.kernelAllocation; EXPECT_NE(nullptr, secondAllocation); auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize(); @@ -64,7 +64,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithSameSizeThen const size_t newHeapSize = initialHeapSize; char newHeap[newHeapSize]; - kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize); + kernel.mockKernel->substituteKernelHeap(rootDeviceIndex, newHeap, newHeapSize); auto secondAllocation = kernel.kernelInfo.kernelAllocation; EXPECT_NE(nullptr, secondAllocation); auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize(); @@ -93,7 +93,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithSmallerSizeT const size_t newHeapSize = initialHeapSize - 1; char newHeap[newHeapSize]; - kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize); + kernel.mockKernel->substituteKernelHeap(rootDeviceIndex, newHeap, newHeapSize); auto secondAllocation = kernel.kernelInfo.kernelAllocation; EXPECT_NE(nullptr, secondAllocation); auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize(); @@ -125,7 +125,7 @@ TEST_F(KernelSubstituteTest, givenKernelWithUsedKernelAllocationWhenSubstituteKe EXPECT_TRUE(commandStreamReceiver.getTemporaryAllocations().peekIsEmpty()); - kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize); + kernel.mockKernel->substituteKernelHeap(rootDeviceIndex, newHeap, newHeapSize); auto secondAllocation = kernel.kernelInfo.kernelAllocation; EXPECT_FALSE(commandStreamReceiver.getTemporaryAllocations().peekIsEmpty()); diff --git a/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp b/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp index f2fa083c1e..e069bae8ad 100644 --- a/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp +++ b/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp @@ -122,7 +122,7 @@ HWTEST_F(SamplerSetArgTest, WhenSettingKernelArgSamplerThenSamplerStatesAreCorre ASSERT_EQ(CL_SUCCESS, retVal); auto samplerState = reinterpret_cast( - ptrOffset(pKernel->getDynamicStateHeap(), + ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex), pKernelInfo->kernelArgInfo[0].offsetHeap)); EXPECT_EQ(static_cast(CL_TRUE), static_cast(!samplerState->getNonNormalizedCoordinateEnable())); EXPECT_EQ(SAMPLER_STATE::TEXTURE_COORDINATE_MODE_MIRROR, samplerState->getTcxAddressControlMode()); @@ -426,7 +426,7 @@ HWTEST_P(NormalizedTest, WhenSettingKernelArgSamplerThenCoordsAreCorrect) { ASSERT_EQ(CL_SUCCESS, retVal); auto samplerState = reinterpret_cast( - ptrOffset(pKernel->getDynamicStateHeap(), + ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex), pKernelInfo->kernelArgInfo[0].offsetHeap)); EXPECT_EQ(normalizedCoordinates, static_cast(!samplerState->getNonNormalizedCoordinateEnable())); @@ -477,7 +477,7 @@ HWTEST_P(AddressingModeTest, WhenSettingKernelArgSamplerThenModesAreCorrect) { ASSERT_EQ(CL_SUCCESS, retVal); auto samplerState = reinterpret_cast( - ptrOffset(pKernel->getDynamicStateHeap(), + ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex), pKernelInfo->kernelArgInfo[0].offsetHeap)); auto expectedModeX = SAMPLER_STATE::TEXTURE_COORDINATE_MODE_MIRROR; @@ -557,7 +557,7 @@ HWTEST_F(SamplerSetArgTest, GivenMipmapsWhenSettingKernelArgSamplerThenLodAreCor ASSERT_EQ(CL_SUCCESS, retVal); auto samplerState = reinterpret_cast( - ptrOffset(pKernel->getDynamicStateHeap(), + ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex), pKernelInfo->kernelArgInfo[0].offsetHeap)); EXPECT_EQ(FamilyType::SAMPLER_STATE::MIP_MODE_FILTER_LINEAR, samplerState->getMipModeFilter()); @@ -588,7 +588,7 @@ HWTEST_P(FilterModeTest, WhenSettingKernelArgSamplerThenFiltersAreCorrect) { retVal); auto samplerState = reinterpret_cast( - ptrOffset(pKernel->getDynamicStateHeap(), + ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex), pKernelInfo->kernelArgInfo[0].offsetHeap)); sampler->setArg(const_cast(samplerState)); diff --git a/shared/test/unit_test/cmd_parse/hw_parse.h b/shared/test/unit_test/cmd_parse/hw_parse.h index 94385c16b4..bd313c94a9 100644 --- a/shared/test/unit_test/cmd_parse/hw_parse.h +++ b/shared/test/unit_test/cmd_parse/hw_parse.h @@ -131,7 +131,7 @@ struct HardwareParse { } template - const void *getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh); + const void *getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex); template CmdType *getCommand(GenCmdList::iterator itorStart, GenCmdList::iterator itorEnd) { diff --git a/shared/test/unit_test/cmd_parse/hw_parse.inl b/shared/test/unit_test/cmd_parse/hw_parse.inl index 30b62f74ed..4577e05dce 100644 --- a/shared/test/unit_test/cmd_parse/hw_parse.inl +++ b/shared/test/unit_test/cmd_parse/hw_parse.inl @@ -94,7 +94,7 @@ void HardwareParse::findHardwareCommands() { } template -const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh) { +const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex) { typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER; typedef typename FamilyType::STATE_BASE_ADDRESS STATE_BASE_ADDRESS; @@ -115,7 +115,7 @@ const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uin offsetCrossThreadData); // Determine where the argument is - auto &patchInfo = kernel.getDefaultKernelInfo().patchInfo; + auto &patchInfo = kernel.getKernelInfo(rootDeviceIndex).patchInfo; for (auto &arg : patchInfo.statelessGlobalMemObjKernelArgs) { if (arg->ArgumentNumber == indexArg) { return ptrOffset(pCrossThreadData, arg->DataParamOffset); diff --git a/shared/test/unit_test/gen11/cmd_parse_gen11.cpp b/shared/test/unit_test/gen11/cmd_parse_gen11.cpp index 4ef4b4810d..342f83f23f 100644 --- a/shared/test/unit_test/gen11/cmd_parse_gen11.cpp +++ b/shared/test/unit_test/gen11/cmd_parse_gen11.cpp @@ -85,5 +85,5 @@ template struct CmdParse; namespace NEO { template void HardwareParse::findHardwareCommands(); template void HardwareParse::findHardwareCommands(IndirectHeap *); -template const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh); +template const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex); } // namespace NEO diff --git a/shared/test/unit_test/gen12lp/cmd_parse_gen12lp.cpp b/shared/test/unit_test/gen12lp/cmd_parse_gen12lp.cpp index 8604d5eba5..6ba523d808 100644 --- a/shared/test/unit_test/gen12lp/cmd_parse_gen12lp.cpp +++ b/shared/test/unit_test/gen12lp/cmd_parse_gen12lp.cpp @@ -99,5 +99,5 @@ template struct CmdParse; namespace NEO { template void HardwareParse::findHardwareCommands(); template void HardwareParse::findHardwareCommands(IndirectHeap *); -template const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh); +template const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex); } // namespace NEO diff --git a/shared/test/unit_test/gen8/cmd_parse_gen8.cpp b/shared/test/unit_test/gen8/cmd_parse_gen8.cpp index 36699e47f9..4ba38ccdff 100644 --- a/shared/test/unit_test/gen8/cmd_parse_gen8.cpp +++ b/shared/test/unit_test/gen8/cmd_parse_gen8.cpp @@ -67,5 +67,5 @@ template struct CmdParse; namespace NEO { template void HardwareParse::findHardwareCommands(); template void HardwareParse::findHardwareCommands(IndirectHeap *); -template const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh); +template const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex); } // namespace NEO diff --git a/shared/test/unit_test/gen9/cmd_parse_gen9.cpp b/shared/test/unit_test/gen9/cmd_parse_gen9.cpp index 22484602f8..ab5fcd51a8 100644 --- a/shared/test/unit_test/gen9/cmd_parse_gen9.cpp +++ b/shared/test/unit_test/gen9/cmd_parse_gen9.cpp @@ -85,5 +85,5 @@ template struct CmdParse; namespace NEO { template void HardwareParse::findHardwareCommands(); template void HardwareParse::findHardwareCommands(IndirectHeap *); -template const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh); +template const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex); } // namespace NEO