diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index 8d193cf980..7e90df8f3f 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -1845,7 +1845,7 @@ cl_int CL_API_CALL clSetKernelArg(cl_kernel kernel, retVal = CL_INVALID_KERNEL; break; } - if (pKernel->getDefaultKernelInfo().kernelArgInfo.size() <= argIndex) { + if (pKernel->getKernelArguments().size() <= argIndex) { retVal = CL_INVALID_ARG_INDEX; break; } diff --git a/opencl/source/helpers/dispatch_info.cpp b/opencl/source/helpers/dispatch_info.cpp index b07271c7bb..e098298bf2 100644 --- a/opencl/source/helpers/dispatch_info.cpp +++ b/opencl/source/helpers/dispatch_info.cpp @@ -11,7 +11,7 @@ namespace NEO { bool DispatchInfo::usesSlm() const { - return (kernel == nullptr) ? false : kernel->slmTotalSize > 0; + return (kernel == nullptr) ? false : kernel->getSlmTotalSize(pClDevice->getRootDeviceIndex()) > 0; } bool DispatchInfo::usesStatelessPrintfSurface() const { diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 279fa3cdca..f85d88e19c 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -180,8 +180,10 @@ size_t HardwareCommandsHelper::sendInterfaceDescriptorData( interfaceDescriptor.setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL); + auto slmTotalSize = kernel.getSlmTotalSize(rootDeviceIndex); + setGrfInfo(&interfaceDescriptor, kernel, sizeCrossThreadData, sizePerThreadData, rootDeviceIndex); - EncodeDispatchKernel::appendAdditionalIDDFields(&interfaceDescriptor, hardwareInfo, threadsPerThreadGroup, kernel.slmTotalSize, SlmPolicy::SlmPolicyNone); + EncodeDispatchKernel::appendAdditionalIDDFields(&interfaceDescriptor, hardwareInfo, threadsPerThreadGroup, slmTotalSize, SlmPolicy::SlmPolicyNone); interfaceDescriptor.setBindingTablePointer(static_cast(bindingTablePointer)); @@ -190,7 +192,7 @@ size_t HardwareCommandsHelper::sendInterfaceDescriptorData( EncodeDispatchKernel::adjustBindingTablePrefetch(interfaceDescriptor, numSamplers, bindingTablePrefetchSize); auto programmableIDSLMSize = - static_cast(HwHelperHw::get().computeSlmValues(hardwareInfo, kernel.slmTotalSize)); + static_cast(HwHelperHw::get().computeSlmValues(hardwareInfo, slmTotalSize)); interfaceDescriptor.setSharedLocalMemorySize(programmableIDSLMSize); EncodeDispatchKernel::programBarrierEnable(interfaceDescriptor, diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index d5a4e81f8e..a55010b4a9 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -166,6 +166,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate printfHandler.get()->makeResident(commandStreamReceiver); } makeTimestampPacketsResident(commandStreamReceiver); + auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex(); if (executionModelKernel) { uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1; @@ -195,7 +196,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate scheduler.makeResident(commandStreamReceiver); // Update SLM usage - slmUsed |= scheduler.slmTotalSize > 0; + slmUsed |= scheduler.getSlmTotalSize(rootDeviceIndex) > 0; this->kernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(commandStreamReceiver); } @@ -210,7 +211,6 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate commandQueue.getGpgpuCommandStreamReceiver(), bcsCsr); } - auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex(); const auto &kernelDescriptor = kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor; auto memoryCompressionState = commandStreamReceiver.getMemoryCompressionState(kernel->isAuxTranslationRequired()); diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 128ec20b96..ea279e042e 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -65,8 +65,7 @@ class Surface; uint32_t Kernel::dummyPatchLocation = 0xbaddf00d; Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, bool schedulerKernel) - : slmTotalSize(kernelInfosArg[programArg->getDevices()[0]->getRootDeviceIndex()]->workloadInfo.slmStaticSize), - isParentKernel(kernelInfosArg[programArg->getDevices()[0]->getRootDeviceIndex()]->kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue), + : isParentKernel(kernelInfosArg[programArg->getDevices()[0]->getRootDeviceIndex()]->kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue), isSchedulerKernel(schedulerKernel), executionEnvironment(programArg->getExecutionEnvironment()), program(programArg), @@ -78,7 +77,9 @@ Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, b program->retainForKernel(); imageTransformer.reset(new ImageTransformer); for (const auto &pClDevice : deviceVector) { - kernelDeviceInfos[pClDevice->getRootDeviceIndex()].maxKernelWorkGroupSize = static_cast(pClDevice->getSharedDeviceInfo().maxWorkGroupSize); + auto rootDeviceIndex = pClDevice->getRootDeviceIndex(); + kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast(pClDevice->getSharedDeviceInfo().maxWorkGroupSize); + kernelDeviceInfos[rootDeviceIndex].slmTotalSize = kernelInfosArg[rootDeviceIndex]->workloadInfo.slmStaticSize; } } @@ -100,7 +101,7 @@ Kernel::~Kernel() { } for (uint32_t i = 0; i < patchedArgumentsNum; i++) { - if (getDefaultKernelInfo().kernelArgInfo.at(i).isSampler) { + if (SAMPLER_OBJ == getKernelArguments()[i].type) { auto sampler = castToObject(kernelArguments.at(i).object); if (sampler) { sampler->decRefInternal(); @@ -372,6 +373,8 @@ cl_int Kernel::initialize() { if (program->isKernelDebugEnabled() && kernelInfo.patchInfo.pAllocateSystemThreadSurface) { debugEnabled = true; } + auto numArgs = kernelInfo.kernelArgInfo.size(); + kernelDeviceInfo.slmSizes.resize(numArgs); isDeviceInitialized.set(rootDeviceIndex); } @@ -384,13 +387,11 @@ cl_int Kernel::initialize() { auto &defaultKernelInfo = getDefaultKernelInfo(); auto numArgs = defaultKernelInfo.kernelArgInfo.size(); kernelArguments.resize(numArgs); - slmSizes.resize(numArgs); kernelArgHandlers.resize(numArgs); kernelArgRequiresCacheFlush.resize(numArgs); for (uint32_t i = 0; i < numArgs; ++i) { storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0); - slmSizes[i] = 0; // set the argument handler auto &argInfo = defaultKernelInfo.kernelArgInfo[i]; @@ -483,7 +484,6 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize, const _cl_context *ctxt; cl_uint refCount = 0; uint64_t nonCannonizedGpuAddress = 0llu; - auto defaultRootDeviceIndex = getDevices()[0]->getRootDeviceIndex(); auto &defaultKernelInfo = getKernelInfo(defaultRootDeviceIndex); switch (paramName) { @@ -1112,7 +1112,7 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local availableThreadCount, dssCount, dssCount * KB * hardwareInfo.capabilityTable.slmSize, - hwHelper.alignSlmSize(slmTotalSize), + hwHelper.alignSlmSize(kernelDeviceInfos[rootDeviceIndex].slmTotalSize), static_cast(hwHelper.getMaxBarrierRegisterPerSlice()), hwHelper.getBarriersCountFromHasBarriers(barrierCount), workDim, @@ -1280,7 +1280,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex, storeKernelArg(argIndex, SLM_OBJ, nullptr, argVal, argSize); - slmSizes[argIndex] = argSize; + kernelDeviceInfos[rootDeviceIndex].slmSizes[argIndex] = argSize; // Extract our current slmOffset auto slmOffset = *ptrOffset(crossThreadData, @@ -1291,7 +1291,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex, // Update all slm offsets after this argIndex ++argIndex; - while (argIndex < slmSizes.size()) { + while (argIndex < kernelDeviceInfos[rootDeviceIndex].slmSizes.size()) { const auto &kernelArgInfo = defaultKernelInfo.kernelArgInfo[argIndex]; auto slmAlignment = kernelArgInfo.slmAlignment; @@ -1306,11 +1306,11 @@ cl_int Kernel::setArgLocal(uint32_t argIndex, *patchLocation = slmOffset; } - slmOffset += static_cast(slmSizes[argIndex]); + slmOffset += static_cast(kernelDeviceInfos[rootDeviceIndex].slmSizes[argIndex]); ++argIndex; } - slmTotalSize = defaultKernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB); + kernelDeviceInfos[rootDeviceIndex].slmTotalSize = defaultKernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB); return CL_SUCCESS; } @@ -2679,4 +2679,7 @@ void Kernel::setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim) { uint32_t Kernel::getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const { return kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize; } +uint32_t Kernel::getSlmTotalSize(uint32_t rootDeviceIndex) const { + return kernelDeviceInfos[rootDeviceIndex].slmTotalSize; +} } // namespace NEO diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h index 878b269834..f907c094cb 100644 --- a/opencl/source/kernel/kernel.h +++ b/opencl/source/kernel/kernel.h @@ -182,7 +182,7 @@ class Kernel : public BaseObject<_cl_kernel> { } size_t getKernelArgsNumber() const { - return getDefaultKernelInfo().kernelArgInfo.size(); + return kernelArguments.size(); } bool requiresSshForBuffers(uint32_t rootDeviceIndex) const { @@ -308,11 +308,8 @@ class Kernel : public BaseObject<_cl_kernel> { static uint32_t dummyPatchLocation; - std::vector slmSizes; - uint32_t allBufferArgsStateful = CL_TRUE; - uint32_t slmTotalSize; bool isBuiltIn = false; const bool isParentKernel; const bool isSchedulerKernel; @@ -406,6 +403,7 @@ class Kernel : public BaseObject<_cl_kernel> { void setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ); void setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim); uint32_t getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const; + uint32_t getSlmTotalSize(uint32_t rootDeviceIndex) const; protected: struct ObjectCounts { @@ -574,6 +572,9 @@ class Kernel : public BaseObject<_cl_kernel> { size_t numberOfBindingTableStates = 0u; size_t localBindingTableOffset = 0u; + std::vector slmSizes; + uint32_t slmTotalSize = 0u; + std::unique_ptr pSshLocal; uint32_t sshLocalSize = 0u; char *crossThreadData = nullptr; diff --git a/opencl/source/program/kernel_info.cpp b/opencl/source/program/kernel_info.cpp index f7d7204460..77ff5798f3 100644 --- a/opencl/source/program/kernel_info.cpp +++ b/opencl/source/program/kernel_info.cpp @@ -138,7 +138,7 @@ WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) { this->maxWorkGroupSize = dispatchInfo.getKernel()->getMaxKernelWorkGroupSize(rootDeviceIndex); this->hasBarriers = kernelInfo.kernelDescriptor.kernelAttributes.usesBarriers(); this->simdSize = static_cast(kernelInfo.getMaxSimdSize()); - this->slmTotalSize = static_cast(dispatchInfo.getKernel()->slmTotalSize); + this->slmTotalSize = static_cast(dispatchInfo.getKernel()->getSlmTotalSize(rootDeviceIndex)); this->coreFamily = device.getHardwareInfo().platform.eRenderCoreFamily; this->numThreadsPerSubSlice = static_cast(device.getSharedDeviceInfo().maxNumEUsPerSubSlice) * device.getSharedDeviceInfo().numThreadsPerEU; diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp index 79e782575d..ae2ceb9126 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp @@ -380,7 +380,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenKernelWithSlmWhenPreviousSLML commandStreamReceiver->lastSentL3Config = L3Config; commandStreamReceiver->lastSentThreadArbitrationPolicy = kernel.mockKernel->getThreadArbitrationPolicy(); - ((MockKernel *)kernel)->setTotalSLMSize(1024); + ((MockKernel *)kernel)->setTotalSLMSize(rootDeviceIndex, 1024); cmdList.clear(); commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 0, nullptr, nullptr); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.inl b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.inl index 68e64e3782..4cd254d479 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.inl +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.inl @@ -43,7 +43,7 @@ void CommandStreamReceiverHwTest::givenKernelWithSlmWhenPreviousNOSLM commandStreamReceiver->isPreambleSent = true; commandStreamReceiver->lastSentL3Config = 0; - static_cast(kernel)->setTotalSLMSize(1024); + static_cast(kernel)->setTotalSLMSize(rootDeviceIndex, 1024); cmdList.clear(); commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 0, nullptr, nullptr); @@ -89,7 +89,7 @@ void CommandStreamReceiverHwTest::givenBlockedKernelWithSlmWhenPrevio commandStreamReceiver->isPreambleSent = true; commandStreamReceiver->lastSentL3Config = 0; - static_cast(kernel)->setTotalSLMSize(1024); + static_cast(kernel)->setTotalSLMSize(rootDeviceIndex, 1024); commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 1, &blockingEvent, nullptr); diff --git a/opencl/test/unit_test/helpers/dispatch_info_builder_tests.cpp b/opencl/test/unit_test/helpers/dispatch_info_builder_tests.cpp index e056a37c6f..068df312f1 100644 --- a/opencl/test/unit_test/helpers/dispatch_info_builder_tests.cpp +++ b/opencl/test/unit_test/helpers/dispatch_info_builder_tests.cpp @@ -68,7 +68,7 @@ class DispatchInfoBuilderFixture : public ContextFixture, public ClDeviceFixture pKernel->setCrossThreadData(pCrossThreadData, sizeof(pCrossThreadData)); pKernel->setKernelArgHandler(0, &Kernel::setArgBuffer); - pKernel->slmTotalSize = 128; + pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize = 128; pKernel->isBuiltIn = true; } diff --git a/opencl/test/unit_test/helpers/dispatch_info_tests.cpp b/opencl/test/unit_test/helpers/dispatch_info_tests.cpp index 1ab386f2ae..534de50e99 100644 --- a/opencl/test/unit_test/helpers/dispatch_info_tests.cpp +++ b/opencl/test/unit_test/helpers/dispatch_info_tests.cpp @@ -40,7 +40,7 @@ class DispatchInfoFixture : public ContextFixture, public ClDeviceFixture { pProgram = new MockProgram(pContext, false, toClDeviceVector(*pClDevice)); pKernel = new MockKernel(pProgram, MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex)); - pKernel->slmTotalSize = 128; + pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize = 128; } void TearDown() override { delete pKernel; @@ -56,7 +56,7 @@ class DispatchInfoFixture : public ContextFixture, public ClDeviceFixture { SPatchMediaVFEState *pMediaVFEstate = nullptr; SPatchAllocateStatelessPrintfSurface *pPrintfSurface = nullptr; MockProgram *pProgram = nullptr; - Kernel *pKernel = nullptr; + MockKernel *pKernel = nullptr; }; typedef Test DispatchInfoTest; diff --git a/opencl/test/unit_test/kernel/clone_kernel_tests.cpp b/opencl/test/unit_test/kernel/clone_kernel_tests.cpp index 897a5ee6a0..f524342c6d 100644 --- a/opencl/test/unit_test/kernel/clone_kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/clone_kernel_tests.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/memory_manager/unified_memory_manager.h" +#include "shared/test/unit_test/utilities/base_object_utils.h" #include "opencl/source/accelerators/intel_accelerator.h" #include "opencl/source/accelerators/intel_motion_estimation.h" @@ -162,7 +163,7 @@ TEST_F(CloneKernelTest, GivenArgLocalWhenCloningKernelThenKernelInfoIsCorrect) { EXPECT_EQ(pSourceKernel->getPatchedArgumentsNum(), pClonedKernel->getPatchedArgumentsNum()); EXPECT_EQ(pSourceKernel->getKernelArgInfo(0).isPatched, pClonedKernel->getKernelArgInfo(0).isPatched); - EXPECT_EQ(alignUp(slmSize, 1024), pClonedKernel->slmTotalSize); + EXPECT_EQ(alignUp(slmSize, 1024), pClonedKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize); } TEST_F(CloneKernelTest, GivenArgBufferWhenCloningKernelThenKernelInfoIsCorrect) { @@ -336,10 +337,10 @@ TEST_F(CloneKernelTest, GivenArgAcceleratorWhenCloningKernelThenKernelInfoIsCorr } TEST_F(CloneKernelTest, GivenArgSamplerWhenCloningKernelThenKernelInfoIsCorrect) { - std::unique_ptr sampler(new MockSampler(pContext, - true, - (cl_addressing_mode)CL_ADDRESS_MIRRORED_REPEAT, - (cl_filter_mode)CL_FILTER_NEAREST)); + auto sampler = clUniquePtr(new MockSampler(pContext, + true, + (cl_addressing_mode)CL_ADDRESS_MIRRORED_REPEAT, + (cl_filter_mode)CL_FILTER_NEAREST)); uint32_t objectId = SAMPLER_OBJECT_ID_SHIFT + pKernelInfo->kernelArgInfo[0].offsetHeap; @@ -381,6 +382,8 @@ TEST_F(CloneKernelTest, GivenArgSamplerWhenCloningKernelThenKernelInfoIsCorrect) auto pNormalizedCoords = ptrOffset(crossThreadData, argInfo.offsetSamplerNormalizedCoords); EXPECT_EQ(GetNormCoordsEnum(sampler->normalizedCoordinates), *pNormalizedCoords); + + EXPECT_EQ(3, sampler->getRefInternalCount()); } HWCMDTEST_F(IGFX_GEN8_CORE, CloneKernelTest, GivenArgDeviceQueueWhenCloningKernelThenKernelInfoIsCorrect) { diff --git a/opencl/test/unit_test/kernel/kernel_slm_arg_tests.cpp b/opencl/test/unit_test/kernel/kernel_slm_arg_tests.cpp index 1b646b6b7e..ffd645bb6f 100644 --- a/opencl/test/unit_test/kernel/kernel_slm_arg_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_slm_arg_tests.cpp @@ -83,7 +83,7 @@ TEST_F(KernelSlmArgTest, WhenSettingSizeThenAlignmentOfHigherSlmArgsIsUpdated) { slmOffset = ptrOffset(crossThreadData, 0x30); EXPECT_EQ(0x400u, *slmOffset); - EXPECT_EQ(5 * KB, pKernel->slmTotalSize); + EXPECT_EQ(5 * KB, pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize); } TEST_F(KernelSlmArgTest, GivenReverseOrderWhenSettingSizeThenAlignmentOfHigherSlmArgsIsUpdated) { @@ -100,5 +100,5 @@ TEST_F(KernelSlmArgTest, GivenReverseOrderWhenSettingSizeThenAlignmentOfHigherSl slmOffset = ptrOffset(crossThreadData, 0x30); EXPECT_EQ(0x400u, *slmOffset); - EXPECT_EQ(5 * KB, pKernel->slmTotalSize); + EXPECT_EQ(5 * KB, pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize); } diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index 6a9fdf7a8b..944c60946e 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -2288,7 +2288,7 @@ TEST_F(KernelCrossThreadTests, GivenSlmStatisSizeWhenCreatingKernelThenSlmTotalS MockKernel *kernel = new MockKernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex)); - EXPECT_EQ(1024u, kernel->slmTotalSize); + EXPECT_EQ(1024u, kernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize); delete kernel; } diff --git a/opencl/test/unit_test/mocks/mock_kernel.h b/opencl/test/unit_test/mocks/mock_kernel.h index c108c5e766..4030626b20 100644 --- a/opencl/test/unit_test/mocks/mock_kernel.h +++ b/opencl/test/unit_test/mocks/mock_kernel.h @@ -211,8 +211,8 @@ class MockKernel : public Kernel { } } - void setTotalSLMSize(uint32_t size) { - slmTotalSize = size; + void setTotalSLMSize(uint32_t rootDeviceIndex, uint32_t size) { + kernelDeviceInfos[rootDeviceIndex].slmTotalSize = size; } void setKernelArguments(std::vector kernelArguments) {