diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 95355589d9..578d00d180 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -266,6 +266,18 @@ size_t HardwareCommandsHelper::sendIndirectState( if (EncodeSurfaceState::doBindingTablePrefetch()) { bindingTablePrefetchSize = std::min(31u, static_cast(kernel.getNumberOfBindingTableStates())); } + + const bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernel.getKernelInfo().kernelDescriptor); + if (isBindlessKernel) { + uint64_t bindlessSurfaceStateBaseOffset = ptrDiff(ssh.getSpace(0), ssh.getCpuBase()); + + auto sshHeapSize = kernel.getSurfaceStateHeapSize(); + // Allocate space for new ssh data + auto dstSurfaceState = ssh.getSpace(sshHeapSize); + memcpy_s(dstSurfaceState, sshHeapSize, kernel.getSurfaceStateHeap(), sshHeapSize); + + kernel.patchBindlessOffsetsInCrossThreadData(bindlessSurfaceStateBaseOffset); + } } auto &gfxCoreHelper = device.getGfxCoreHelper(); diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index ba7f775f47..7b0e451229 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -140,12 +140,41 @@ void Kernel::patchWithImplicitSurface(uint64_t ptrToPatchInCrossThreadData, Grap } void *ssh = getSurfaceStateHeap(); - if ((nullptr != ssh) && isValidOffset(arg.bindful)) { - auto surfaceState = ptrOffset(ssh, arg.bindful); + if (nullptr != ssh) { void *addressToPatch = reinterpret_cast(allocation.getGpuAddressToPatch()); size_t sizeToPatch = allocation.getUnderlyingBufferSize(); - Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0, - areMultipleSubDevicesInContext()); + + if (isValidOffset(arg.bindful)) { + auto surfaceState = ptrOffset(ssh, arg.bindful); + 
Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0, + areMultipleSubDevicesInContext()); + } else if (isValidOffset(arg.bindless)) { + auto &gfxCoreHelper = clDevice.getDevice().getGfxCoreHelper(); + void *surfaceState = nullptr; + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + if (clDevice.getDevice().getBindlessHeapsHelper()) { + auto ssInHeap = allocation.getBindlessInfo(); + surfaceState = ssInHeap.ssPtr; + auto patchLocation = ptrOffset(crossThreadData, arg.bindless); + auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(ssInHeap.surfaceStateOffset)); + patchWithRequiredSize(reinterpret_cast(patchLocation), sizeof(patchValue), patchValue); + } else { + auto index = std::numeric_limits::max(); + const auto &iter = kernelInfo.kernelDescriptor.getBindlessOffsetToSurfaceState().find(arg.bindless); + if (iter != kernelInfo.kernelDescriptor.getBindlessOffsetToSurfaceState().end()) { + index = iter->second; + } + if (index < std::numeric_limits::max()) { + surfaceState = ptrOffset(ssh, index * surfaceStateSize); + } + } + + if (surfaceState) { + Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0, + areMultipleSubDevicesInContext()); + } + } } } @@ -223,7 +252,13 @@ cl_int Kernel::initialize() { // copy the ssh into our local copy memcpy_s(pSshLocal.get(), sshLocalSize, heapInfo.pSsh, heapInfo.surfaceStateHeapSize); + } else if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor)) { + auto surfaceStateSize = static_cast(gfxCoreHelper.getRenderSurfaceStateSize()); + sshLocalSize = kernelDescriptor.kernelAttributes.numArgsStateful * surfaceStateSize; + DEBUG_BREAK_IF(kernelDescriptor.kernelAttributes.numArgsStateful != kernelDescriptor.getBindlessOffsetToSurfaceState().size()); + pSshLocal = std::make_unique(sshLocalSize); } + 
numberOfBindingTableStates = kernelDescriptor.payloadMappings.bindingTable.numEntries; localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset; @@ -233,7 +268,8 @@ cl_int Kernel::initialize() { return status; } - if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) { + if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless) || + isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.bindless)) { DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr); uint64_t constMemory = isBuiltIn ? castToUint64(program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer()) : program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch(); @@ -241,7 +277,8 @@ cl_int Kernel::initialize() { patchWithImplicitSurface(constMemory, *program->getConstantSurface(rootDeviceIndex), arg); } - if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) { + if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless) || + isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless)) { DEBUG_BREAK_IF(program->getGlobalSurface(rootDeviceIndex) == nullptr); uint64_t globalMemory = isBuiltIn ? 
castToUint64(program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer()) : program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch(); @@ -932,6 +969,16 @@ cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, G auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful); Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0, areMultipleSubDevicesInContext()); + } else if (isValidOffset(argAsPtr.bindless)) { + auto &gfxCoreHelper = this->getGfxCoreHelper(); + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + auto ssIndex = getSurfaceStateIndexForBindlessOffset(argAsPtr.bindless); + if (ssIndex < std::numeric_limits::max()) { + auto surfaceState = ptrOffset(getSurfaceStateHeap(), ssIndex * surfaceStateSize); + Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0, + areMultipleSubDevicesInContext()); + } } storeKernelArg(argIndex, SVM_OBJ, nullptr, svmPtr, sizeof(void *), svmAlloc, svmFlags); @@ -987,6 +1034,24 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio } Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, forceNonAuxMode, disableL3, allocSize, ptrToPatch, offset, svmAlloc, 0, 0, areMultipleSubDevicesInContext()); + } else if (isValidOffset(argAsPtr.bindless)) { + size_t allocSize = 0; + size_t offset = 0; + if (svmAlloc != nullptr) { + allocSize = svmAlloc->getUnderlyingBufferSize(); + offset = ptrDiff(ptrToPatch, svmAlloc->getGpuAddressToPatch()); + allocSize -= offset; + } + + auto &gfxCoreHelper = this->getGfxCoreHelper(); + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + auto ssIndex = getSurfaceStateIndexForBindlessOffset(argAsPtr.bindless); + if (ssIndex < std::numeric_limits::max()) { + auto surfaceState = 
ptrOffset(getSurfaceStateHeap(), ssIndex * surfaceStateSize); + Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, forceNonAuxMode, disableL3, allocSize, ptrToPatch, offset, svmAlloc, 0, 0, + areMultipleSubDevicesInContext()); + } } storeKernelArg(argIndex, SVM_ALLOC_OBJ, svmAlloc, svmPtr, sizeof(uintptr_t)); @@ -1297,10 +1362,20 @@ void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) { if (program->getConstantSurface(rootDeviceIndex)) { commandStreamReceiver.makeResident(*(program->getConstantSurface(rootDeviceIndex))); + + auto bindlessHeapAllocation = program->getConstantSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation; + if (bindlessHeapAllocation) { + commandStreamReceiver.makeResident(*bindlessHeapAllocation); + } } if (program->getGlobalSurface(rootDeviceIndex)) { commandStreamReceiver.makeResident(*(program->getGlobalSurface(rootDeviceIndex))); + + auto bindlessHeapAllocation = program->getGlobalSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation; + if (bindlessHeapAllocation) { + commandStreamReceiver.makeResident(*bindlessHeapAllocation); + } } if (program->getExportedFunctionsSurface(rootDeviceIndex)) { @@ -1510,11 +1585,14 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex, } else if (isValidOffset(argAsPtr.bindless)) { auto &gfxCoreHelper = this->getGfxCoreHelper(); auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); - auto surfaceState = ptrOffset(getSurfaceStateHeap(), surfaceStateSize * argIndex); - buffer->setArgStateful(surfaceState, forceNonAuxMode, - disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(), - areMultipleSubDevicesInContext()); + auto ssIndex = getSurfaceStateIndexForBindlessOffset(argAsPtr.bindless); + if (ssIndex < std::numeric_limits::max()) { + auto surfaceState = ptrOffset(getSurfaceStateHeap(), ssIndex * surfaceStateSize); + buffer->setArgStateful(surfaceState, forceNonAuxMode, + disableL3, isAuxTranslationKernel, arg.isReadOnly(), 
pClDevice->getDevice(), + areMultipleSubDevicesInContext()); + } } kernelArguments[argIndex].isStatelessUncacheable = argAsPtr.isPureStateful() ? false : buffer->isMemObjUncacheable(); @@ -2080,6 +2158,68 @@ void *Kernel::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t return ssInHeap.ssPtr; } +uint32_t Kernel::getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const { + const auto &iter = kernelInfo.kernelDescriptor.getBindlessOffsetToSurfaceState().find(bindlessOffset); + if (iter != kernelInfo.kernelDescriptor.getBindlessOffsetToSurfaceState().end()) { + return iter->second; + } + DEBUG_BREAK_IF(true); + return std::numeric_limits::max(); +} + +void Kernel::patchBindlessOffsetsForImplicitArgs(uint64_t bindlessSurfaceStateBaseOffset) const { + auto implicitArgsVec = kernelInfo.kernelDescriptor.getImplicitArgBindlessCandidatesVec(); + + auto &gfxCoreHelper = this->getGfxCoreHelper(); + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + for (size_t i = 0; i < implicitArgsVec.size(); i++) { + if (NEO::isValidOffset(implicitArgsVec[i]->bindless)) { + auto patchLocation = ptrOffset(getCrossThreadData(), implicitArgsVec[i]->bindless); + auto index = getSurfaceStateIndexForBindlessOffset(implicitArgsVec[i]->bindless); + + if (index < std::numeric_limits::max()) { + auto surfaceStateOffset = static_cast(bindlessSurfaceStateBaseOffset + index * surfaceStateSize); + auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(surfaceStateOffset)); + + patchWithRequiredSize(reinterpret_cast(patchLocation), sizeof(patchValue), patchValue); + } + } + } +} + +void Kernel::patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const { + auto &gfxCoreHelper = this->getGfxCoreHelper(); + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + for (size_t argIndex = 0; argIndex < kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size(); 
argIndex++) { + const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex]; + + auto crossThreadOffset = NEO::undefined; + if (arg.type == NEO::ArgDescriptor::argTPointer) { + crossThreadOffset = arg.as().bindless; + } else if (arg.type == NEO::ArgDescriptor::argTImage) { + crossThreadOffset = arg.as().bindless; + } else { + continue; + } + + if (NEO::isValidOffset(crossThreadOffset)) { + auto patchLocation = ptrOffset(getCrossThreadData(), crossThreadOffset); + auto index = getSurfaceStateIndexForBindlessOffset(crossThreadOffset); + + if (index < std::numeric_limits::max()) { + auto surfaceStateOffset = static_cast(bindlessSurfaceStateBaseOffset + index * surfaceStateSize); + auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(surfaceStateOffset)); + + patchWithRequiredSize(reinterpret_cast(patchLocation), sizeof(patchValue), patchValue); + } + } + } + + patchBindlessOffsetsForImplicitArgs(bindlessSurfaceStateBaseOffset); +} + void Kernel::setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo) { this->additionalKernelExecInfo = additionalKernelExecInfo; } diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h index d4ff8c1e25..4e5f19ce58 100644 --- a/opencl/source/kernel/kernel.h +++ b/opencl/source/kernel/kernel.h @@ -218,6 +218,9 @@ class Kernel : public ReferenceTrackedObject { bool usesSyncBuffer() const; void patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset); void *patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless); + uint32_t getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const; + void patchBindlessOffsetsForImplicitArgs(uint64_t bindlessSurfaceStateBaseOffset) const; + void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const; // Helpers cl_int setArg(uint32_t argIndex, uint32_t argValue); diff --git a/opencl/source/program/process_device_binary.cpp 
b/opencl/source/program/process_device_binary.cpp index 7ca04c8d71..7ec9547127 100644 --- a/opencl/source/program/process_device_binary.cpp +++ b/opencl/source/program/process_device_binary.cpp @@ -265,16 +265,35 @@ cl_int Program::processProgramInfo(ProgramInfo &src, const ClDevice &clDevice) { } kernelInfoArray = std::move(src.kernelInfos); + + bool isBindlessKernelPresent = false; + for (auto &kernelInfo : kernelInfoArray) { + if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelInfo->kernelDescriptor)) { + isBindlessKernelPresent = true; + break; + } + } + auto svmAllocsManager = context ? context->getSVMAllocsManager() : nullptr; auto globalConstDataSize = src.globalConstants.size + src.globalConstants.zeroInitSize; if (globalConstDataSize != 0) { buildInfos[rootDeviceIndex].constantSurface = allocateGlobalsSurface(svmAllocsManager, clDevice.getDevice(), globalConstDataSize, src.globalConstants.zeroInitSize, true, linkerInput, src.globalConstants.initData); + if (isBindlessKernelPresent) { + if (!clDevice.getMemoryManager()->allocateBindlessSlot(buildInfos[rootDeviceIndex].constantSurface)) { + return CL_OUT_OF_HOST_MEMORY; + } + } } auto globalVariablesDataSize = src.globalVariables.size + src.globalVariables.zeroInitSize; buildInfos[rootDeviceIndex].globalVarTotalSize = globalVariablesDataSize; if (globalVariablesDataSize != 0) { buildInfos[rootDeviceIndex].globalSurface = allocateGlobalsSurface(svmAllocsManager, clDevice.getDevice(), globalVariablesDataSize, src.globalVariables.zeroInitSize, false, linkerInput, src.globalVariables.initData); + if (isBindlessKernelPresent) { + if (!clDevice.getMemoryManager()->allocateBindlessSlot(buildInfos[rootDeviceIndex].globalSurface)) { + return CL_OUT_OF_HOST_MEMORY; + } + } if (clDevice.areOcl21FeaturesEnabled() == false) { buildInfos[rootDeviceIndex].globalVarTotalSize = 0u; } diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp 
b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp index 61785704cc..34c6ae9866 100644 --- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp @@ -1080,6 +1080,82 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd delete[] mockDsh; } +HWTEST2_F(HardwareCommandsTest, givenBindlessKernelWithBufferArgWhenSendIndirectStateThenSurfaceStateIsCopiedToHeapAndCrossThreadDataIsCorrectlyPatched, IsAtLeastXeHpCore) { + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using DefaultWalkerType = typename FamilyType::DefaultWalkerType; + + CommandQueueHw cmdQ(pContext, pClDevice, 0, false); + + auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(DefaultWalkerType))); + + // define kernel info + std::unique_ptr pKernelInfo = std::make_unique(); + pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1; + pKernelInfo->addArgBuffer(0, 0x30, sizeof(void *), 0x0); + pKernelInfo->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::AddressingMode::BindlessAndStateless; + + const auto bindlessOffset = 0x10; + pKernelInfo->argAsPtr(0).bindless = bindlessOffset; + pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState(); + + pKernelInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize = 1024; + + MockKernel mockKernel(mockKernelWithInternal->mockKernel->getProgram(), *pKernelInfo, *pClDevice); + + auto retVal = mockKernel.initialize(); + EXPECT_EQ(0, retVal); + + memset(mockKernel.getSurfaceStateHeap(), 0x22, mockKernel.getSurfaceStateHeapSize()); + memset(mockKernel.getCrossThreadData(), 0x00, mockKernel.getCrossThreadDataSize()); + + auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::Type::dynamicState, 8192); + auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::Type::indirectObject, 8192); + auto &ssh = 
cmdQ.getIndirectHeap(IndirectHeap::Type::surfaceState, 8192); + + const auto expectedDestinationInHeap = ssh.getSpace(0); + const uint64_t bindlessSurfaceStateBaseOffset = ptrDiff(ssh.getSpace(0), ssh.getCpuBase()); + + const size_t localWorkSize = 256; + const size_t localWorkSizes[3]{localWorkSize, 1, 1}; + const uint32_t threadGroupCount = 1u; + uint32_t interfaceDescriptorIndex = 0; + auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType()); + auto kernelUsesLocalIds = HardwareCommandsHelper::kernelUsesLocalIds(mockKernel); + + INTERFACE_DESCRIPTOR_DATA interfaceDescriptorData; + HardwareCommandsHelper::template sendIndirectState( + commandStream, + dsh, + ioh, + ssh, + mockKernel, + mockKernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false), + pKernelInfo->getMaxSimdSize(), + localWorkSizes, + threadGroupCount, + 0, + interfaceDescriptorIndex, + pDevice->getPreemptionMode(), + pWalkerCmd, + &interfaceDescriptorData, + true, + 0, + *pDevice); + + EXPECT_EQ(0, std::memcmp(expectedDestinationInHeap, mockKernel.getSurfaceStateHeap(), mockKernel.getSurfaceStateHeapSize())); + + const auto &gfxCoreHelper = mockKernel.getGfxCoreHelper(); + const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + const auto ssIndex = pKernelInfo->kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second; + const auto surfaceStateOffset = static_cast(bindlessSurfaceStateBaseOffset + ssIndex * surfaceStateSize); + const auto expectedPatchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(surfaceStateOffset)); + const auto expectedPatchLocation = reinterpret_cast(ptrOffset(mockKernel.getCrossThreadData(), bindlessOffset)); + + EXPECT_EQ(expectedPatchValue, *expectedPatchLocation); +} + HWTEST_F(HardwareCommandsTest, whenNumLocalIdsIsBiggerThanZeroThenExpectLocalIdsInUseIsTrue) { mockKernelWithInternal->kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels = 1; 
EXPECT_TRUE(HardwareCommandsHelper::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel)); diff --git a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp index 52dca7c005..2c71d735e1 100644 --- a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp @@ -651,6 +651,8 @@ class KernelArgBufferFixtureBindless : public KernelArgBufferFixture { pKernelInfo->argAsPtr(0).bindless = bindlessOffset; pKernelInfo->argAsPtr(0).stateless = undefined; pKernelInfo->argAsPtr(0).bindful = undefined; + + pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState(); } void tearDown() { delete pBuffer; @@ -674,6 +676,46 @@ HWTEST_F(KernelArgBufferTestBindless, givenUsedBindlessBuffersWhenSettingKernelA EXPECT_EQ(0xdeadu, *patchLocation); } +HWTEST_F(KernelArgBufferTestBindless, givenBindlessArgBufferWhenSettingKernelArgThenSurfaceStateIsEncodedAtProperOffset) { + using DataPortBindlessSurfaceExtendedMessageDescriptor = typename FamilyType::DataPortBindlessSurfaceExtendedMessageDescriptor; + + const auto &gfxCoreHelper = pKernel->getGfxCoreHelper(); + const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + const auto surfaceStateHeapSize = pKernel->getSurfaceStateHeapSize(); + + EXPECT_EQ(pKernelInfo->kernelDescriptor.kernelAttributes.numArgsStateful * surfaceStateSize, surfaceStateHeapSize); + + cl_mem memObj = pBuffer; + retVal = pKernel->setArg(0, sizeof(memObj), &memObj); + + const auto ssIndex = pKernelInfo->kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second; + const auto ssOffset = ssIndex * surfaceStateSize; + + typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; + const auto surfaceState = reinterpret_cast(ptrOffset(pKernel->getSurfaceStateHeap(), ssOffset)); + const auto surfaceAddress = surfaceState->getSurfaceBaseAddress(); + + const auto bufferAddress = 
pBuffer->getGraphicsAllocation(pDevice->getRootDeviceIndex())->getGpuAddress(); + EXPECT_EQ(bufferAddress, surfaceAddress); +} + +HWTEST_F(KernelArgBufferTestBindless, givenBindlessArgBufferAndNotInitializedBindlessOffsetToSurfaceStateWhenSettingKernelArgThenSurfaceStateIsNotEncoded) { + using DataPortBindlessSurfaceExtendedMessageDescriptor = typename FamilyType::DataPortBindlessSurfaceExtendedMessageDescriptor; + + const auto surfaceStateHeap = pKernel->getSurfaceStateHeap(); + const auto surfaceStateHeapSize = pKernel->getSurfaceStateHeapSize(); + + auto ssHeapDataInitial = std::make_unique(surfaceStateHeapSize); + std::memcpy(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize); + + pKernelInfo->kernelDescriptor.bindlessArgsMap.clear(); + + cl_mem memObj = pBuffer; + retVal = pKernel->setArg(0, sizeof(memObj), &memObj); + + EXPECT_EQ(0, std::memcmp(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize)); +} + HWTEST_F(KernelArgBufferTestBindless, givenBindlessBuffersWhenPatchBindlessOffsetCalledThenBindlessOffsetToSurfaceStateWrittenInCrossThreadData) { pClDevice->getExecutionEnvironment()->rootDeviceEnvironments[pClDevice->getRootDeviceIndex()]->createBindlessHeapsHelper(pDevice, diff --git a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp index a1b266e582..ebf6d7ab49 100644 --- a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/helpers/aligned_memory.h" +#include "shared/source/helpers/gfx_core_helper.h" #include "shared/test/common/test_macros/hw_test.h" #include "opencl/source/kernel/kernel.h" @@ -116,6 +117,61 @@ HWTEST_F(KernelArgSvmTest, GivenSvmPtrStatefulWhenSettingKernelArgThenArgumentsA delete[] svmPtr; } +HWTEST_F(KernelArgSvmTest, GivenSvmPtrBindlessWhenSettingKernelArgThenArgumentsAreSetCorrectly) { + const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo(); 
+ if (devInfo.svmCapabilities == 0) { + GTEST_SKIP(); + } + auto svmPtr = std::make_unique(256); + + const auto &gfxCoreHelper = pKernel->getGfxCoreHelper(); + const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + const auto bindlessOffset = 0x10; + pKernelInfo->argAsPtr(0).bindless = bindlessOffset; + pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState(); + + auto retVal = pKernel->setArgSvm(0, 256, svmPtr.get(), nullptr, 0u); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize()); + + const auto ssIndex = pKernelInfo->kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second; + const auto ssOffset = ssIndex * surfaceStateSize; + + typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; + auto surfaceState = reinterpret_cast( + ptrOffset(pKernel->getSurfaceStateHeap(), + ssOffset)); + + void *surfaceAddress = reinterpret_cast(surfaceState->getSurfaceBaseAddress()); + EXPECT_EQ(svmPtr.get(), surfaceAddress); +} + +HWTEST_F(KernelArgSvmTest, GivenSvmPtrBindlessAndNotInitializedBindlessOffsetToSurfaceStateWhenSettingKernelArgThenSurfaceStateIsNotEncoded) { + const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo(); + if (devInfo.svmCapabilities == 0) { + GTEST_SKIP(); + } + auto svmPtr = std::make_unique(256); + + const auto surfaceStateHeap = pKernel->getSurfaceStateHeap(); + const auto surfaceStateHeapSize = pKernel->getSurfaceStateHeapSize(); + + const auto bindlessOffset = 0x10; + pKernelInfo->argAsPtr(0).bindless = bindlessOffset; + + auto ssHeapDataInitial = std::make_unique(surfaceStateHeapSize); + std::memcpy(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize); + + pKernelInfo->kernelDescriptor.bindlessArgsMap.clear(); + + auto retVal = pKernel->setArgSvm(0, 256, svmPtr.get(), nullptr, 0u); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_EQ(0, std::memcmp(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize)); +} + TEST_F(KernelArgSvmTest, 
GivenValidSvmAllocWhenSettingKernelArgThenArgumentsAreSetCorrectly) { const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo(); if (devInfo.svmCapabilities == 0) { @@ -221,6 +277,100 @@ HWTEST_F(KernelArgSvmTest, givenOffsetedSvmPointerWhenSetArgSvmAllocIsCalledThen EXPECT_EQ(offsetedPtr, surfaceAddress); } +HWTEST_F(KernelArgSvmTest, GivenValidSvmAllocBindlessWhenSettingKernelArgThenArgumentsAreSetCorrectly) { + const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo(); + if (devInfo.svmCapabilities == 0) { + GTEST_SKIP(); + } + + const auto &gfxCoreHelper = pKernel->getGfxCoreHelper(); + const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + auto svmPtr = std::make_unique(256); + + MockGraphicsAllocation svmAlloc(svmPtr.get(), 256); + + const auto bindlessOffset = 0x10; + pKernelInfo->argAsPtr(0).bindless = bindlessOffset; + pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState(); + + auto retVal = pKernel->setArgSvmAlloc(0, svmPtr.get(), &svmAlloc, 0u); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize()); + + const auto ssIndex = pKernelInfo->kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second; + const auto ssOffset = ssIndex * surfaceStateSize; + + typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; + auto surfaceState = reinterpret_cast( + ptrOffset(pKernel->getSurfaceStateHeap(), + ssOffset)); + + void *surfaceAddress = reinterpret_cast(surfaceState->getSurfaceBaseAddress()); + EXPECT_EQ(svmPtr.get(), surfaceAddress); +} + +HWTEST_F(KernelArgSvmTest, givenOffsetedSvmPointerBindlessWhenSetArgSvmAllocIsCalledThenProperSvmAddressIsPatched) { + const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo(); + if (devInfo.svmCapabilities == 0) { + GTEST_SKIP(); + } + + const auto &gfxCoreHelper = pKernel->getGfxCoreHelper(); + const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + std::unique_ptr svmPtr(new char[256]); + + auto offsetedPtr = svmPtr.get() 
+ 4; + + MockGraphicsAllocation svmAlloc(svmPtr.get(), 256); + + const auto bindlessOffset = 0x10; + pKernelInfo->argAsPtr(0).bindless = bindlessOffset; + pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState(); + + pKernel->setArgSvmAlloc(0, offsetedPtr, &svmAlloc, 0u); + + const auto ssIndex = pKernelInfo->kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second; + const auto ssOffset = ssIndex * surfaceStateSize; + + typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; + auto surfaceState = reinterpret_cast( + ptrOffset(pKernel->getSurfaceStateHeap(), + ssOffset)); + + void *surfaceAddress = reinterpret_cast(surfaceState->getSurfaceBaseAddress()); + EXPECT_EQ(offsetedPtr, surfaceAddress); +} + +HWTEST_F(KernelArgSvmTest, GivenValidSvmAllocBindlessAndNotInitializedBindlessOffsetToSurfaceStateWhenSettingKernelArgThenSurfaceStateIsNotEncoded) { + const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo(); + if (devInfo.svmCapabilities == 0) { + GTEST_SKIP(); + } + + const auto surfaceStateHeap = pKernel->getSurfaceStateHeap(); + const auto surfaceStateHeapSize = pKernel->getSurfaceStateHeapSize(); + + auto svmPtr = std::make_unique(256); + + MockGraphicsAllocation svmAlloc(svmPtr.get(), 256); + + const auto bindlessOffset = 0x10; + pKernelInfo->argAsPtr(0).bindless = bindlessOffset; + + auto ssHeapDataInitial = std::make_unique(surfaceStateHeapSize); + std::memcpy(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize); + + pKernelInfo->kernelDescriptor.bindlessArgsMap.clear(); + + auto retVal = pKernel->setArgSvmAlloc(0, svmPtr.get(), &svmAlloc, 0u); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_EQ(0, std::memcmp(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize)); +} + HWTEST_F(KernelArgSvmTest, givenDeviceSupportingSharedSystemAllocationsWhenSetArgSvmIsCalledWithSurfaceStateThenSizeIsMaxAndAddressIsProgrammed) { const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo(); if (devInfo.svmCapabilities == 0) { @@ 
-246,6 +396,42 @@ HWTEST_F(KernelArgSvmTest, givenDeviceSupportingSharedSystemAllocationsWhenSetAr EXPECT_EQ(16384u, surfaceState->getHeight()); } +HWTEST_F(KernelArgSvmTest, givenBindlessArgAndDeviceSupportingSharedSystemAllocationsWhenSetArgSvmIsCalledWithSurfaceStateThenSizeIsMaxAndAddressIsProgrammed) { + const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo(); + if (devInfo.svmCapabilities == 0) { + GTEST_SKIP(); + } + + const auto &gfxCoreHelper = pKernel->getGfxCoreHelper(); + const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + this->pClDevice->deviceInfo.sharedSystemMemCapabilities = CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL | CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL | CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL | CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL; + + auto systemPointer = reinterpret_cast(0xfeedbac); + + const auto bindlessOffset = 0x10; + pKernelInfo->argAsPtr(0).bindless = bindlessOffset; + pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState(); + + pKernel->setArgSvmAlloc(0, systemPointer, nullptr, 0u); + + typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; + const auto ssIndex = pKernelInfo->kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second; + const auto ssOffset = ssIndex * surfaceStateSize; + + typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; + auto surfaceState = reinterpret_cast( + ptrOffset(pKernel->getSurfaceStateHeap(), + ssOffset)); + + void *surfaceAddress = reinterpret_cast(surfaceState->getSurfaceBaseAddress()); + + EXPECT_EQ(systemPointer, surfaceAddress); + EXPECT_EQ(128u, surfaceState->getWidth()); + EXPECT_EQ(2048u, surfaceState->getDepth()); + EXPECT_EQ(16384u, surfaceState->getHeight()); +} + TEST_F(KernelArgSvmTest, WhenSettingKernelArgImmediateThenInvalidArgValueErrorIsReturned) { const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo(); if (devInfo.svmCapabilities == 0) { diff --git 
a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index e05ea51b16..1e8c629808 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -24,6 +24,7 @@ #include "shared/test/common/helpers/gtest_helpers.h" #include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_allocation_properties.h" +#include "shared/test/common/mocks/mock_bindless_heaps_helper.h" #include "shared/test/common/mocks/mock_cpu_page_fault_manager.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/mocks/mock_memory_manager.h" @@ -395,6 +396,137 @@ TEST_F(KernelTests, WhenIsSingleSubdevicePreferredIsCalledThenCorrectValuesAreRe } } +using BindlessKernelTests = KernelTests; + +TEST_F(BindlessKernelTests, GivenBindlessAddressingKernelWhenInitializeThenSurfaceStateIsCreatedWithCorrectSize) { /* Initializes a kernel flagged Bindless with numArgsStateful = 3 and expects a non-null surface state heap whose size is numArgsStateful times getRenderSurfaceStateSize(). */ + KernelInfo kernelInfo = {}; + kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 32; + kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::Bindless; + kernelInfo.kernelDescriptor.kernelAttributes.numArgsStateful = 3; + + MockKernel kernel(pProgram, kernelInfo, *pClDevice); + + auto retVal = kernel.initialize(); + EXPECT_EQ(CL_SUCCESS, retVal); + + const auto &gfxCoreHelper = pClDevice->getGfxCoreHelper(); + const auto surfaceStateSize = static_cast(gfxCoreHelper.getRenderSurfaceStateSize()); + const auto expectedSsHeapSize = kernelInfo.kernelDescriptor.kernelAttributes.numArgsStateful * surfaceStateSize; + + const auto ssHeap = kernel.getSurfaceStateHeap(); + const auto ssHeapSize = kernel.getSurfaceStateHeapSize(); + + EXPECT_EQ(expectedSsHeapSize, ssHeapSize); + EXPECT_NE(nullptr, ssHeap); +} + +TEST_F(BindlessKernelTests, givenBindlessKernelWhenPatchingCrossThreadDataThenCorrectBindlessOffsetsAreWritten) { /* Registers three explicit args (a bindless pointer at cross-thread offset 0, a bindless image at sizeof(uint64_t), a stateless-only pointer at 2 x sizeof(uint64_t)) and bindless implicit global-variables / global-constants surfaces at 3 x and 4 x sizeof(uint64_t). After patchBindlessOffsetsInCrossThreadData(baseAddress) it expects 64-bit slots 0, 1, 3 and 4 to hold the descriptor values for baseAddress advanced by 0, 1, 2 and 3 surface-state sizes, and slot 2 (the stateless arg) to stay zero. NOTE(review): the local kernelInfo declared below is never used; the test operates on pProgram->mockKernelInfo. */ + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
+ argDescriptor.as() = NEO::ArgDescPointer(); + argDescriptor.as().bindful = NEO::undefined; + argDescriptor.as().bindless = 0x0; + + auto argDescriptorImg = NEO::ArgDescriptor(NEO::ArgDescriptor::argTImage); + argDescriptorImg.as() = NEO::ArgDescImage(); + argDescriptorImg.as().bindful = NEO::undefined; + argDescriptorImg.as().bindless = sizeof(uint64_t); + + auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer); + argDescriptor2.as() = NEO::ArgDescPointer(); + argDescriptor2.as().bindful = NEO::undefined; + argDescriptor2.as().stateless = 2 * sizeof(uint64_t); + + KernelInfo kernelInfo = {}; + pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptor); + pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptorImg); + pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptor2); + + pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless = 3 * sizeof(uint64_t); + pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.bindless = 4 * sizeof(uint64_t); + + MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice); + + pProgram->mockKernelInfo.kernelDescriptor.initBindlessOffsetToSurfaceState(); + + mockKernel.crossThreadData = new char[5 * sizeof(uint64_t)]; + mockKernel.crossThreadDataSize = 5 * sizeof(uint64_t); + memset(mockKernel.crossThreadData, 0x00, mockKernel.crossThreadDataSize); + + const uint64_t baseAddress = 0x1000; + auto &gfxCoreHelper = pClDevice->getGfxCoreHelper(); + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + auto patchValue1 =
gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(baseAddress)); + auto patchValue2 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(baseAddress + 1 * surfaceStateSize)); + auto patchValue3 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(baseAddress + 2 * surfaceStateSize)); + auto patchValue4 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(baseAddress + 3 * surfaceStateSize)); + + mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress); + + auto crossThreadData = std::make_unique(mockKernel.crossThreadDataSize / sizeof(uint64_t)); + memcpy(crossThreadData.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize); + + EXPECT_EQ(patchValue1, crossThreadData[0]); + EXPECT_EQ(patchValue2, crossThreadData[1]); + EXPECT_EQ(0u, crossThreadData[2]); + EXPECT_EQ(patchValue3, crossThreadData[3]); + EXPECT_EQ(patchValue4, crossThreadData[4]); +} + +TEST_F(BindlessKernelTests, givenNoEntryInBindlessOffsetsMapWhenPatchingCrossThreadDataThenMemoryIsNotPatched) { /* Same bindless setup as a patching test, but initBindlessOffsetToSurfaceState() is not called in this test, so (per the test name) the offsets map has no entries; the first 64-bit slot of the zeroed cross-thread data is expected to remain zero after the patch call. */ + pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer); + argDescriptor.as() = NEO::ArgDescPointer(); + argDescriptor.as().bindful = NEO::undefined; + argDescriptor.as().bindless = 0x0; + pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptor); + + pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless = sizeof(uint64_t); + + MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice); + + mockKernel.crossThreadData = new char[4 * sizeof(uint64_t)]; + mockKernel.crossThreadDataSize = 4 * sizeof(uint64_t); +
memset(mockKernel.crossThreadData, 0, mockKernel.crossThreadDataSize); + + const uint64_t baseAddress = 0x1000; + mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress); + + auto crossThreadData = std::make_unique(mockKernel.crossThreadDataSize / sizeof(uint64_t)); + memcpy(crossThreadData.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize); + + EXPECT_EQ(0u, crossThreadData[0]); +} + +TEST_F(BindlessKernelTests, givenNoStatefulArgsWhenPatchingBindlessOffsetsInCrossThreadDataThenMemoryIsNotPatched) { /* The only explicit argument is a by-value argument (argTValue with a single element), so the single zeroed 64-bit slot is expected to stay zero after the patch call. */ + pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::argTValue); + argDescriptor.as() = NEO::ArgDescValue(); + argDescriptor.as().elements.push_back(NEO::ArgDescValue::Element{0, 8, 0, false}); + pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptor); + + MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice); + + mockKernel.crossThreadData = new char[sizeof(uint64_t)]; + mockKernel.crossThreadDataSize = sizeof(uint64_t); + memset(mockKernel.crossThreadData, 0, mockKernel.crossThreadDataSize); + + const uint64_t baseAddress = 0x1000; + mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress); + + auto crossThreadData = std::make_unique(mockKernel.crossThreadDataSize / sizeof(uint64_t)); + memcpy(crossThreadData.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize); + + EXPECT_EQ(0u, crossThreadData[0]); +} + class KernelFromBinaryTest : public ProgramSimpleFixture { public: void setUp() { @@ -1218,6 +1350,42 @@ HWTEST_F(KernelResidencyTest, givenKernelWhenMakeResidentIsCalledThenGlobalBuffe memoryManager->freeGraphicsMemory(pKernelInfo->kernelAllocation); } +HWTEST_F(KernelResidencyTest,
givenBindlessHeapsHelperAndGlobalAndConstantBuffersWhenMakeResidentIsCalledThenGlobalAndConstantBufferHeapAllocationsAreMadeResident) { /* Installs a MockBindlesHeapsHelper on the device's root device environment, gives the program's global and constant surfaces bindless slots via allocateBindlessSlot(), and expects makeResident() to make resident both surfaces and the heapAllocation reported by each surface's getBindlessInfo(). */ + auto bindlessHeapHelper = new MockBindlesHeapsHelper(pDevice, false); + pDevice->getExecutionEnvironment()->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapHelper); + + auto pKernelInfo = std::make_unique(); + pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1; + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.storeMakeResidentAllocations = true; + + auto memoryManager = commandStreamReceiver.getMemoryManager(); + pKernelInfo->kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize}); + + MockProgram program(toClDeviceVector(*pClDevice)); + MockContext ctx; + program.setContext(&ctx); + program.buildInfos[pDevice->getRootDeviceIndex()].globalSurface = new MockGraphicsAllocation(); + program.buildInfos[pDevice->getRootDeviceIndex()].constantSurface = new MockGraphicsAllocation(); + EXPECT_TRUE(memoryManager->allocateBindlessSlot(program.buildInfos[pDevice->getRootDeviceIndex()].globalSurface)); + EXPECT_TRUE(memoryManager->allocateBindlessSlot(program.buildInfos[pDevice->getRootDeviceIndex()].constantSurface)); + + std::unique_ptr kernel(new MockKernel(&program, *pKernelInfo, *pClDevice)); + ASSERT_EQ(CL_SUCCESS, kernel->initialize()); + + EXPECT_EQ(0u, commandStreamReceiver.makeResidentAllocations.size()); + kernel->makeResident(pDevice->getGpgpuCommandStreamReceiver()); + + EXPECT_TRUE(commandStreamReceiver.isMadeResident(program.buildInfos[pDevice->getRootDeviceIndex()].globalSurface)); + EXPECT_TRUE(commandStreamReceiver.isMadeResident(program.getGlobalSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation)); + +
EXPECT_TRUE(commandStreamReceiver.isMadeResident(program.buildInfos[pDevice->getRootDeviceIndex()].constantSurface)); + EXPECT_TRUE(commandStreamReceiver.isMadeResident(program.getConstantSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation)); + + memoryManager->freeGraphicsMemory(pKernelInfo->kernelAllocation); +} + HWTEST_F(KernelResidencyTest, givenKernelWhenItUsesIndirectUnifiedMemoryDeviceAllocationThenTheyAreMadeResident) { MockKernelWithInternals mockKernel(*this->pClDevice); auto &commandStreamReceiver = this->pDevice->getUltCommandStreamReceiver(); @@ -2962,6 +3130,108 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionDisabledWhenPatchWithImplicit EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size()); } +HWTEST_F(KernelTest, givenBindlessArgBufferWhenPatchWithImplicitSurfaceThenSurfaceStateIsEncodedAtProperOffset) { /* Adds a buffer arg with bindful = undefined and bindless offset 0x10, builds the bindless offset map, and expects patchWithImplicitSurface() to encode the allocation's GPU address into the kernel's surface state heap at (map index for that offset) times the surface state size. No bindless heaps helper is installed by this test. */ + auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get()))); + MockKernelWithInternals kernel(*device); + + uint64_t gpuAddress = 0x1200; + const void *cpuPtr = reinterpret_cast(gpuAddress); + size_t allocSize = 0x1000; + MockGraphicsAllocation mockAllocation(const_cast(cpuPtr), gpuAddress, allocSize); + + kernel.kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless; + + const CrossThreadDataOffset bindlessOffset = 0x10; + kernel.kernelInfo.addArgBuffer(0, 0, sizeof(void *), undefined, bindlessOffset); + + kernel.kernelInfo.kernelDescriptor.initBindlessOffsetToSurfaceState(); + + uint64_t crossThreadData = 0; + kernel.mockKernel->patchWithImplicitSurface(castToUint64(&crossThreadData), mockAllocation, kernel.kernelInfo.argAsPtr(0)); + + const auto &gfxCoreHelper = device->getGfxCoreHelper(); + const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + const auto ssIndex = kernel.kernelInfo.kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second; + const auto ssOffset = ssIndex * surfaceStateSize;
+ + typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; + const auto surfaceState = reinterpret_cast(ptrOffset(kernel.mockKernel->getSurfaceStateHeap(), ssOffset)); + const auto surfaceAddress = surfaceState->getSurfaceBaseAddress(); + + const auto bufferAddress = mockAllocation.getGpuAddressToPatch(); + EXPECT_EQ(bufferAddress, surfaceAddress); +} + +HWTEST_F(KernelTest, givenBindlessArgBufferAndNotInitializedBindlessOffsetToSurfaceStateWhenPatchWithImplicitSurfaceThenSurfaceStateIsNotEncoded) { /* Snapshots the kernel's surface state heap, clears bindlessArgsMap so the bindless offset has no surface-state index, and expects patchWithImplicitSurface() to leave the heap bytes unchanged. */ + auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get()))); + MockKernelWithInternals kernel(*device); + + uint64_t gpuAddress = 0x1200; + const void *cpuPtr = reinterpret_cast(gpuAddress); + size_t allocSize = 0x1000; + MockGraphicsAllocation mockAllocation(const_cast(cpuPtr), gpuAddress, allocSize); + + kernel.kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless; + + const CrossThreadDataOffset bindlessOffset = 0x10; + kernel.kernelInfo.addArgBuffer(0, 0, sizeof(void *), undefined, bindlessOffset); + + const auto surfaceStateHeap = kernel.mockKernel->getSurfaceStateHeap(); + const auto surfaceStateHeapSize = kernel.mockKernel->getSurfaceStateHeapSize(); + + auto ssHeapDataInitial = std::make_unique(surfaceStateHeapSize); + std::memcpy(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize); + + kernel.kernelInfo.kernelDescriptor.bindlessArgsMap.clear(); + + uint64_t crossThreadData = 0; + kernel.mockKernel->patchWithImplicitSurface(castToUint64(&crossThreadData), mockAllocation, kernel.kernelInfo.argAsPtr(0)); + + EXPECT_EQ(0, std::memcmp(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize)); +} + +HWTEST_F(KernelTest, givenBindlessHeapsHelperAndBindlessArgBufferWhenPatchWithImplicitSurfaceThenCrossThreadDataIsPatchedAndSurfaceStateIsEncoded) { /* With a MockBindlesHeapsHelper installed and a bindless slot allocated for the buffer, expects the kernel's own cross-thread data at the bindless offset to hold the descriptor value for the slot's surfaceStateOffset, and the surface state at the slot's ssPtr to carry the buffer's GPU address. */ + auto device = clUniquePtr(new
MockClDevice(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get()))); + auto &neoDevice = device->getDevice(); + + auto bindlessHeapHelper = new MockBindlesHeapsHelper(&neoDevice, false); + neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapHelper); + + MockKernelWithInternals kernel(*device); + + uint64_t gpuAddress = 0x1200; + const void *cpuPtr = reinterpret_cast(gpuAddress); + size_t allocSize = 0x1000; + MockGraphicsAllocation mockAllocation(const_cast(cpuPtr), gpuAddress, allocSize); + + kernel.kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless; + + EXPECT_TRUE(device->getMemoryManager()->allocateBindlessSlot(&mockAllocation)); + + const CrossThreadDataOffset bindlessOffset = 0x10; + kernel.kernelInfo.addArgBuffer(0, 0, sizeof(void *), undefined, bindlessOffset); + + kernel.kernelInfo.kernelDescriptor.initBindlessOffsetToSurfaceState(); + + uint64_t crossThreadData = 0; + kernel.mockKernel->patchWithImplicitSurface(castToUint64(&crossThreadData), mockAllocation, kernel.kernelInfo.argAsPtr(0)); + + auto ssInHeapInfo = mockAllocation.getBindlessInfo(); + + auto patchLocation = reinterpret_cast(ptrOffset(kernel.mockKernel->crossThreadData, bindlessOffset)); + auto patchValue = device->getGfxCoreHelper().getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(ssInHeapInfo.surfaceStateOffset)); + + EXPECT_EQ(patchValue, *patchLocation); + + typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; + const auto surfaceState = reinterpret_cast(ssInHeapInfo.ssPtr); + const auto surfaceAddress = surfaceState->getSurfaceBaseAddress(); + + const auto bufferAddress = mockAllocation.getGpuAddressToPatch(); + EXPECT_EQ(bufferAddress, surfaceAddress); +} + TEST(KernelTest, givenDefaultKernelWhenItIsCreatedThenItReportsStatelessWrites) { auto device = clUniquePtr(new
MockClDevice(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get()))); MockKernelWithInternals kernel(*device); diff --git a/opencl/test/unit_test/mocks/mock_kernel.h b/opencl/test/unit_test/mocks/mock_kernel.h index 649e5853c1..1f817c26f8 100644 --- a/opencl/test/unit_test/mocks/mock_kernel.h +++ b/opencl/test/unit_test/mocks/mock_kernel.h @@ -98,6 +98,8 @@ class MockKernel : public Kernel { using Kernel::anyKernelArgumentUsingSystemMemory; using Kernel::auxTranslationRequired; using Kernel::containsStatelessWrites; + using Kernel::crossThreadData; + using Kernel::crossThreadDataSize; using Kernel::dataParameterSimdSize; using Kernel::executionType; using Kernel::getDevice; diff --git a/opencl/test/unit_test/program/program_data_tests.cpp b/opencl/test/unit_test/program/program_data_tests.cpp index e462c32f2e..9b95200701 100644 --- a/opencl/test/unit_test/program/program_data_tests.cpp +++ b/opencl/test/unit_test/program/program_data_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -15,6 +15,7 @@ #include "shared/test/common/device_binary_format/patchtokens_tests.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/gtest_helpers.h" +#include "shared/test/common/mocks/mock_bindless_heaps_helper.h" #include "shared/test/common/mocks/mock_csr.h" #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_memory_manager.h" @@ -324,6 +325,116 @@ TEST_F(ProgramDataTest, whenGlobalVariablesAreNotExportedThenAllocateSurfacesAsN EXPECT_EQ(nullptr, this->pContext->getSVMAllocsManager()->getSVMAlloc(reinterpret_cast(pProgram->getGlobalSurface(pContext->getDevice(0)->getRootDeviceIndex())->getGpuAddress()))); } +using ProgramDataBindlessTest = ProgramDataTest; + +TEST_F(ProgramDataBindlessTest,
givenBindlessKernelAndConstantsAndVariablesMemorySurfaceWhenProcessProgramInfoThenConstantsAndVariablesSurfaceBindlessSlotIsAllocated) { /* Builds a program holding one Bindful kernel (kernelInfo1) and one BindlessAndStateless kernel (kernelInfo2) together with global constants and global variables init data. With a MockBindlesHeapsHelper installed, processProgramInfo() is expected to create both surfaces and give each a non-null bindless heapAllocation. */ + auto &neoDevice = pClDevice->getDevice(); + neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->memoryOperationsInterface = + std::make_unique(); + + auto bindlessHeapHelper = new MockBindlesHeapsHelper(&neoDevice, false); + neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapHelper); + + ProgramInfo programInfo; + + char globalConstantsData[128] = {}; + programInfo.globalConstants.initData = globalConstantsData; + programInfo.globalConstants.size = sizeof(globalConstantsData); + + char globalVariablesData[128] = {}; + programInfo.globalVariables.initData = globalVariablesData; + programInfo.globalVariables.size = sizeof(globalVariablesData); + + auto kernelInfo1 = std::make_unique(); + kernelInfo1->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::Bindful; + auto kernelInfo2 = std::make_unique(); /* the bindless mode belongs on the second kernel; assigning it to kernelInfo1 again would overwrite Bindful and leave kernelInfo2 unconfigured */ + kernelInfo2->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless; + + programInfo.kernelInfos.push_back(kernelInfo1.release()); + programInfo.kernelInfos.push_back(kernelInfo2.release()); + + std::unique_ptr> mockLinkerInput = std::make_unique>(); + programInfo.linkerInput = std::move(mockLinkerInput); + this->pProgram->processProgramInfo(programInfo, *pClDevice); + + ASSERT_NE(nullptr, pProgram->getConstantSurface(pContext->getDevice(0)->getRootDeviceIndex())); + ASSERT_NE(nullptr, pProgram->getGlobalSurface(pContext->getDevice(0)->getRootDeviceIndex())); + + auto globalConstantsAlloc = pProgram->getConstantSurface(pContext->getDevice(0)->getRootDeviceIndex()); + auto ssInHeap1 = globalConstantsAlloc->getBindlessInfo(); + + EXPECT_NE(nullptr, ssInHeap1.heapAllocation); + + auto globalVariablesAlloc =
pProgram->getGlobalSurface(pContext->getDevice(0)->getRootDeviceIndex()); + auto ssInHeap2 = globalVariablesAlloc->getBindlessInfo(); + + EXPECT_NE(nullptr, ssInHeap2.heapAllocation); +} + +TEST_F(ProgramDataBindlessTest, givenBindlessKernelAndGlobalConstantsMemorySurfaceWhenProcessProgramInfoAndSSAllocationFailsThenGlobalConstantsSurfaceBindlessSlotIsNotAllocatedAndReturnOutOfHostMemory) { /* Sets failAllocateSS on the mock heaps helper (presumably forcing surface-state allocation to fail - confirm against the mock). processProgramInfo() is expected to return CL_OUT_OF_HOST_MEMORY while the constants surface still exists but has no bindless heapAllocation. */ + auto &neoDevice = pClDevice->getDevice(); + neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->memoryOperationsInterface = + std::make_unique(); + + auto bindlessHeapHelper = new MockBindlesHeapsHelper(&neoDevice, false); + bindlessHeapHelper->failAllocateSS = true; + neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapHelper); + + ProgramInfo programInfo; + + char globalConstantsData[128] = {}; + programInfo.globalConstants.initData = globalConstantsData; + programInfo.globalConstants.size = sizeof(globalConstantsData); + + auto kernelInfo = std::make_unique(); + kernelInfo->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless; + + programInfo.kernelInfos.push_back(kernelInfo.release()); + + std::unique_ptr> mockLinkerInput = std::make_unique>(); + programInfo.linkerInput = std::move(mockLinkerInput); + auto ret = this->pProgram->processProgramInfo(programInfo, *pClDevice); + EXPECT_EQ(ret, CL_OUT_OF_HOST_MEMORY); + + auto globalConstantsAlloc = pProgram->getConstantSurface(pContext->getDevice(0)->getRootDeviceIndex()); + ASSERT_NE(nullptr, globalConstantsAlloc); + + auto ssInHeap = globalConstantsAlloc->getBindlessInfo(); + EXPECT_EQ(nullptr, ssInHeap.heapAllocation); +} + +TEST_F(ProgramDataBindlessTest, givenBindlessKernelAndGlobalVariablesMemorySurfaceWhenProcessProgramInfoAndSSAllocationFailsThenGlobalVariablesSurfaceBindlessSlotIsNotAllocatedAndReturnOutOfHostMemory) { /* Global-variables counterpart of the failing-allocation case: with failAllocateSS set, processProgramInfo() is expected to return CL_OUT_OF_HOST_MEMORY while the variables surface still exists but has no bindless heapAllocation. */ + auto &neoDevice =
pClDevice->getDevice(); + neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->memoryOperationsInterface = + std::make_unique(); + + auto bindlessHeapHelper = new MockBindlesHeapsHelper(&neoDevice, false); + bindlessHeapHelper->failAllocateSS = true; + neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapHelper); + + ProgramInfo programInfo; + + char globalVariablesData[128] = {}; + programInfo.globalVariables.initData = globalVariablesData; + programInfo.globalVariables.size = sizeof(globalVariablesData); + + auto kernelInfo = std::make_unique(); + kernelInfo->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless; + + programInfo.kernelInfos.push_back(kernelInfo.release()); + + std::unique_ptr> mockLinkerInput = std::make_unique>(); + programInfo.linkerInput = std::move(mockLinkerInput); + auto ret = this->pProgram->processProgramInfo(programInfo, *pClDevice); + EXPECT_EQ(ret, CL_OUT_OF_HOST_MEMORY); + + auto globalVariablesAlloc = pProgram->getGlobalSurface(pContext->getDevice(0)->getRootDeviceIndex()); + ASSERT_NE(nullptr, globalVariablesAlloc); + + auto ssInHeap = globalVariablesAlloc->getBindlessInfo(); + EXPECT_EQ(nullptr, ssInHeap.heapAllocation); +} + TEST_F(ProgramDataTest, givenConstantAllocationThatIsInUseByGpuWhenProgramIsBeingDestroyedThenItIsAddedToTemporaryAllocationList) { setupConstantAllocation();