From 313fb84fda0c7a82e97afc9519aa3333ea4b5bad Mon Sep 17 00:00:00 2001 From: Mateusz Hoppe Date: Wed, 14 Jun 2023 18:26:31 +0000 Subject: [PATCH] feature: bindless addressing mode support - allow bindless kernels to execute - bindless addressing kernels are using private heaps mode - do not differentiate bindful and bindless surface state base addresses Related-To: NEO-7063 Signed-off-by: Mateusz Hoppe --- level_zero/core/source/kernel/kernel_hw.h | 14 +- level_zero/core/source/kernel/kernel_imp.cpp | 49 ++++++ level_zero/core/source/kernel/kernel_imp.h | 2 + .../core/test/black_box_tests/CMakeLists.txt | 1 + .../black_box_tests/common/zello_compile.cpp | 62 +++++++ .../black_box_tests/common/zello_compile.h | 1 + .../black_box_tests/zello_bindless_kernel.cpp | 162 ++++++++++++++++++ .../test/unit_tests/fixtures/module_fixture.h | 1 + .../core/test/unit_tests/mocks/mock_kernel.h | 1 + .../unit_tests/sources/kernel/test_kernel.cpp | 103 +++++++++++ .../unit_tests/sources/module/test_module.cpp | 13 ++ .../command_container/command_encoder.inl | 4 + .../command_encoder_bdw_and_later.inl | 12 ++ .../command_encoder_xehp_and_later.inl | 37 ++-- .../device_binary_format_zebin.cpp | 4 + .../dispatch_kernel_encoder_interface.h | 3 +- shared/source/kernel/kernel_descriptor.cpp | 41 +++++ shared/source/kernel/kernel_descriptor.h | 9 + .../kernel_descriptor_from_patchtokens.cpp | 4 + .../xe_hpg_core/hw_cmds_xe_hpg_core_base.h | 2 +- .../device_binary_formats_tests.cpp | 46 +++++ .../encoders/test_encode_dispatch_kernel.cpp | 32 +++- ...rnel_descriptor_from_patchtokens_tests.cpp | 32 +++- .../kernel/kernel_descriptor_tests.cpp | 86 ++++++++++ .../mock_dispatch_kernel_encoder_interface.h | 4 +- 25 files changed, 699 insertions(+), 26 deletions(-) create mode 100644 level_zero/core/test/black_box_tests/zello_bindless_kernel.cpp diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index d792b1db0f..af6e6aceef 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -55,12 +55,18 @@ struct KernelHw : public KernelImp { } void *surfaceStateAddress = nullptr; auto surfaceState = GfxFamily::cmdInitRenderSurfaceState; - if (NEO::isValidOffset(argInfo.bindless)) { - surfaceStateAddress = patchBindlessSurfaceState(alloc, argInfo.bindless); - } else { + + if (NEO::isValidOffset(argInfo.bindful)) { surfaceStateAddress = ptrOffset(surfaceStateHeapData.get(), argInfo.bindful); + surfaceState = *reinterpret_cast(surfaceStateAddress); + + } else if (NEO::isValidOffset(argInfo.bindless)) { + if (this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper()) { + surfaceStateAddress = patchBindlessSurfaceState(alloc, argInfo.bindless); + } else { + surfaceStateAddress = ptrOffset(surfaceStateHeapData.get(), getSurfaceStateIndexForBindlessOffset(argInfo.bindless) * sizeof(typename GfxFamily::RENDER_SURFACE_STATE)); + } } - surfaceState = *reinterpret_cast(surfaceStateAddress); uint64_t bufferAddressForSsh = baseAddress; auto alignment = NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment(); diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 1e0772aa5b..4ed0eaa605 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -118,6 +118,14 @@ ze_result_t KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device memcpy_s(surfaceStateHeapTemplate.get(), surfaceStateHeapSize, kernelInfo->heapInfo.pSsh, surfaceStateHeapSize); + } else if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelInfo->kernelDescriptor)) { + auto &gfxCoreHelper = deviceImp->getNEODevice()->getGfxCoreHelper(); + auto surfaceStateSize = static_cast(gfxCoreHelper.getRenderSurfaceStateSize()); + + this->surfaceStateHeapSize = kernelInfo->kernelDescriptor.kernelAttributes.numArgsStateful * surfaceStateSize; + UNRECOVERABLE_IF(kernelInfo->kernelDescriptor.kernelAttributes.numArgsStateful != kernelInfo->kernelDescriptor.getBindlessOffsetToSurfaceState().size()); + + surfaceStateHeapTemplate.reset(new uint8_t[surfaceStateHeapSize]); } if (kernelInfo->heapInfo.dynamicStateHeapSize != 0) { @@ -1165,4 +1173,45 @@ void KernelImp::setAssertBuffer() { static_cast(assertHandler->getAssertBuffer()->getGpuAddressToPatch())); this->residencyContainer.push_back(assertHandler->getAssertBuffer()); } + +void KernelImp::patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const { + + auto &gfxCoreHelper = this->module->getDevice()->getGfxCoreHelper(); + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + for (size_t argIndex = 0; argIndex < kernelImmData->getDescriptor().payloadMappings.explicitArgs.size(); argIndex++) { + const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex]; + + auto crossThreadOffset = NEO::undefined; + if (arg.type == NEO::ArgDescriptor::ArgTPointer) { + crossThreadOffset = arg.as().bindless; + } else if (arg.type == NEO::ArgDescriptor::ArgTImage) { + crossThreadOffset = arg.as().bindless; + } else { + continue; + } + + if (NEO::isValidOffset(crossThreadOffset)) { + auto patchLocation = ptrOffset(getCrossThreadData(), crossThreadOffset); + auto index = getSurfaceStateIndexForBindlessOffset(crossThreadOffset); + + if (index < std::numeric_limits::max()) { + auto surfaceStateOffset = static_cast(bindlessSurfaceStateBaseOffset + index * surfaceStateSize); + auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(surfaceStateOffset)); + + patchWithRequiredSize(const_cast(patchLocation), sizeof(patchValue), patchValue); + } + } + } +} + +uint32_t KernelImp::getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const { + const auto &iter = getKernelDescriptor().getBindlessOffsetToSurfaceState().find(bindlessOffset); + if (iter != getKernelDescriptor().getBindlessOffsetToSurfaceState().end()) { + return iter->second; + } + DEBUG_BREAK_IF(true); + return std::numeric_limits::max(); +} + } // namespace L0 diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index 977a8a1311..2e9ea1e62a 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -164,6 +164,7 @@ struct KernelImp : Kernel { NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() override; void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) override; + void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const override; NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() override { return privateMemoryGraphicsAllocation; @@ -189,6 +190,7 @@ struct KernelImp : Kernel { void setAssertBuffer(); virtual void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) = 0; void *patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless); + uint32_t getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const; const KernelImmutableData *kernelImmData = nullptr; Module *module = nullptr; diff --git a/level_zero/core/test/black_box_tests/CMakeLists.txt b/level_zero/core/test/black_box_tests/CMakeLists.txt index f120e1e3b7..c51ff63756 100644 --- a/level_zero/core/test/black_box_tests/CMakeLists.txt +++ b/level_zero/core/test/black_box_tests/CMakeLists.txt @@ -34,6 +34,7 @@ target_include_directories(${L0_BLACK_BOX_TEST_SHARED_LIB} PUBLIC ${CMAKE_CURREN set_target_properties(${L0_BLACK_BOX_TEST_SHARED_LIB} PROPERTIES FOLDER ${L0_BLACK_BOX_TEST_PROJECT_FOLDER}) set(TEST_TARGETS + zello_bindless_kernel zello_commandlist_immediate zello_copy zello_copy_fence diff --git a/level_zero/core/test/black_box_tests/common/zello_compile.cpp b/level_zero/core/test/black_box_tests/common/zello_compile.cpp index f32770bc40..580e604063 100644 --- a/level_zero/core/test/black_box_tests/common/zello_compile.cpp +++ b/level_zero/core/test/black_box_tests/common/zello_compile.cpp @@ -67,6 +67,68 @@ std::vector compileToSpirV(const std::string &src, const std::string &o return ret; } +std::vector compileToNative(const std::string &src, const std::string &deviceName, const std::string &options, const std::string &internalOptions, std::string &outCompilerLog) { + std::vector ret; + + const char *mainFileName = "main.cl"; + const char *argv[] = {"ocloc", "-q", "-device", deviceName.c_str(), "-file", mainFileName, "-o", "output.bin", "", "", "", ""}; + uint32_t numArgs = sizeof(argv) / sizeof(argv[0]) - 4; + int argIndex = 8; + if (options.size() > 0) { + argv[argIndex++] = "-options"; + argv[argIndex++] = options.c_str(); + numArgs += 2; + } + if (internalOptions.size() > 0) { + argv[argIndex++] = "-internal_options"; + argv[argIndex++] = internalOptions.c_str(); + numArgs += 2; + } + const unsigned char *sources[] = {reinterpret_cast(src.c_str())}; + size_t sourcesLengths[] = {src.size() + 1}; + const char *sourcesNames[] = {mainFileName}; + unsigned int numOutputs = 0U; + unsigned char **outputs = nullptr; + size_t *ouputLengths = nullptr; + char **outputNames = nullptr; + + int result = oclocInvoke(numArgs, argv, + 1, sources, sourcesLengths, sourcesNames, + 0, nullptr, nullptr, nullptr, + &numOutputs, &outputs, &ouputLengths, &outputNames); + + unsigned char *binary = nullptr; + size_t binaryLen = 0; + const char *log = nullptr; + size_t logLen = 0; + for (unsigned int i = 0; i < numOutputs; ++i) { + std::string spvExtension = ".spv"; + std::string logFileName = "stdout.log"; + auto nameLen = std::strlen(outputNames[i]); + if (std::strstr(outputNames[i], "output.bin") != nullptr) { + binary = outputs[i]; + binaryLen = ouputLengths[i]; + } else if ((nameLen >= logFileName.size()) && (std::strstr(outputNames[i], logFileName.c_str()) != nullptr)) { + log = reinterpret_cast(outputs[i]); + logLen = ouputLengths[i]; + break; + } + } + + if ((result != 0) && (logLen == 0)) { + outCompilerLog = "Unknown error, ocloc returned : " + std::to_string(result) + "\n"; + return ret; + } + + if (logLen != 0) { + outCompilerLog = std::string(log, logLen).c_str(); + } + + ret.assign(binary, binary + binaryLen); + oclocFreeOutput(&numOutputs, &outputs, &ouputLengths, &outputNames); + return ret; +} + const char *memcpyBytesTestKernelSrc = R"===( kernel void memcpy_bytes(__global char *dst, const __global char *src) { unsigned int gid = get_global_id(0); diff --git a/level_zero/core/test/black_box_tests/common/zello_compile.h b/level_zero/core/test/black_box_tests/common/zello_compile.h index 89fdf6b851..0485030257 100644 --- a/level_zero/core/test/black_box_tests/common/zello_compile.h +++ b/level_zero/core/test/black_box_tests/common/zello_compile.h @@ -12,6 +12,7 @@ #include std::vector compileToSpirV(const std::string &src, const std::string &options, std::string &outCompilerLog); +std::vector compileToNative(const std::string &src, const std::string &deviceName, const std::string &options, const std::string &internalOptions, std::string &outCompilerLog); extern const char *memcpyBytesTestKernelSrc; diff --git a/level_zero/core/test/black_box_tests/zello_bindless_kernel.cpp b/level_zero/core/test/black_box_tests/zello_bindless_kernel.cpp new file mode 100644 index 0000000000..fbbf00ccf5 --- /dev/null +++ b/level_zero/core/test/black_box_tests/zello_bindless_kernel.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2021-2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include + +#include "zello_common.h" +#include "zello_compile.h" + +#include +#include +#include +#include +#include + +const char *source = R"===( +__kernel void kernel_copy(__global char *dst, __global char *src){ + uint gid = get_global_id(0); + dst[gid] = src[gid]; +} +)==="; + +static std::string kernelName = "kernel_copy"; + +enum class ExecutionMode : uint32_t { + CommandQueue, + ImmSyncCmdList +}; + +void createModule(const ze_context_handle_t context, const ze_device_handle_t device, const std::string &deviceName, ze_module_handle_t &module) { + std::string buildLog; + auto bin = compileToNative(source, deviceName, "", "-cl-intel-use-bindless-mode -cl-intel-use-bindless-advanced-mode", buildLog); + if (buildLog.size() > 0) { + std::cout << "Build log " << buildLog; + } + SUCCESS_OR_TERMINATE((0 == bin.size())); + + ze_module_desc_t moduleDesc = {ZE_STRUCTURE_TYPE_MODULE_DESC}; + moduleDesc.format = ZE_MODULE_FORMAT_NATIVE; + moduleDesc.pInputModule = bin.data(); + moduleDesc.inputSize = bin.size(); + moduleDesc.pBuildFlags = ""; + + SUCCESS_OR_TERMINATE(zeModuleCreate(context, device, &moduleDesc, &module, nullptr)); +} + +void createKernel(const ze_module_handle_t module, ze_kernel_handle_t &kernel, const char *kernelName) { + + ze_kernel_desc_t kernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernelDesc.pKernelName = kernelName; + SUCCESS_OR_TERMINATE(zeKernelCreate(module, &kernelDesc, &kernel)); +} + +void runKernel(const ze_module_handle_t &module, const ze_kernel_handle_t &kernel, + ze_context_handle_t &context, ze_device_handle_t &device, uint32_t id, ExecutionMode mode, bool &outputValidationSuccessful) { + + CommandHandler commandHandler; + bool isImmediateCmdList = (mode == ExecutionMode::ImmSyncCmdList); + + SUCCESS_OR_TERMINATE(commandHandler.create(context, device, isImmediateCmdList)); + + constexpr size_t allocSize = 4096; + ze_device_mem_alloc_desc_t deviceDesc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC}; + deviceDesc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED; + deviceDesc.ordinal = 0; + + ze_host_mem_alloc_desc_t hostDesc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC}; + hostDesc.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED; + + void *srcBuffer = nullptr; + SUCCESS_OR_TERMINATE(zeMemAllocShared(context, &deviceDesc, &hostDesc, allocSize, 1, device, &srcBuffer)); + + void *dstBuffer = nullptr; + SUCCESS_OR_TERMINATE(zeMemAllocShared(context, &deviceDesc, &hostDesc, allocSize, 1, device, &dstBuffer)); + + // Initialize memory + constexpr uint8_t val = 55; + memset(srcBuffer, val, allocSize); + memset(dstBuffer, 0, allocSize); + + SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernel, 0, sizeof(dstBuffer), &dstBuffer)); + SUCCESS_OR_TERMINATE(zeKernelSetArgumentValue(kernel, 1, sizeof(srcBuffer), &srcBuffer)); + + ze_group_count_t dispatchTraits; + SUCCESS_OR_TERMINATE(zeKernelSetGroupSize(kernel, 32U, 1U, 1U)); + dispatchTraits.groupCountX = allocSize / 32u; + dispatchTraits.groupCountY = 1u; + dispatchTraits.groupCountZ = 1u; + + SUCCESS_OR_TERMINATE(commandHandler.appendKernel(kernel, dispatchTraits)); + SUCCESS_OR_TERMINATE(commandHandler.execute()); + SUCCESS_OR_TERMINATE(commandHandler.synchronize()); + + // Validate + if (memcmp(dstBuffer, srcBuffer, allocSize)) { + outputValidationSuccessful = false; + uint8_t *srcCharBuffer = static_cast(srcBuffer); + uint8_t *dstCharBuffer = static_cast(dstBuffer); + for (size_t i = 0; i < allocSize; i++) { + if (srcCharBuffer[i] != dstCharBuffer[i]) { + std::cout << "srcBuffer[" << i << "] = " << std::dec << static_cast(srcCharBuffer[i]) << " not equal to " + << "dstBuffer[" << i << "] = " << std::dec << static_cast(dstCharBuffer[i]) << "\n"; + break; + } + } + } else { + outputValidationSuccessful = true; + } + + SUCCESS_OR_TERMINATE(zeMemFree(context, dstBuffer)); + SUCCESS_OR_TERMINATE(zeMemFree(context, srcBuffer)); +} + +int main(int argc, char *argv[]) { + verbose = isVerbose(argc, argv); + bool outputValidated = false; + + ze_context_handle_t context = nullptr; + auto devices = zelloInitContextAndGetDevices(context); + auto device = devices[0]; + + ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; + SUCCESS_OR_TERMINATE(zeDeviceGetProperties(device, &deviceProperties)); + printDeviceProperties(deviceProperties); + + ze_module_handle_t module = nullptr; + + std::stringstream ss; + ss.setf(std::ios::hex, std::ios::basefield); + ss << "0x" << deviceProperties.deviceId; + + createModule(context, device, ss.str(), module); + + ExecutionMode executionModes[] = {ExecutionMode::CommandQueue, ExecutionMode::ImmSyncCmdList}; + ze_kernel_handle_t kernel = nullptr; + createKernel(module, kernel, kernelName.c_str()); + + for (auto mode : executionModes) { + + outputValidated = false; + + runKernel(module, kernel, context, device, 0, mode, outputValidated); + + if (!outputValidated) { + std::cout << "Zello bindless kernel failed\n" + << std::endl; + break; + } + } + + SUCCESS_OR_TERMINATE(zeKernelDestroy(kernel)); + SUCCESS_OR_TERMINATE(zeModuleDestroy(module)); + SUCCESS_OR_TERMINATE(zeContextDestroy(context)); + + if (outputValidated) { + std::cout << "\nZello bindless kernel PASSED " << std::endl; + } + return outputValidated == false ? -1 : 0; +} diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.h b/level_zero/core/test/unit_tests/fixtures/module_fixture.h index 02d0667706..3eccf2d691 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.h @@ -85,6 +85,7 @@ struct ModuleImmutableDataFixture : public DeviceFixture { using KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime; using KernelImp::kernelRequiresUncachedMocsCount; using KernelImp::midThreadPreemptionDisallowedForRayTracingKernels; + using KernelImp::patchBindlessOffsetsInCrossThreadData; using KernelImp::printfBuffer; using KernelImp::privateMemoryGraphicsAllocation; using KernelImp::requiredWorkgroupOrder; diff --git a/level_zero/core/test/unit_tests/mocks/mock_kernel.h b/level_zero/core/test/unit_tests/mocks/mock_kernel.h index f8db5d3f1d..28f24d4724 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_kernel.h +++ b/level_zero/core/test/unit_tests/mocks/mock_kernel.h @@ -54,6 +54,7 @@ struct WhiteBox<::L0::Kernel> : public ::L0::KernelImp { using ::L0::KernelImp::midThreadPreemptionDisallowedForRayTracingKernels; using ::L0::KernelImp::module; using ::L0::KernelImp::numThreadsPerThreadGroup; + using ::L0::KernelImp::patchBindlessOffsetsInCrossThreadData; using ::L0::KernelImp::patchBindlessSurfaceState; using ::L0::KernelImp::perThreadDataForWholeThreadGroup; using ::L0::KernelImp::perThreadDataSize; diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index b1f9f56d7e..90214676b2 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -3036,5 +3036,108 @@ HWTEST2_F(MultiTileModuleTest, GivenMultiTileDeviceWhenSettingKernelArgAndSurfac EXPECT_FALSE(surfaceState->getDisableSupportForMultiGpuPartialWrites()); } +using BindlessKernelTest = Test; + +TEST_F(BindlessKernelTest, givenBindlessKernelWhenPatchingCrossThreadDataThenCorrectBindlessOffsetsAreWritten) { + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + + mockKernel.descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + mockKernel.descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor.as() = NEO::ArgDescPointer(); + argDescriptor.as().bindful = NEO::undefined; + argDescriptor.as().bindless = 0x0; + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor); + + auto argDescriptorImg = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTImage); + argDescriptorImg.as() = NEO::ArgDescImage(); + argDescriptorImg.as().bindful = NEO::undefined; + argDescriptorImg.as().bindless = sizeof(uint64_t); + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptorImg); + + auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor2.as() = NEO::ArgDescPointer(); + argDescriptor2.as().bindful = NEO::undefined; + argDescriptor2.as().stateless = 2 * sizeof(uint64_t); + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor2); + + mockKernel.descriptor.initBindlessOffsetToSurfaceState(); + + mockKernel.crossThreadData = std::make_unique(4 * sizeof(uint64_t)); + mockKernel.crossThreadDataSize = 4 * sizeof(uint64_t); + memset(mockKernel.crossThreadData.get(), 0, mockKernel.crossThreadDataSize); + + const uint64_t baseAddress = 0x1000; + auto &gfxCoreHelper = this->device->getGfxCoreHelper(); + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + auto patchValue1 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(baseAddress)); + auto patchValue2 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(baseAddress + surfaceStateSize)); + + mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress); + + auto crossThreadData = std::make_unique(mockKernel.crossThreadDataSize / sizeof(uint64_t)); + memcpy(crossThreadData.get(), mockKernel.crossThreadData.get(), mockKernel.crossThreadDataSize); + + EXPECT_EQ(patchValue1, crossThreadData[0]); + EXPECT_EQ(patchValue2, crossThreadData[1]); + EXPECT_EQ(0u, crossThreadData[3]); +} +TEST_F(BindlessKernelTest, givenNoEntryInBindlessOffsetsMapWhenPatchingCrossThreadDataThenMemoryIsNotPatched) { + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + + mockKernel.descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + mockKernel.descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor.as() = NEO::ArgDescPointer(); + argDescriptor.as().bindful = NEO::undefined; + argDescriptor.as().bindless = 0x0; + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor); + + mockKernel.crossThreadData = std::make_unique(4 * sizeof(uint64_t)); + mockKernel.crossThreadDataSize = 4 * sizeof(uint64_t); + memset(mockKernel.crossThreadData.get(), 0, mockKernel.crossThreadDataSize); + + const uint64_t baseAddress = 0x1000; + mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress); + + auto crossThreadData = std::make_unique(mockKernel.crossThreadDataSize / sizeof(uint64_t)); + memcpy(crossThreadData.get(), mockKernel.crossThreadData.get(), mockKernel.crossThreadDataSize); + + EXPECT_EQ(0u, crossThreadData[0]); +} + +TEST_F(BindlessKernelTest, givenNoStatefulArgsWhenPatchingBindlessOffsetsInCrossThreadDataThenMemoryIsNotPatched) { + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + + mockKernel.descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + mockKernel.descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTValue); + argDescriptor.as() = NEO::ArgDescValue(); + argDescriptor.as().elements.push_back(NEO::ArgDescValue::Element{0, 8, 0, false}); + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor); + + mockKernel.crossThreadData = std::make_unique(sizeof(uint64_t)); + mockKernel.crossThreadDataSize = sizeof(uint64_t); + memset(mockKernel.crossThreadData.get(), 0, mockKernel.crossThreadDataSize); + + const uint64_t baseAddress = 0x1000; + mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress); + + auto crossThreadData = std::make_unique(mockKernel.crossThreadDataSize / sizeof(uint64_t)); + memcpy(crossThreadData.get(), mockKernel.crossThreadData.get(), mockKernel.crossThreadDataSize); + + EXPECT_EQ(0u, crossThreadData[0]); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/module/test_module.cpp b/level_zero/core/test/unit_tests/sources/module/test_module.cpp index dfa719ac17..ba5faf87a4 100644 --- a/level_zero/core/test/unit_tests/sources/module/test_module.cpp +++ b/level_zero/core/test/unit_tests/sources/module/test_module.cpp @@ -231,6 +231,8 @@ HWTEST_F(ModuleTest, givenStatefulBufferWhenOffsetIsPatchedThenAllocBaseAddressI uint32_t argIndex = 0u; uint32_t offset = 0x1234; + + // Bindful arg const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->payloadMappings.explicitArgs[argIndex].as().bufferOffset = 0; const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->payloadMappings.explicitArgs[argIndex].as().bindful = 0x80; kernelImp->setBufferSurfaceState(argIndex, ptrOffset(devicePtr, offset), gpuAlloc); @@ -240,6 +242,17 @@ HWTEST_F(ModuleTest, givenStatefulBufferWhenOffsetIsPatchedThenAllocBaseAddressI auto surfaceStateAddress = reinterpret_cast(const_cast(surfaceStateAddressRaw)); EXPECT_EQ(devicePtr, reinterpret_cast(surfaceStateAddress->getSurfaceBaseAddress())); + // Bindless arg + surfaceStateAddress->setSurfaceBaseAddress(0); + const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->payloadMappings.explicitArgs[argIndex].as() = ArgDescPointer(); + const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->payloadMappings.explicitArgs[argIndex].as().bufferOffset = 0x8; + const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->payloadMappings.explicitArgs[argIndex].as().bindless = 0; + const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->bindlessArgsMap[0] = 0; + kernelImp->setBufferSurfaceState(argIndex, ptrOffset(devicePtr, offset), gpuAlloc); + + surfaceStateAddress = reinterpret_cast(const_cast(kernelImp->getSurfaceStateHeapData())); + EXPECT_EQ(devicePtr, reinterpret_cast(surfaceStateAddress->getSurfaceBaseAddress())); + Kernel::fromHandle(kernelHandle)->destroy(); context->freeMem(devicePtr); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 1c5f92afe4..a3b3ec3785 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -749,6 +749,10 @@ size_t EncodeDispatchKernel::getSizeRequiredDsh(const KernelDescriptor & template size_t EncodeDispatchKernel::getSizeRequiredSsh(const KernelInfo &kernelInfo) { size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize; + bool isBindlessKernel = NEO::KernelDescriptor ::isBindlessAddressingKernel(kernelInfo.kernelDescriptor); + if (isBindlessKernel) { + requiredSshSize = kernelInfo.kernelDescriptor.kernelAttributes.numArgsStateful * sizeof(typename Family::RENDER_SURFACE_STATE); + } requiredSshSize = alignUp(requiredSshSize, EncodeDispatchKernel::getDefaultSshAlignment()); return requiredSshSize; } diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 5d208fd08f..a8a481621b 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -114,6 +114,18 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount, kernelDescriptor.payloadMappings.bindingTable.tableOffset)); } + } else { + if (args.dispatchInterface->getSurfaceStateHeapDataSize() > 0u) { + auto ssh = args.surfaceStateHeap; + if (ssh == nullptr) { + ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, args.dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + } + uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase()); + // Allocate space for new ssh data + auto dstSurfaceState = ssh->getSpace(args.dispatchInterface->getSurfaceStateHeapDataSize()); + memcpy_s(dstSurfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), args.dispatchInterface->getSurfaceStateHeapData(), args.dispatchInterface->getSurfaceStateHeapDataSize()); + args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset); + } } idd.setBindingTablePointer(bindingTablePointer); diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index f08c34dcc9..afc8452a3b 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -122,21 +122,34 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } uint32_t bindingTablePointer = 0u; - bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor); - if (!isBindlessKernel && !skipSshProgramming) { - container.prepareBindfulSsh(); - if (bindingTableStateCount > 0u) { - auto ssh = args.surfaceStateHeap; - if (ssh == nullptr) { - ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, args.dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + if (!skipSshProgramming) { + if (!isBindlessKernel) { + container.prepareBindfulSsh(); + if (bindingTableStateCount > 0u) { + auto ssh = args.surfaceStateHeap; + if (ssh == nullptr) { + ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, args.dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + } + bindingTablePointer = static_cast(EncodeSurfaceState::pushBindingTableAndSurfaceStates( + *ssh, + args.dispatchInterface->getSurfaceStateHeapData(), + args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount, + kernelDescriptor.payloadMappings.bindingTable.tableOffset)); + } + } else { + if (args.dispatchInterface->getSurfaceStateHeapDataSize() > 0u) { + auto ssh = args.surfaceStateHeap; + if (ssh == nullptr) { + ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, args.dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + } + uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase()); + // Allocate space for new ssh data + auto dstSurfaceState = ssh->getSpace(args.dispatchInterface->getSurfaceStateHeapDataSize()); + memcpy_s(dstSurfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), args.dispatchInterface->getSurfaceStateHeapData(), args.dispatchInterface->getSurfaceStateHeapDataSize()); + args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset); } - bindingTablePointer = static_cast(EncodeSurfaceState::pushBindingTableAndSurfaceStates( - *ssh, - args.dispatchInterface->getSurfaceStateHeapData(), - args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount, - kernelDescriptor.payloadMappings.bindingTable.tableOffset)); } } idd.setBindingTablePointer(bindingTablePointer); diff --git a/shared/source/device_binary_format/device_binary_format_zebin.cpp b/shared/source/device_binary_format/device_binary_format_zebin.cpp index 9208844c75..0ad4606e1f 100644 --- a/shared/source/device_binary_format/device_binary_format_zebin.cpp +++ b/shared/source/device_binary_format/device_binary_format_zebin.cpp @@ -125,6 +125,10 @@ DecodeError decodeSingleZebin(ProgramInfo &dst, const SingleDeviceBinary &src, s for (auto &kernelInfo : dst.kernelInfos) { kernelInfo->kernelDescriptor.kernelMetadata.isGeneratedByIgc = isGeneratedByIgc; + + if (KernelDescriptor::isBindlessAddressingKernel(kernelInfo->kernelDescriptor)) { + kernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState(); + } } prepareLinkerInputForZebin(dst, elf); diff --git a/shared/source/kernel/dispatch_kernel_encoder_interface.h b/shared/source/kernel/dispatch_kernel_encoder_interface.h index 92e8a86cc3..cdc419f386 100644 --- a/shared/source/kernel/dispatch_kernel_encoder_interface.h +++ b/shared/source/kernel/dispatch_kernel_encoder_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -46,5 +46,6 @@ struct DispatchKernelEncoderI { virtual bool requiresGenerationOfLocalIdsByRuntime() const = 0; virtual ImplicitArgs *getImplicitArgs() const = 0; + virtual void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const = 0; }; } // namespace NEO diff --git a/shared/source/kernel/kernel_descriptor.cpp b/shared/source/kernel/kernel_descriptor.cpp index 615e078b06..07df4adf16 100644 --- a/shared/source/kernel/kernel_descriptor.cpp +++ b/shared/source/kernel/kernel_descriptor.cpp @@ -121,4 +121,45 @@ bool KernelDescriptor::isBindlessAddressingKernel(const KernelDescriptor &desc) return bindlessBuffers || bindlessImages; } +void KernelDescriptor::initBindlessOffsetToSurfaceState() { + std::call_once(initBindlessArgsMapOnce, [this]() { + uint32_t index = 0; + + for (size_t i = 0; i < this->payloadMappings.explicitArgs.size(); i++) { + + switch (this->payloadMappings.explicitArgs[i].type) { + case ArgDescriptor::ArgType::ArgTImage: { + auto &argImage = this->payloadMappings.explicitArgs[i].as(); + if (isValidOffset(argImage.bindless)) { + this->bindlessArgsMap.emplace(std::pair{argImage.bindless, index++}); + } + } break; + case ArgDescriptor::ArgType::ArgTPointer: { + auto &argPtr = payloadMappings.explicitArgs[i].as(); + if (isValidOffset(argPtr.bindless)) { + this->bindlessArgsMap.emplace(std::pair{argPtr.bindless, index++}); + } + } break; + default: + break; + } + } + + StackVec implicitArgsVec({&this->payloadMappings.implicitArgs.printfSurfaceAddress, + &this->payloadMappings.implicitArgs.globalVariablesSurfaceAddress, + &this->payloadMappings.implicitArgs.globalConstantsSurfaceAddress, + &this->payloadMappings.implicitArgs.privateMemoryAddress, + &this->payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress, + &this->payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress, + &this->payloadMappings.implicitArgs.systemThreadSurfaceAddress, + &this->payloadMappings.implicitArgs.syncBufferAddress}); + + for (size_t i = 0; i < implicitArgsVec.size(); i++) { + if (isValidOffset(implicitArgsVec[i]->bindless)) { + this->bindlessArgsMap.emplace(std::pair{implicitArgsVec[i]->bindless, index++}); + } + } + }); +} + } // namespace NEO \ No newline at end of file diff --git a/shared/source/kernel/kernel_descriptor.h b/shared/source/kernel/kernel_descriptor.h index 5948858a4c..36d5f418e8 100644 --- a/shared/source/kernel/kernel_descriptor.h +++ b/shared/source/kernel/kernel_descriptor.h @@ -17,12 +17,14 @@ #include #include #include +#include #include #include namespace NEO { using StringMap = std::unordered_map; +using BindlessToSurfaceStateMap = std::unordered_map; using InstructionsSegmentOffset = uint16_t; struct KernelDescriptor { @@ -41,6 +43,10 @@ struct KernelDescriptor { virtual ~KernelDescriptor() = default; void updateCrossThreadDataSize(); + void initBindlessOffsetToSurfaceState(); + const BindlessToSurfaceStateMap &getBindlessOffsetToSurfaceState() const { + return bindlessArgsMap; + } struct KernelAttributes { uint32_t slmInlineSize = 0U; @@ -229,6 +235,9 @@ struct KernelDescriptor { std::vector generatedSsh; std::vector generatedDsh; + + BindlessToSurfaceStateMap bindlessArgsMap; + std::once_flag initBindlessArgsMapOnce; }; } // namespace NEO diff --git a/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp b/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp index 822c756858..5c30e1f1f9 100644 --- a/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp +++ b/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp @@ -527,6 +527,10 @@ void populateKernelDescriptor(KernelDescriptor &dst, const PatchTokenBinary::Ker if (DebugManager.flags.UpdateCrossThreadDataSize.get()) { dst.updateCrossThreadDataSize(); } + + if (KernelDescriptor::isBindlessAddressingKernel(dst)) { + dst.initBindlessOffsetToSurfaceState(); + } } } // namespace NEO diff --git a/shared/source/xe_hpg_core/hw_cmds_xe_hpg_core_base.h b/shared/source/xe_hpg_core/hw_cmds_xe_hpg_core_base.h index 4e512fc86f..7954c5412a 100644 --- a/shared/source/xe_hpg_core/hw_cmds_xe_hpg_core_base.h +++ b/shared/source/xe_hpg_core/hw_cmds_xe_hpg_core_base.h @@ -73,7 +73,7 @@ struct XeHpgCore { struct DataPortBindlessSurfaceExtendedMessageDescriptor { union { struct { - uint32_t bindlessSurfaceOffset : 25; + uint32_t bindlessSurfaceOffset : 26; uint32_t reserved : 6; }; uint32_t packed; diff --git a/shared/test/unit_test/device_binary_format/device_binary_formats_tests.cpp b/shared/test/unit_test/device_binary_format/device_binary_formats_tests.cpp index ab5a89a540..75afcade09 100644 --- a/shared/test/unit_test/device_binary_format/device_binary_formats_tests.cpp +++ b/shared/test/unit_test/device_binary_format/device_binary_formats_tests.cpp @@ -11,6 +11,7 @@ #include "shared/source/device_binary_format/device_binary_formats.h" #include "shared/source/device_binary_format/elf/elf_encoder.h" #include "shared/source/device_binary_format/elf/ocl_elf.h" +#include "shared/source/program/kernel_info.h" #include "shared/source/program/program_info.h" #include "shared/test/common/device_binary_format/patchtokens_tests.h" #include "shared/test/common/mocks/mock_execution_environment.h" @@ -357,6 +358,51 @@ TEST(DecodeSingleDeviceBinary, GivenArFormatThenDecodingFails) { EXPECT_STREQ("Device binary format is packed", decodeErrors.c_str()); } +TEST(DecodeSingleDeviceBinary, GivenBindlessKernelInZebinWhenDecodingThenKernelDescriptorInitilizesBindlessOffsetToSurfaceIndex) { + NEO::MockExecutionEnvironment mockExecutionEnvironment{}; + auto &gfxCoreHelper = mockExecutionEnvironment.rootDeviceEnvironments[0]->getHelper(); + + std::string validZeInfo = std::string("version :\'") + versionToString(NEO::Zebin::ZeInfo::zeInfoDecoderVersion) + R"===(' +kernels: + - name : kernel_bindless + execution_env: + simd_size: 8 + payload_arguments: + - arg_type: arg_bypointer + offset: 0 + size: 4 + arg_index: 0 + addrmode: bindless + addrspace: global + access_type: readwrite +... +)==="; + + uint8_t kernelIsa[8]{0U}; + ZebinTestData::ValidEmptyProgram zebin; + zebin.removeSection(NEO::Zebin::Elf::SHT_ZEBIN::SHT_ZEBIN_ZEINFO, NEO::Zebin::Elf::SectionNames::zeInfo); + zebin.appendSection(NEO::Zebin::Elf::SHT_ZEBIN::SHT_ZEBIN_ZEINFO, NEO::Zebin::Elf::SectionNames::zeInfo, ArrayRef::fromAny(validZeInfo.data(), validZeInfo.size())); + zebin.appendSection(NEO::Elf::SHT_PROGBITS, NEO::Zebin::Elf::SectionNames::textPrefix.str() + "kernel_bindless", {kernelIsa, sizeof(kernelIsa)}); + zebin.elfHeader->machine = NEO::defaultHwInfo->platform.eProductFamily; + + NEO::ProgramInfo programInfo; + std::string decodeErrors; + std::string decodeWarnings; + NEO::SingleDeviceBinary bin; + bin.deviceBinary = zebin.storage; + NEO::DecodeError status; + NEO::DeviceBinaryFormat format; + std::tie(status, format) = NEO::decodeSingleDeviceBinary(programInfo, bin, decodeErrors, decodeWarnings, gfxCoreHelper); + EXPECT_EQ(NEO::DecodeError::Success, status); + EXPECT_EQ(NEO::DeviceBinaryFormat::Zebin, format); + EXPECT_TRUE(decodeWarnings.empty()); + + ASSERT_EQ(1u, programInfo.kernelInfos.size()); + + EXPECT_TRUE(NEO::KernelDescriptor::isBindlessAddressingKernel(programInfo.kernelInfos[0]->kernelDescriptor)); + EXPECT_EQ(1u, programInfo.kernelInfos[0]->kernelDescriptor.bindlessArgsMap.size()); +} + TEST(PackDeviceBinary, GivenRequestToPackThenUsesOclElfFormat) { NEO::SingleDeviceBinary deviceBinary; diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp index 590af5da21..b2d323c570 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp @@ -968,12 +968,12 @@ HWTEST2_F(EncodeDispatchKernelTest, givenBindfulKernelWhenDispatchingKernelThenS EXPECT_NE(usedAfter, usedBefore); } -HWTEST2_F(EncodeDispatchKernelTest, givenBindlessKernelWhenDispatchingKernelThenThenSshFromContainerIsNotUsed, IsAtLeastSkl) { - using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE; +HWTEST2_F(EncodeDispatchKernelTest, givenBindlessKernelWhenDispatchingKernelThenThenSshFromContainerIsUsed, IsAtLeastSkl) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; using WALKER = typename FamilyType::WALKER_TYPE; uint32_t numBindingTable = 1; - BINDING_TABLE_STATE bindingTableState = FamilyType::cmdInitBindingTableState; + RENDER_SURFACE_STATE state = FamilyType::cmdInitRenderSurfaceState; uint32_t dims[] = {1, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); @@ -982,9 +982,9 @@ HWTEST2_F(EncodeDispatchKernelTest, givenBindlessKernelWhenDispatchingKernelThen dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.tableOffset = 0U; dispatchInterface->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless; - const uint8_t *sshData = reinterpret_cast(&bindingTableState); + const uint8_t *sshData = reinterpret_cast(&state); dispatchInterface->getSurfaceStateHeapDataResult = const_cast(sshData); - dispatchInterface->getSurfaceStateHeapDataSizeResult = static_cast(sizeof(BINDING_TABLE_STATE)); + dispatchInterface->getSurfaceStateHeapDataSizeResult = static_cast(sizeof(RENDER_SURFACE_STATE)); bool requiresUncachedMocs = false; auto usedBefore = cmdContainer->getIndirectHeap(HeapType::SURFACE_STATE)->getUsed(); @@ -994,7 +994,7 @@ HWTEST2_F(EncodeDispatchKernelTest, givenBindlessKernelWhenDispatchingKernelThen auto usedAfter = cmdContainer->getIndirectHeap(HeapType::SURFACE_STATE)->getUsed(); - EXPECT_EQ(usedAfter, usedBefore); + EXPECT_NE(usedAfter, usedBefore); } HWTEST_F(EncodeDispatchKernelTest, givenNonBindlessOrStatelessArgWhenDispatchingKernelThenSurfaceStateOffsetInCrossThreadDataIsNotPatched) { @@ -1462,6 +1462,26 @@ HWTEST_F(CommandEncodeStatesTest, givenKernelInfoWhenGettingRequiredSshSpaceThen EXPECT_EQ(expectedSize, size); } +HWTEST_F(CommandEncodeStatesTest, givenKernelInfoOfBindlessKernelWhenGettingRequiredSshSpaceThenReturnCorrectValues) { + using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE; + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + + kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + kernelInfo.kernelDescriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + // no surface states + kernelInfo.heapInfo.surfaceStateHeapSize = 0; + kernelInfo.kernelDescriptor.kernelAttributes.numArgsStateful = 0; + size_t size = EncodeDispatchKernel::getSizeRequiredSsh(kernelInfo); + EXPECT_EQ(0u, size); + + // two surface states + kernelInfo.kernelDescriptor.kernelAttributes.numArgsStateful = 2; + size_t expectedSize = alignUp(2 * sizeof(RENDER_SURFACE_STATE), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + + size = EncodeDispatchKernel::getSizeRequiredSsh(kernelInfo); + EXPECT_EQ(expectedSize, size); +} + HWTEST_F(CommandEncodeStatesTest, givenCommandContainerWhenIsKernelDispatchedFromImmediateCmdListTrueThenGetHeapWithRequiredSizeAndAlignmentCalled) { std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); diff --git a/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp b/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp index 82fa28bc5b..002fa00f00 100644 --- a/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp +++ b/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp @@ -1590,4 +1590,34 @@ TEST(KernelDescriptorFromPatchtokens, givenDataParameterImplArgBufferTokenWhenPo EXPECT_EQ(offset, kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs); -} \ No newline at end of file +} + +TEST(KernelDescriptorFromPatchtokens, GivenBindlessArgsWhenPopulatingDescriptorThenBindlessOffsetToSurfaceStateIndexIsInitialized) { + NEO::PatchTokenBinary::KernelFromPatchtokens kernelTokens; + iOpenCL::SKernelBinaryHeaderCommon kernelHeader; + kernelTokens.header = &kernelHeader; + NEO::KernelDescriptor kernelDescriptor; + + iOpenCL::SPatchExecutionEnvironment execEnv = {}; + execEnv.UseBindlessMode = 1; + kernelTokens.tokens.executionEnvironment = &execEnv; + + iOpenCL::SPatchGlobalMemoryObjectKernelArgument globalMemArg = {}; + globalMemArg.Token = iOpenCL::PATCH_TOKEN_GLOBAL_MEMORY_OBJECT_KERNEL_ARGUMENT; + globalMemArg.ArgumentNumber = 0; + globalMemArg.Offset = 0x40; + + kernelTokens.tokens.kernelArgs.resize(1); + kernelTokens.tokens.kernelArgs[0].objectArg = &globalMemArg; + + NEO::populateKernelDescriptor(kernelDescriptor, kernelTokens, sizeof(void *)); + + EXPECT_EQ(NEO::KernelDescriptor::BindlessAndStateless, kernelDescriptor.kernelAttributes.bufferAddressingMode); + + EXPECT_TRUE(NEO::isUndefinedOffset(kernelDescriptor.payloadMappings.explicitArgs[0].as().bindful)); + EXPECT_EQ(0x40, kernelDescriptor.payloadMappings.explicitArgs[0].as().bindless); + + ASSERT_EQ(1u, kernelDescriptor.bindlessArgsMap.size()); + EXPECT_EQ(0u, kernelDescriptor.bindlessArgsMap[0x40]); + EXPECT_EQ(1u, kernelDescriptor.bindlessArgsMap.size()); +} diff --git a/shared/test/unit_test/kernel/kernel_descriptor_tests.cpp b/shared/test/unit_test/kernel/kernel_descriptor_tests.cpp index a03d83c7db..e70ddbf070 100644 --- a/shared/test/unit_test/kernel/kernel_descriptor_tests.cpp +++ b/shared/test/unit_test/kernel/kernel_descriptor_tests.cpp @@ -127,3 +127,89 @@ TEST(KernelDescriptor, GivenBufferOrImageBindlessAddressingWhenIsBindlessAddress desc.kernelAttributes.flags.usesImages = true; EXPECT_TRUE(NEO::KernelDescriptor::isBindlessAddressingKernel(desc)); } + +TEST(KernelDescriptor, GivenDescriptorWithBindlessArgsWhenInitBindlessOffsetsToSurfaceStateCalledThenMapIsInitializedOnceAndReturnsCorrectSurfaceIndices) { + NEO::KernelDescriptor desc; + + desc.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + desc.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor.as() = NEO::ArgDescPointer(); + argDescriptor.as().bindful = NEO::undefined; + argDescriptor.as().bindless = 0x40; + + desc.payloadMappings.explicitArgs.push_back(argDescriptor); + + auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor2.as() = NEO::ArgDescPointer(); + argDescriptor2.as().bindful = NEO::undefined; + argDescriptor2.as().bindless = NEO::undefined; + argDescriptor2.as().stateless = 0x80; + + desc.payloadMappings.explicitArgs.push_back(argDescriptor2); + + auto argDescriptor3 = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTImage); + argDescriptor3.as() = NEO::ArgDescImage(); + argDescriptor3.as().bindful = NEO::undefined; + argDescriptor3.as().bindless = 0x100; + + desc.payloadMappings.explicitArgs.push_back(argDescriptor3); + + auto argDescriptor4 = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTImage); + argDescriptor4.as() = NEO::ArgDescImage(); + argDescriptor4.as().bindful = NEO::undefined; + argDescriptor4.as().bindless = NEO::undefined; + + desc.payloadMappings.explicitArgs.push_back(argDescriptor4); + + argDescriptor.as().bindless = 0x80; + desc.payloadMappings.implicitArgs.globalVariablesSurfaceAddress = argDescriptor.as(); + + desc.initBindlessOffsetToSurfaceState(); + EXPECT_EQ(3u, desc.bindlessArgsMap.size()); + + EXPECT_EQ(0u, desc.bindlessArgsMap[0x40]); + EXPECT_EQ(1u, desc.bindlessArgsMap[0x100]); + EXPECT_EQ(2u, desc.bindlessArgsMap[0x80]); + + EXPECT_EQ(0u, desc.getBindlessOffsetToSurfaceState().find(0x40)->second); + EXPECT_EQ(1u, desc.getBindlessOffsetToSurfaceState().find(0x100)->second); + EXPECT_EQ(2u, desc.getBindlessOffsetToSurfaceState().find(0x80)->second); + + desc.bindlessArgsMap.clear(); + desc.initBindlessOffsetToSurfaceState(); + EXPECT_EQ(0u, desc.bindlessArgsMap.size()); +} + +TEST(KernelDescriptor, GivenDescriptorWithoutStatefulArgsWhenInitBindlessOffsetsToSurfaceStateCalledThenMapOfBindlessOffsetToSurfaceStateIndexIsEmpty) { + NEO::KernelDescriptor desc; + + desc.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + desc.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor.as() = NEO::ArgDescPointer(); + argDescriptor.as().bindful = NEO::undefined; + argDescriptor.as().bindless = NEO::undefined; + argDescriptor.as().stateless = 0x40; + + desc.payloadMappings.explicitArgs.push_back(argDescriptor); + + auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor2.as() = NEO::ArgDescPointer(); + argDescriptor2.as().bindful = NEO::undefined; + argDescriptor2.as().bindless = NEO::undefined; + argDescriptor2.as().stateless = 0x80; + + desc.payloadMappings.explicitArgs.push_back(argDescriptor2); + + NEO::ArgDescValue::Element argValueElement; + argValueElement.offset = 0x80; + auto argDescriptor3 = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTValue); + argDescriptor3.as().elements.push_back(argValueElement); + desc.payloadMappings.explicitArgs.push_back(argDescriptor3); + + desc.initBindlessOffsetToSurfaceState(); + EXPECT_EQ(0u, desc.bindlessArgsMap.size()); +} \ No newline at end of file diff --git a/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h b/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h index c5c391ab94..69ac2c59c2 100644 --- a/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h +++ b/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI { NEO::ImplicitArgs *getImplicitArgs() const override { return nullptr; } + void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const override { return; }; + MockGraphicsAllocation mockAllocation{}; static constexpr uint32_t crossThreadSize = 0x40; static constexpr uint32_t perThreadSize = 0x20;