From 6e7910546671971c575a72de02c3c6e76125a7e6 Mon Sep 17 00:00:00 2001 From: Mateusz Hoppe Date: Tue, 21 Apr 2020 22:40:21 +0200 Subject: [PATCH] Bindless addressing support for OCL Related-To: NEO-4607 Change-Id: Iaf4a8d45f22d134366e398a196bdd8dc906ab6ab Signed-off-by: Mateusz Hoppe --- .../device_queue/device_queue_hw_bdw_plus.inl | 1 + .../source/gen11/state_base_address_gen11.cpp | 1 + .../gen12lp/state_base_address_gen12lp.cpp | 1 + .../source/gen8/state_base_address_gen8.cpp | 1 + .../source/gen9/state_base_address_gen9.cpp | 1 + .../helpers/hardware_commands_helper_base.inl | 6 +- opencl/source/kernel/kernel.cpp | 25 ++++++ opencl/source/kernel/kernel.h | 1 + opencl/test/unit_test/CMakeLists.txt | 53 +++++++----- .../enqueue_kernel_aub_tests.cpp | 76 +++++++++++++++++ .../aub_tests/fixtures/aub_fixture.h | 4 +- .../fixtures/simple_arg_kernel_fixture.h | 42 ++++++++++ .../kernel/kernel_arg_buffer_tests.cpp | 66 +++++++++++++++ .../kernel/kernel_image_arg_tests.cpp | 83 +++++++++++++++++++ shared/source/helpers/CMakeLists.txt | 2 + .../source/helpers/state_base_address_bdw.inl | 22 +++++ .../helpers/state_base_address_bdw_plus.inl | 10 --- .../helpers/state_base_address_skl_plus.inl | 29 +++++++ shared/test/unit_test/gen8/CMakeLists.txt | 1 + .../gen8/state_base_address_tests_gen8.cpp | 28 +++++++ shared/test/unit_test/helpers/CMakeLists.txt | 2 + .../helpers/state_base_address_tests.cpp | 60 ++++++++++++++ .../helpers/state_base_address_tests.h | 36 ++++++++ 23 files changed, 519 insertions(+), 32 deletions(-) create mode 100644 shared/source/helpers/state_base_address_bdw.inl create mode 100644 shared/source/helpers/state_base_address_skl_plus.inl create mode 100644 shared/test/unit_test/gen8/state_base_address_tests_gen8.cpp create mode 100644 shared/test/unit_test/helpers/state_base_address_tests.cpp create mode 100644 shared/test/unit_test/helpers/state_base_address_tests.h diff --git a/opencl/source/device_queue/device_queue_hw_bdw_plus.inl 
b/opencl/source/device_queue/device_queue_hw_bdw_plus.inl index 9b8718c163..26e0e4e914 100644 --- a/opencl/source/device_queue/device_queue_hw_bdw_plus.inl +++ b/opencl/source/device_queue/device_queue_hw_bdw_plus.inl @@ -183,6 +183,7 @@ void DeviceQueueHw::setupIndirectState(IndirectHeap &surfaceStateHeap totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + surfaceStateHeap.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); auto btOffset = HardwareCommandsHelper::pushBindingTableAndSurfaceStates(surfaceStateHeap, bindingTableCount, pBlockInfo->heapInfo.pSsh, pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize, diff --git a/opencl/source/gen11/state_base_address_gen11.cpp b/opencl/source/gen11/state_base_address_gen11.cpp index a421d4e2e7..b194431a2d 100644 --- a/opencl/source/gen11/state_base_address_gen11.cpp +++ b/opencl/source/gen11/state_base_address_gen11.cpp @@ -7,6 +7,7 @@ #include "shared/source/helpers/state_base_address.h" #include "shared/source/helpers/state_base_address_bdw_plus.inl" +#include "shared/source/helpers/state_base_address_skl_plus.inl" namespace NEO { template struct StateBaseAddressHelper; diff --git a/opencl/source/gen12lp/state_base_address_gen12lp.cpp b/opencl/source/gen12lp/state_base_address_gen12lp.cpp index d657d73d70..1f3bba94af 100644 --- a/opencl/source/gen12lp/state_base_address_gen12lp.cpp +++ b/opencl/source/gen12lp/state_base_address_gen12lp.cpp @@ -7,6 +7,7 @@ #include "shared/source/helpers/state_base_address.h" #include "shared/source/helpers/state_base_address_bdw_plus.inl" +#include "shared/source/helpers/state_base_address_skl_plus.inl" namespace NEO { template struct StateBaseAddressHelper; diff --git a/opencl/source/gen8/state_base_address_gen8.cpp b/opencl/source/gen8/state_base_address_gen8.cpp index c0c81ec39e..4305403be0 100644 --- a/opencl/source/gen8/state_base_address_gen8.cpp +++ 
b/opencl/source/gen8/state_base_address_gen8.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/helpers/state_base_address.h" +#include "shared/source/helpers/state_base_address_bdw.inl" #include "shared/source/helpers/state_base_address_bdw_plus.inl" namespace NEO { diff --git a/opencl/source/gen9/state_base_address_gen9.cpp b/opencl/source/gen9/state_base_address_gen9.cpp index 4f85d26fd2..d079f246b7 100644 --- a/opencl/source/gen9/state_base_address_gen9.cpp +++ b/opencl/source/gen9/state_base_address_gen9.cpp @@ -7,6 +7,7 @@ #include "shared/source/helpers/state_base_address.h" #include "shared/source/helpers/state_base_address_bdw_plus.inl" +#include "shared/source/helpers/state_base_address_skl_plus.inl" namespace NEO { template struct StateBaseAddressHelper; diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 5d325ce138..73be8bf48e 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -237,8 +237,7 @@ size_t HardwareCommandsHelper::pushBindingTableAndSurfaceStates(Indir DEBUG_BREAK_IF(srcKernelSsh == nullptr); auto srcSurfaceState = srcKernelSsh; - // Align the heap and allocate space for new ssh data - dstHeap.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + // Allocate space for new ssh data auto dstSurfaceState = dstHeap.getSpace(sshSize); // Compiler sends BTI table that is already populated with surface state pointers relative to local SSH. @@ -297,6 +296,9 @@ size_t HardwareCommandsHelper::sendIndirectState( const auto &kernelInfo = kernel.getKernelInfo(); const auto &patchInfo = kernelInfo.patchInfo; + ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + kernel.patchBindlessSurfaceStateOffsets(ssh.getUsed()); + auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? 
kernelInfo.patchInfo.bindingTableState->Count : 0, kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(), kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());
diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp
index 0e64335fbe..0e6544edc8 100644
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -2446,4 +2446,29 @@ uint64_t Kernel::getKernelStartOffset(
     return kernelStartOffset;
 }
 
+// Patches (sshOffset + offsetHeap) << 6 into cross-thread data for bindless buffer/image args.
+void Kernel::patchBindlessSurfaceStateOffsets(const size_t sshOffset) {
+    const bool bindlessBuffers = DebugManager.flags.UseBindlessBuffers.get();
+    const bool bindlessImages = DebugManager.flags.UseBindlessImages.get();
+    if (!bindlessBuffers && !bindlessImages) {
+        return;
+    }
+
+    // Each offset is written as a 4-byte value at the argument's first patch-info offset.
+    const uint32_t patchSize = 4;
+    for (const auto &argInfo : kernelInfo.kernelArgInfo) {
+        const bool patchBuffer = argInfo.isBuffer && bindlessBuffers;
+        const bool patchImage = argInfo.isImage && bindlessImages;
+        if (!patchBuffer && !patchImage) {
+            continue;
+        }
+        auto patchLocation = ptrOffset(getCrossThreadData(),
+                                       argInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
+        // The compiler does not shift the surface offset by 6 itself, so it is done here.
+        uint64_t patchValue = sshOffset + argInfo.offsetHeap;
+        patchValue <<= 6;
+        patchWithRequiredSize(patchLocation, patchSize, patchValue);
+    }
+}
+
 } // namespace NEO
diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h
index f7a160ceba..d8c855e688 100644
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@@ -219,6 +219,7 @@ class Kernel : public BaseObject<_cl_kernel> {
     void patchBlocksSimdSize();
     bool usesSyncBuffer();
     void patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset);
+    void patchBindlessSurfaceStateOffsets(const size_t sshOffset);
 
     GraphicsAllocation *getKernelReflectionSurface() const {
         return kernelReflectionSurface;
diff --git a/opencl/test/unit_test/CMakeLists.txt
b/opencl/test/unit_test/CMakeLists.txt index 6eef1f67fe..3d765d3317 100644 --- a/opencl/test/unit_test/CMakeLists.txt +++ b/opencl/test/unit_test/CMakeLists.txt @@ -261,7 +261,7 @@ function(neo_gen_kernels_with_options platform_name_with_type platform_name suff set(kernels_to_compile_${platform_name_with_type} ${kernels_to_compile_${platform_name_with_type}} PARENT_SCOPE) endfunction() -function(neo_gen_kernels_with_internal_options platform_name_with_type platform_name suffix filepath) +function(neo_gen_kernels_with_internal_options platform_name_with_type platform_name suffix filepath output_name) set(kernels_to_compile) foreach(filearg ${filepath}) get_filename_component(filename ${filearg} NAME) @@ -271,24 +271,29 @@ function(neo_gen_kernels_with_internal_options platform_name_with_type platform_ set(outputdir "${TargetDir}/${suffix}/test_files/${NEO_ARCH}/") set(workdir "${CMAKE_CURRENT_SOURCE_DIR}/${base_workdir}/") - foreach(arg ${ARGN}) + if (NOT "${output_name}" STREQUAL "") + set(basename ${output_name}) + endif() - set(outputpath_base "${outputdir}/${basename}_${suffix}") - set(output_files - ${outputpath_base}.spv - ${outputpath_base}.bin - ${outputpath_base}.gen - ) + set(outputpath_base "${outputdir}/${basename}_${suffix}") + set(output_files + ${outputpath_base}.spv + ${outputpath_base}.bin + ${outputpath_base}.gen + ) - add_custom_command( - OUTPUT ${output_files} - COMMAND ${cloc_cmd_prefix} -q -file ${filename} -device ${platform_name} -${NEO_BITS} -out_dir ${outputdir} -internal_options ${arg} - WORKING_DIRECTORY ${workdir} - DEPENDS ${filearg} ocloc - ) + if (NOT "${output_name}" STREQUAL "") + set(output_name -output ${output_name}) + endif() - list(APPEND kernels_to_compile ${output_files}) - endforeach() + add_custom_command( + OUTPUT ${output_files} + COMMAND ${cloc_cmd_prefix} -q -file ${filename} -device ${platform_name} -${NEO_BITS} -out_dir ${outputdir} ${output_name} -internal_options ${ARGN} + WORKING_DIRECTORY ${workdir} + DEPENDS 
${filearg} ocloc + ) + + list(APPEND kernels_to_compile ${output_files}) endforeach() list(APPEND kernels_to_compile_${platform_name_with_type} ${kernels_to_compile}) set(kernels_to_compile_${platform_name_with_type} ${kernels_to_compile_${platform_name_with_type}} PARENT_SCOPE) @@ -378,7 +383,6 @@ set(TEST_KERNEL_STATELESS_internal_options_gen9lp "-cl-intel-greater-than-4GB-buffer-required -m32" ) - set(TEST_KERNEL_STATELESS test_files/stateless_kernel.cl ) @@ -397,6 +401,14 @@ set(TEST_KERNEL_SIP_DEBUG_LOCAL_options "-cl-include-sip-kernel-local-debug -cl-include-sip-csr -cl-set-bti:0" ) +set(TEST_KERNEL_BINDLESS_internal_options + "-cl-intel-use-bindless-buffers -cl-intel-use-bindless-images" +) + +set(TEST_KERNEL_BINDLESS + test_files/stateful_copy_buffer.cl +) + file(GLOB_RECURSE TEST_KERNELS test_files/*.cl) list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/test_files/shouldfail.cl") list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/test_files/simple_block_kernel.cl") @@ -429,10 +441,13 @@ macro(macro_for_each_gen) endif() # Gen9lp needs extra -m32 flag if( ("${GEN_TYPE_LOWER}" STREQUAL "gen9") AND ("${PLATFORM_TYPE_LOWER}" STREQUAL "lp")) - neo_gen_kernels_with_internal_options(${family_name_with_type} ${PLATFORM_LOWER} ${family_name_with_type} ${TEST_KERNEL_STATELESS} ${TEST_KERNEL_STATELESS_internal_options_gen9lp}) + neo_gen_kernels_with_internal_options(${family_name_with_type} ${PLATFORM_LOWER} ${family_name_with_type} ${TEST_KERNEL_STATELESS} "" ${TEST_KERNEL_STATELESS_internal_options_gen9lp} ) else() - neo_gen_kernels_with_internal_options(${family_name_with_type} ${PLATFORM_LOWER} ${family_name_with_type} ${TEST_KERNEL_STATELESS} ${TEST_KERNEL_STATELESS_internal_options}) + neo_gen_kernels_with_internal_options(${family_name_with_type} ${PLATFORM_LOWER} ${family_name_with_type} ${TEST_KERNEL_STATELESS} "" ${TEST_KERNEL_STATELESS_internal_options}) endif() + + neo_gen_kernels_with_internal_options(${family_name_with_type} 
${PLATFORM_LOWER} ${family_name_with_type} ${TEST_KERNEL_BINDLESS} "bindless_copy_buffer" ${TEST_KERNEL_BINDLESS_internal_options}) + set(sip_kernel_file_name) set(sip_kernel_output_file) set(sip_debug_kernel_output_file) diff --git a/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp b/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp index 04429cc847..da99d02ff0 100644 --- a/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp +++ b/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp @@ -870,3 +870,79 @@ HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNo expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } + +using AUBBindlessKernel = Test>; +using IsBetweenSklAndTgllp = IsWithinProducts; + +HWTEST2_F(AUBBindlessKernel, givenBindlessCopyKernelWhenEnqueuedThenResultsValidate, IsBetweenSklAndTgllp) { + constexpr size_t bufferSize = MemoryConstants::pageSize; + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {bufferSize / 2, 1, 1}; + size_t localWorkSize[3] = {1, 1, 1}; + ; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; + + uint8_t bufferDataSrc[bufferSize]; + uint8_t bufferDataDst[bufferSize]; + + memset(bufferDataSrc, 1, bufferSize); + memset(bufferDataDst, 0, bufferSize); + + auto pBufferSrc = std::unique_ptr(Buffer::create(context, + CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + bufferSize, + bufferDataSrc, + retVal)); + ASSERT_NE(nullptr, pBufferSrc); + + auto pBufferDst = std::unique_ptr(Buffer::create(context, + CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + bufferSize, + bufferDataDst, + retVal)); + ASSERT_NE(nullptr, pBufferDst); + + auto simulatedCsr = AUBFixture::getSimulatedCsr(); + + 
simulatedCsr->writeMemory(*pBufferSrc->getGraphicsAllocation()); + simulatedCsr->writeMemory(*pBufferDst->getGraphicsAllocation()); + + //Src + kernel->setArg(0, pBufferSrc.get()); + //Dst + kernel->setArg(1, pBufferDst.get()); + + retVal = this->pCmdQ->enqueueKernel( + kernel.get(), + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + + ASSERT_EQ(CL_SUCCESS, retVal); + + globalWorkOffset[0] = bufferSize / 2; + retVal = this->pCmdQ->enqueueKernel( + kernel.get(), + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + + ASSERT_EQ(CL_SUCCESS, retVal); + + EXPECT_TRUE(this->kernel->getKernelInfo().kernelArgInfo[0].pureStatefulBufferAccess); + + this->pCmdQ->finish(); + expectMemory(reinterpret_cast(pBufferDst->getGraphicsAllocation()->getGpuAddress()), + bufferDataSrc, bufferSize); +} \ No newline at end of file diff --git a/opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h b/opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h index 445b0dcd1e..5d37d637f2 100644 --- a/opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h +++ b/opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h @@ -43,13 +43,15 @@ class AUBFixture : public CommandQueueHwFixture { executionEnvironment = platform()->peekExecutionEnvironment(); executionEnvironment->prepareRootDeviceEnvironments(1u); executionEnvironment->rootDeviceEnvironments[0]->setHwInfo(&hwInfo); + + device = std::make_unique(MockDevice::create(executionEnvironment, deviceIndex)); + if (testMode == TestMode::AubTestsWithTbx) { this->csr = TbxCommandStreamReceiver::create(strfilename.str(), true, *executionEnvironment, 0); } else { this->csr = AUBCommandStreamReceiver::create(strfilename.str(), true, *executionEnvironment, 0); } - device = std::make_unique(MockDevice::create(executionEnvironment, deviceIndex)); device->resetCommandStreamReceiver(this->csr); 
CommandQueueHwFixture::SetUp(AUBFixture::device.get(), cl_command_queue_properties(0)); diff --git a/opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h b/opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h index e14556c430..c4ef3fe7aa 100644 --- a/opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h +++ b/opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h @@ -284,4 +284,46 @@ class SimpleKernelStatelessFixture : public ProgramFixture { cl_int retVal = CL_SUCCESS; }; +class BindlessKernelFixture : public ProgramFixture { + public: + using ProgramFixture::SetUp; + void SetUp(ClDevice *device, Context *context) { + ProgramFixture::SetUp(); + cl_device_id deviceId = device; + cl_context clContext = context; + DebugManager.flags.UseBindlessBuffers.set(true); + DebugManager.flags.UseBindlessImages.set(true); + + CreateProgramFromBinary( + clContext, + &deviceId, + "bindless_copy_buffer"); + ASSERT_NE(nullptr, pProgram); + + retVal = pProgram->build( + 1, + &deviceId, + nullptr, + nullptr, + nullptr, + false); + ASSERT_EQ(CL_SUCCESS, retVal); + + kernel.reset(Kernel::create( + pProgram, + *pProgram->getKernelInfo("StatefulCopyBuffer"), + &retVal)); + ASSERT_NE(nullptr, kernel); + ASSERT_EQ(CL_SUCCESS, retVal); + } + + void TearDown() override { + ProgramFixture::TearDown(); + } + + DebugManagerStateRestore restorer; + std::unique_ptr kernel = nullptr; + cl_int retVal = CL_SUCCESS; +}; + } // namespace NEO diff --git a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp index e71710b3aa..cb72270257 100644 --- a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp @@ -5,6 +5,8 @@ * */ +#include "shared/test/unit_test/helpers/debug_manager_state_restore.h" + #include "opencl/source/kernel/kernel.h" #include "opencl/source/mem_obj/buffer.h" #include "opencl/test/unit_test/fixtures/context_fixture.h" @@ -207,3 +209,67 
@@ TEST_F(KernelArgBufferTest, givenNoCacheFlushBufferWhenSettingAsArgThenNotExpect EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_EQ(nullptr, pKernel->kernelArgRequiresCacheFlush[0]); } + +TEST_F(KernelArgBufferTest, givenUsedBindlessBuffersWhenPatchingSurfaceStateOffsetsThenCorrectOffsetIsPatchedInCrossThreadData) { + DebugManagerStateRestore restorer; + DebugManager.flags.UseBindlessBuffers.set(1); + + pKernelInfo->usesSsh = true; + pKernelInfo->requiresSshForBuffers = true; + + auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset; + pKernelInfo->kernelArgInfo[0].offsetHeap = 64; + pKernelInfo->kernelArgInfo[0].isBuffer = true; + + auto patchLocation = reinterpret_cast(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset)); + *patchLocation = 0xdead; + + uint32_t sshOffset = 4000; + pKernel->patchBindlessSurfaceStateOffsets(sshOffset); + auto expectedOffset = (sshOffset + pKernelInfo->kernelArgInfo[0].offsetHeap) << 6; + EXPECT_EQ(expectedOffset, *patchLocation); + + sshOffset = static_cast(maxNBitValue(20)) - 64; + pKernel->patchBindlessSurfaceStateOffsets(sshOffset); + expectedOffset = (sshOffset + pKernelInfo->kernelArgInfo[0].offsetHeap) << 6; + EXPECT_EQ(expectedOffset, *patchLocation); +} + +TEST_F(KernelArgBufferTest, givenUsedBindlessBuffersAndNonBufferArgWhenPatchingSurfaceStateOffsetsThenCrossThreadDataIsNotPatched) { + DebugManagerStateRestore restorer; + DebugManager.flags.UseBindlessBuffers.set(1); + + pKernelInfo->usesSsh = true; + pKernelInfo->requiresSshForBuffers = true; + + auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset; + pKernelInfo->kernelArgInfo[0].offsetHeap = 64; + pKernelInfo->kernelArgInfo[0].isBuffer = false; + + auto patchLocation = reinterpret_cast(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset)); + *patchLocation = 0xdead; + + uint32_t sshOffset = 4000; + pKernel->patchBindlessSurfaceStateOffsets(sshOffset); 
+ EXPECT_EQ(0xdeadu, *patchLocation); +} + +TEST_F(KernelArgBufferTest, givenNotUsedBindlessBuffersAndBufferArgWhenPatchingSurfaceStateOffsetsThenCrossThreadDataIsNotPatched) { + DebugManagerStateRestore restorer; + DebugManager.flags.UseBindlessBuffers.set(false); + DebugManager.flags.UseBindlessImages.set(true); + + pKernelInfo->usesSsh = true; + pKernelInfo->requiresSshForBuffers = true; + + auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset; + pKernelInfo->kernelArgInfo[0].offsetHeap = 64; + pKernelInfo->kernelArgInfo[0].isBuffer = true; + + auto patchLocation = reinterpret_cast(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset)); + *patchLocation = 0xdead; + + uint32_t sshOffset = 4000; + pKernel->patchBindlessSurfaceStateOffsets(sshOffset); + EXPECT_EQ(0xdeadu, *patchLocation); +} \ No newline at end of file diff --git a/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp b/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp index 6915be820a..ce11568e80 100644 --- a/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/helpers/ptr_math.h" +#include "shared/test/unit_test/helpers/debug_manager_state_restore.h" #include "opencl/source/helpers/memory_properties_flags_helpers.h" #include "opencl/source/kernel/kernel.h" @@ -324,3 +325,85 @@ TEST_F(KernelImageArgTest, givenNoCacheFlushImageWhenSettingAsArgThenExpectAlloc EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_EQ(nullptr, pKernel->kernelArgRequiresCacheFlush[0]); } + +TEST_F(KernelImageArgTest, givenUsedBindlessImagesWhenPatchingSurfaceStateOffsetsThenCorrectOffsetIsPatchedInCrossThreadData) { + DebugManagerStateRestore restorer; + DebugManager.flags.UseBindlessImages.set(1); + + pKernelInfo->usesSsh = true; + + for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) { + 
pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast(4 * i); + auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset; + auto patchLocation = reinterpret_cast(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset)); + *patchLocation = 0xdead; + } + + pKernelInfo->kernelArgInfo[pKernelInfo->kernelArgInfo.size() - 1].isImage = false; + + uint32_t sshOffset = 4000; + pKernel->patchBindlessSurfaceStateOffsets(sshOffset); + + for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) { + auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset; + auto patchLocation = reinterpret_cast(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset)); + + if (pKernelInfo->kernelArgInfo[i].isImage) { + auto expectedOffset = (sshOffset + pKernelInfo->kernelArgInfo[i].offsetHeap) << 6; + EXPECT_EQ(expectedOffset, *patchLocation); + } else { + EXPECT_EQ(0xdeadu, *patchLocation); + } + } +} + +TEST_F(KernelImageArgTest, givenUsedBindlessImagesAndNonImageArgWhenPatchingSurfaceStateOffsetsThenCrossThreadDataIsNotPatched) { + DebugManagerStateRestore restorer; + DebugManager.flags.UseBindlessImages.set(1); + + pKernelInfo->usesSsh = true; + + for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) { + pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast(4 * i); + auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset; + auto patchLocation = reinterpret_cast(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset)); + *patchLocation = 0xdead; + } + + int nonImageIndex = 1; + pKernelInfo->kernelArgInfo[nonImageIndex].isImage = false; + + uint32_t sshOffset = 4000; + pKernel->patchBindlessSurfaceStateOffsets(sshOffset); + + auto crossThreadDataOffset = 
pKernelInfo->kernelArgInfo[nonImageIndex].kernelArgPatchInfoVector[0].crossthreadOffset; + auto patchLocation = reinterpret_cast(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset)); + + EXPECT_EQ(0xdeadu, *patchLocation); +} + +TEST_F(KernelImageArgTest, givenNotUsedBindlessImagesAndImageArgWhenPatchingSurfaceStateOffsetsThenCrossThreadDataIsNotPatched) { + DebugManagerStateRestore restorer; + DebugManager.flags.UseBindlessImages.set(false); + DebugManager.flags.UseBindlessBuffers.set(true); + + pKernelInfo->usesSsh = true; + + for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) { + pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast(4 * i); + auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset; + auto patchLocation = reinterpret_cast(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset)); + *patchLocation = 0xdead; + } + + int nonImageIndex = 1; + pKernelInfo->kernelArgInfo[nonImageIndex].isImage = true; + + uint32_t sshOffset = 4000; + pKernel->patchBindlessSurfaceStateOffsets(sshOffset); + + auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[nonImageIndex].kernelArgPatchInfoVector[0].crossthreadOffset; + auto patchLocation = reinterpret_cast(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset)); + + EXPECT_EQ(0xdeadu, *patchLocation); +} \ No newline at end of file diff --git a/shared/source/helpers/CMakeLists.txt b/shared/source/helpers/CMakeLists.txt index 0cfa2882ab..9eb7271c63 100644 --- a/shared/source/helpers/CMakeLists.txt +++ b/shared/source/helpers/CMakeLists.txt @@ -70,6 +70,8 @@ set(NEO_CORE_HELPERS ${CMAKE_CURRENT_SOURCE_DIR}/simd_helper.h ${CMAKE_CURRENT_SOURCE_DIR}/state_base_address.h ${CMAKE_CURRENT_SOURCE_DIR}/state_base_address_base.inl + ${CMAKE_CURRENT_SOURCE_DIR}/state_base_address_bdw.inl + ${CMAKE_CURRENT_SOURCE_DIR}/state_base_address_skl_plus.inl 
${CMAKE_CURRENT_SOURCE_DIR}/state_base_address_bdw_plus.inl ${CMAKE_CURRENT_SOURCE_DIR}/state_compute_mode_helper.h ${CMAKE_CURRENT_SOURCE_DIR}/stdio.h diff --git a/shared/source/helpers/state_base_address_bdw.inl b/shared/source/helpers/state_base_address_bdw.inl new file mode 100644 index 0000000000..62f7abc0f1 --- /dev/null +++ b/shared/source/helpers/state_base_address_bdw.inl @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/state_base_address.h" + +namespace NEO { + +template +void StateBaseAddressHelper::appendStateBaseAddressParameters( + STATE_BASE_ADDRESS *stateBaseAddress, + const IndirectHeap *ssh, + bool setGeneralStateBaseAddress, + uint64_t internalHeapBase, + GmmHelper *gmmHelper, + bool isMultiOsContextCapable) { +} + +} // namespace NEO diff --git a/shared/source/helpers/state_base_address_bdw_plus.inl b/shared/source/helpers/state_base_address_bdw_plus.inl index cd028d7af8..48b1b0b69b 100644 --- a/shared/source/helpers/state_base_address_bdw_plus.inl +++ b/shared/source/helpers/state_base_address_bdw_plus.inl @@ -9,16 +9,6 @@ namespace NEO { -template -void StateBaseAddressHelper::appendStateBaseAddressParameters( - STATE_BASE_ADDRESS *stateBaseAddress, - const IndirectHeap *ssh, - bool setGeneralStateBaseAddress, - uint64_t internalHeapBase, - GmmHelper *gmmHelper, - bool isMultiOsContextCapable) { -} - template void StateBaseAddressHelper::programBindingTableBaseAddress(LinearStream &commandStream, const IndirectHeap &ssh, GmmHelper *gmmHelper) { } diff --git a/shared/source/helpers/state_base_address_skl_plus.inl b/shared/source/helpers/state_base_address_skl_plus.inl new file mode 100644 index 0000000000..b48114c1fe --- /dev/null +++ b/shared/source/helpers/state_base_address_skl_plus.inl @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/state_base_address.h" 
+
+namespace NEO {
+// Programs the bindless surface state base address and size in the SBA command from the SSH.
+template <typename GfxFamily>
+void StateBaseAddressHelper<GfxFamily>::appendStateBaseAddressParameters(
+    STATE_BASE_ADDRESS *stateBaseAddress,
+    const IndirectHeap *ssh,
+    bool setGeneralStateBaseAddress,
+    uint64_t internalHeapBase,
+    GmmHelper *gmmHelper,
+    bool isMultiOsContextCapable) {
+    // Applies only when a bindless debug flag is set; a null ssh is skipped rather than dereferenced.
+    if (ssh != nullptr && (DebugManager.flags.UseBindlessBuffers.get() != 0 || DebugManager.flags.UseBindlessImages.get() != 0)) {
+        stateBaseAddress->setBindlessSurfaceStateBaseAddressModifyEnable(true);
+        stateBaseAddress->setBindlessSurfaceStateBaseAddress(ssh->getHeapGpuBase());
+        uint32_t size = uint32_t(ssh->getMaxAvailableSpace() / 64) - 1; // (max available space / 64) - 1
+        stateBaseAddress->setBindlessSurfaceStateSize(size);
+    }
+}
+
+} // namespace NEO
diff --git a/shared/test/unit_test/gen8/CMakeLists.txt b/shared/test/unit_test/gen8/CMakeLists.txt
index f57455eb1f..7c08fd18e2 100644
--- a/shared/test/unit_test/gen8/CMakeLists.txt
+++ b/shared/test/unit_test/gen8/CMakeLists.txt
@@ -8,6 +8,7 @@ set(NEO_CORE_TESTS_GEN8
   ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
   ${CMAKE_CURRENT_SOURCE_DIR}/image_surface_state_tests_gen8.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/simd_helper_tests_gen8.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/state_base_address_tests_gen8.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_preamble_gen8.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_preemption_gen8.cpp
 )
diff --git a/shared/test/unit_test/gen8/state_base_address_tests_gen8.cpp b/shared/test/unit_test/gen8/state_base_address_tests_gen8.cpp
new file mode 100644
index 0000000000..3c60e68b27
--- /dev/null
+++ b/shared/test/unit_test/gen8/state_base_address_tests_gen8.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/test/unit_test/helpers/state_base_address_tests.h"
+
+BDWTEST_F(SBATest, givenUsedBindlessBuffersWhenAppendStateBaseAddressParametersIsCalledThenSBACmdHasNotBindingSurfaceStateProgrammed) {
+    using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;
+
+ 
DebugManager.flags.UseBindlessBuffers.set(1); + + STATE_BASE_ADDRESS stateBaseAddress = {}; + STATE_BASE_ADDRESS stateBaseAddressReference = {}; + + StateBaseAddressHelper::appendStateBaseAddressParameters( + &stateBaseAddress, + &ssh, + false, + 0, + nullptr, + false); + + EXPECT_EQ(0u, ssh.getUsed()); + EXPECT_EQ(0, memcmp(&stateBaseAddressReference, &stateBaseAddress, sizeof(STATE_BASE_ADDRESS))); +} diff --git a/shared/test/unit_test/helpers/CMakeLists.txt b/shared/test/unit_test/helpers/CMakeLists.txt index 4b40170cf7..ee1be17ee5 100644 --- a/shared/test/unit_test/helpers/CMakeLists.txt +++ b/shared/test/unit_test/helpers/CMakeLists.txt @@ -16,6 +16,8 @@ set(NEO_CORE_HELPERS_TESTS ${CMAKE_CURRENT_SOURCE_DIR}/memory_leak_listener.h ${CMAKE_CURRENT_SOURCE_DIR}/memory_management.h ${CMAKE_CURRENT_SOURCE_DIR}/simd_helper_tests.inl + ${CMAKE_CURRENT_SOURCE_DIR}/state_base_address_tests.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/state_base_address_tests.h ${CMAKE_CURRENT_SOURCE_DIR}/string_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/string_to_hash_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ult_hw_config.h diff --git a/shared/test/unit_test/helpers/state_base_address_tests.cpp b/shared/test/unit_test/helpers/state_base_address_tests.cpp new file mode 100644 index 0000000000..12ed942f14 --- /dev/null +++ b/shared/test/unit_test/helpers/state_base_address_tests.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/test/unit_test/helpers/state_base_address_tests.h" + +using IsBetweenSklAndTgllp = IsWithinProducts; + +HWTEST2_F(SBATest, givenUsedBindlessBuffersWhenAppendStateBaseAddressParametersIsCalledThenSBACmdHasBindingSurfaceStateProgrammed, IsBetweenSklAndTgllp) { + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + + EXPECT_NE(IGFX_BROADWELL, ::productFamily); + + DebugManager.flags.UseBindlessBuffers.set(1); + + STATE_BASE_ADDRESS stateBaseAddress; + 
stateBaseAddress.setBindlessSurfaceStateSize(0); + stateBaseAddress.setBindlessSurfaceStateBaseAddress(0); + stateBaseAddress.setBindlessSurfaceStateBaseAddressModifyEnable(false); + + StateBaseAddressHelper::appendStateBaseAddressParameters( + &stateBaseAddress, + &ssh, + false, + 0, + nullptr, + false); + + EXPECT_EQ(ssh.getMaxAvailableSpace() / 64 - 1, stateBaseAddress.getBindlessSurfaceStateSize()); + EXPECT_EQ(ssh.getHeapGpuBase(), stateBaseAddress.getBindlessSurfaceStateBaseAddress()); + EXPECT_TRUE(stateBaseAddress.getBindlessSurfaceStateBaseAddressModifyEnable()); +} + +HWTEST2_F(SBATest, givenUsedBindlessImagesWhenAppendStateBaseAddressParametersIsCalledThenSBACmdHasBindingSurfaceStateProgrammed, IsBetweenSklAndTgllp) { + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + + EXPECT_NE(IGFX_BROADWELL, ::productFamily); + + DebugManager.flags.UseBindlessImages.set(1); + + STATE_BASE_ADDRESS stateBaseAddress; + stateBaseAddress.setBindlessSurfaceStateSize(0); + stateBaseAddress.setBindlessSurfaceStateBaseAddress(0); + stateBaseAddress.setBindlessSurfaceStateBaseAddressModifyEnable(false); + + StateBaseAddressHelper::appendStateBaseAddressParameters( + &stateBaseAddress, + &ssh, + false, + 0, + nullptr, + false); + + EXPECT_EQ(ssh.getMaxAvailableSpace() / 64 - 1, stateBaseAddress.getBindlessSurfaceStateSize()); + EXPECT_EQ(ssh.getHeapGpuBase(), stateBaseAddress.getBindlessSurfaceStateBaseAddress()); + EXPECT_TRUE(stateBaseAddress.getBindlessSurfaceStateBaseAddressModifyEnable()); +} diff --git a/shared/test/unit_test/helpers/state_base_address_tests.h b/shared/test/unit_test/helpers/state_base_address_tests.h new file mode 100644 index 0000000000..ba859eefa5 --- /dev/null +++ b/shared/test/unit_test/helpers/state_base_address_tests.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_stream/linear_stream.h" +#include 
"shared/source/helpers/state_base_address.h" +#include "shared/source/indirect_heap/indirect_heap.h" +#include "shared/test/unit_test/helpers/debug_manager_state_restore.h" + +#include "opencl/test/unit_test/mocks/mock_graphics_allocation.h" +#include "test.h" + +struct SBATest : ::testing::Test { + void SetUp() override { + size_t sizeStream = 512; + size_t alignmentStream = 0x1000; + sshBuffer = alignedMalloc(sizeStream, alignmentStream); + + ASSERT_NE(nullptr, sshBuffer); + + ssh.replaceBuffer(sshBuffer, sizeStream); + auto graphicsAllocation = new MockGraphicsAllocation(sshBuffer, sizeStream); + ssh.replaceGraphicsAllocation(graphicsAllocation); + } + + void TearDown() override { + delete ssh.getGraphicsAllocation(); + alignedFree(sshBuffer); + } + IndirectHeap ssh = {nullptr}; + void *sshBuffer = nullptr; + DebugManagerStateRestore restorer; +};