Support for bindless mode in L0 - improvements

Related-To: NEO-6448

- add new IGC compilation flag when bindless mode used
- fix SBA programming of BindlessSurfaceStateSize -
always set maximum surface state count
- fix residency of global DSH heap on gen9 - gen12lp
in bindless mode
- add L0 aub test with bindless kernel - disabled
- partial fixes in OCL aub tests


Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
Mateusz Hoppe
2021-11-30 22:59:19 +00:00
committed by Compute-Runtime-Automation
parent 55959d4d1d
commit 8b233f7f45
21 changed files with 352 additions and 19 deletions

View File

@@ -0,0 +1,10 @@
#
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
target_sources(ze_intel_gpu_aub_tests PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/bindless_kernel_aub_tests.cpp
)

View File

@@ -0,0 +1,115 @@
/*
* Copyright (C) 2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/file_io.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/test_files.h"
#include "test.h"
#include "level_zero/core/source/module/module_imp.h"
#include "level_zero/core/test/aub_tests/fixtures/aub_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_driver_handle.h"
namespace L0 {
namespace ult {
struct L0BindlessAub : Test<AUBFixtureL0> {
void SetUp() {
DebugManager.flags.UseBindlessMode.set(1);
DebugManager.flags.UseExternalAllocatorForSshAndDsh.set(1);
AUBFixtureL0::SetUp();
}
void TearDown() {
module->destroy();
AUBFixtureL0::TearDown();
}
void createModuleFromFile(const std::string &fileName, ze_context_handle_t context, L0::Device *device) {
std::string testFile;
retrieveBinaryKernelFilenameNoRevision(testFile, fileName + "_", ".bin");
size_t size = 0;
auto src = loadDataFromFile(
testFile.c_str(),
size);
ASSERT_NE(0u, size);
ASSERT_NE(nullptr, src);
ze_module_desc_t moduleDesc = {ZE_STRUCTURE_TYPE_MODULE_DESC};
moduleDesc.format = ZE_MODULE_FORMAT_NATIVE;
moduleDesc.pInputModule = reinterpret_cast<const uint8_t *>(src.get());
moduleDesc.inputSize = size;
moduleDesc.pBuildFlags = "";
module = new ModuleImp(device, nullptr, ModuleType::User);
bool success = module->initialize(&moduleDesc, device->getNEODevice());
ASSERT_TRUE(success);
}
DebugManagerStateRestore restorer;
ModuleImp *module = nullptr;
};
HWTEST_F(L0BindlessAub, DISABLED_GivenBindlessKernelWhenExecutedThenOutputIsCorrect) {
constexpr size_t bufferSize = MemoryConstants::pageSize;
const uint32_t groupSize[] = {32, 1, 1};
const uint32_t groupCount[] = {bufferSize / 32, 1, 1};
NEO::SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::HOST_UNIFIED_MEMORY,
context->rootDeviceIndices,
context->deviceBitfields);
auto bufferSrc = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
memset(bufferSrc, 55, bufferSize);
auto bufferDst = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
memset(bufferDst, 0, bufferSize);
auto simulatedCsr = AUBFixtureL0::getSimulatedCsr<FamilyType>();
simulatedCsr->initializeEngine();
simulatedCsr->writeMemory(*driverHandle->svmAllocsManager->getSVMAlloc(bufferSrc)->gpuAllocations.getDefaultGraphicsAllocation());
simulatedCsr->writeMemory(*driverHandle->svmAllocsManager->getSVMAlloc(bufferDst)->gpuAllocations.getDefaultGraphicsAllocation());
ze_group_count_t dispatchTraits;
dispatchTraits.groupCountX = groupCount[0];
dispatchTraits.groupCountY = groupCount[1];
dispatchTraits.groupCountZ = groupCount[2];
createModuleFromFile("bindless_stateful_copy_buffer", context, device);
ze_kernel_handle_t kernel;
ze_kernel_desc_t kernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC};
kernelDesc.pKernelName = "StatefulCopyBuffer";
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelCreate(module->toHandle(), &kernelDesc, &kernel));
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelSetArgumentValue(kernel, 0, sizeof(void *), &bufferSrc));
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelSetArgumentValue(kernel, 1, sizeof(void *), &bufferDst));
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelSetGroupSize(kernel, groupSize[0], groupSize[1], groupSize[2]));
ze_command_list_handle_t cmdListHandle = commandList->toHandle();
EXPECT_EQ(ZE_RESULT_SUCCESS, zeCommandListAppendLaunchKernel(cmdListHandle, kernel, &dispatchTraits, nullptr, 0, nullptr));
commandList->close();
pCmdq->executeCommandLists(1, &cmdListHandle, nullptr, false);
pCmdq->synchronize(std::numeric_limits<uint32_t>::max());
expectMemory<FamilyType>(reinterpret_cast<void *>(driverHandle->svmAllocsManager->getSVMAlloc(bufferDst)->gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()),
bufferSrc, bufferSize);
EXPECT_EQ(ZE_RESULT_SUCCESS, zeKernelDestroy(kernel));
driverHandle->svmAllocsManager->freeSVMAlloc(bufferSrc);
driverHandle->svmAllocsManager->freeSVMAlloc(bufferDst);
}
} // namespace ult
} // namespace L0

View File

@@ -5,6 +5,12 @@
*
*/
#include "shared/source/command_stream/command_stream_receiver_simulated_common_hw.h"
#include "shared/source/command_stream/command_stream_receiver_with_aub_dump.h"
#include "shared/source/command_stream/tbx_command_stream_receiver_hw.h"
#include "test_mode.h"
#include <cstdint>
#include <memory>
#include <string>
@@ -39,6 +45,43 @@ class AUBFixtureL0 {
void TearDown();
static void prepareCopyEngines(NEO::MockDevice &device, const std::string &filename);
template <typename FamilyType>
NEO::CommandStreamReceiverSimulatedCommonHw<FamilyType> *getSimulatedCsr() const {
return static_cast<NEO::CommandStreamReceiverSimulatedCommonHw<FamilyType> *>(csr);
}
template <typename FamilyType>
void expectMemory(void *gfxAddress, const void *srcAddress, size_t length) {
NEO::CommandStreamReceiverSimulatedCommonHw<FamilyType> *csrSimulated = getSimulatedCsr<FamilyType>();
if (NEO::testMode == NEO::TestMode::AubTestsWithTbx) {
auto tbxCsr = csrSimulated;
EXPECT_TRUE(tbxCsr->expectMemoryEqual(gfxAddress, srcAddress, length));
csrSimulated = static_cast<NEO::CommandStreamReceiverSimulatedCommonHw<FamilyType> *>(
static_cast<NEO::CommandStreamReceiverWithAUBDump<NEO::TbxCommandStreamReceiverHw<FamilyType>> *>(csr)->aubCSR.get());
}
if (csrSimulated) {
csrSimulated->expectMemoryEqual(gfxAddress, srcAddress, length);
}
}
template <typename FamilyType>
void expectNotEqualMemory(void *gfxAddress, const void *srcAddress, size_t length) {
NEO::CommandStreamReceiverSimulatedCommonHw<FamilyType> *csrSimulated = getSimulatedCsr<FamilyType>();
if (NEO::testMode == NEO::TestMode::AubTestsWithTbx) {
auto tbxCsr = csrSimulated;
EXPECT_TRUE(tbxCsr->expectMemoryNotEqual(gfxAddress, srcAddress, length));
csrSimulated = static_cast<NEO::CommandStreamReceiverSimulatedCommonHw<FamilyType> *>(
static_cast<NEO::CommandStreamReceiverWithAUBDump<NEO::TbxCommandStreamReceiverHw<FamilyType>> *>(csr)->aubCSR.get());
}
if (csrSimulated) {
csrSimulated->expectMemoryNotEqual(gfxAddress, srcAddress, length);
}
}
const uint32_t rootDeviceIndex = 0;
NEO::ExecutionEnvironment *executionEnvironment;
NEO::MemoryManager *memoryManager = nullptr;

View File

@@ -13,6 +13,14 @@ set(TEST_MODULES
${CMAKE_CURRENT_SOURCE_DIR}/test_modules/test_kernel.cl
)
set(TEST_KERNEL_BINDLESS_internal_options
"-cl-intel-use-bindless-mode -cl-intel-use-bindless-advanced-mode"
)
set(TEST_KERNEL_BINDLESS
${CMAKE_CURRENT_SOURCE_DIR}/test_modules/stateful_copy_buffer.cl
)
set(l0_test_kernels_outputs)
macro(macro_for_each_core_type)
foreach(PLATFORM_TYPE ${PLATFORM_TYPES})
@@ -20,12 +28,18 @@ macro(macro_for_each_core_type)
get_family_name_with_type(${CORE_TYPE} ${PLATFORM_TYPE})
string(TOLOWER ${PLATFORM_TYPE} PLATFORM_TYPE_LOWER)
set(PLATFORM_LOWER ${DEFAULT_SUPPORTED_${CORE_TYPE}_${PLATFORM_TYPE}_PLATFORM})
string(TOLOWER ${CORE_TYPE} CORE_TYPE_LOWER)
level_zero_generate_kernels(l0_test_kernel_outputs ${PLATFORM_LOWER} ${family_name_with_type} "-g" ${TEST_MODULES})
#skip Gen8 bindless kernel generation
if(NOT ("${CORE_TYPE_LOWER}" STREQUAL "gen8"))
level_zero_generate_kernels_with_internal_options(l0_bindless_test_kernel_outputs ${PLATFORM_LOWER} ${family_name_with_type} "bindless" "-g" ${TEST_KERNEL_BINDLESS_internal_options} ${TEST_KERNEL_BINDLESS})
endif()
endif()
endforeach()
endmacro()
apply_macro_for_each_core_type("TESTED")
add_custom_target(l0_common_test_kernels DEPENDS ${l0_test_kernel_outputs} copy_compiler_files)
add_custom_target(l0_common_test_kernels DEPENDS ${l0_test_kernel_outputs} ${l0_bindless_test_kernel_outputs} copy_compiler_files)
set_target_properties(l0_common_test_kernels PROPERTIES FOLDER ${TARGET_NAME_L0})
add_dependencies(prepare_test_kernels_for_l0 l0_common_test_kernels)

View File

@@ -49,3 +49,54 @@ function(level_zero_generate_kernels target_list platform_name suffix options)
set(${target_list} ${${target_list}} PARENT_SCOPE)
endfunction()
function(level_zero_generate_kernels_with_internal_options target_list platform_name suffix prefix options internal_options)
list(APPEND results copy_compiler_files)
set(outputdir "${TargetDir}/level_zero/${suffix}/test_files/${NEO_ARCH}/")
foreach(filepath ${ARGN})
get_filename_component(filename ${filepath} NAME)
get_filename_component(basename ${filepath} NAME_WE)
get_filename_component(workdir ${filepath} DIRECTORY)
set(outputpath_base "${outputdir}${prefix}_${basename}_${suffix}")
if(NOT NEO_DISABLE_BUILTINS_COMPILATION)
set(output_files
${outputpath_base}.bin
${outputpath_base}.gen
${outputpath_base}.spv
${outputpath_base}.dbg
)
set(output_name "-output" "${prefix}_${basename}")
string(CONCAT options \" ${options} \" )
string(CONCAT internal_options \" ${internal_options} \" )
add_custom_command(
COMMAND echo generate ${ocloc_cmd_prefix} -q -file ${filename} -device ${platform_name} -out_dir ${outputdir} ${output_name} -options ${options} -internal_options ${internal_options} , workdir is ${workdir}
OUTPUT ${output_files}
COMMAND ${ocloc_cmd_prefix} -q -file ${filename} -device ${platform_name} -out_dir ${outputdir} ${output_name} -options ${options} -internal_options ${internal_options}
WORKING_DIRECTORY ${workdir}
DEPENDS ${filepath} ocloc
)
list(APPEND ${target_list} ${output_files})
else()
foreach(_file_name "bin" "gen" "spv" "dbg")
set(_file_prebuilt "${NEO_SOURCE_DIR}/../neo_test_kernels/level_zero/${suffix}/test_files/${NEO_ARCH}/${prefix}_${basename}_${suffix}.${_file_name}")
add_custom_command(
OUTPUT ${outputpath_base}.${_file_name}
COMMAND ${CMAKE_COMMAND} -E make_directory ${outputdir}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${_file_prebuilt} ${outputdir}
)
list(APPEND ${target_list} ${outputpath_base}.${_file_name})
endforeach()
endif()
endforeach()
set(${target_list} ${${target_list}} PARENT_SCOPE)
endfunction()

View File

@@ -0,0 +1,14 @@
/*
* Copyright (C) 2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
__kernel void StatefulCopyBuffer(
const __global uchar* src,
__global uchar* dst)
{
uint id = get_global_id(0);
dst[id] = src[id];
}

View File

@@ -492,7 +492,7 @@ HWTEST2_F(CommandQueueProgramSBATest,
using BindlessCommandQueueSBASupport = IsAtLeastProduct<IGFX_SKYLAKE>;
HWTEST2_F(CommandQueueProgramSBATest,
givenBindlessModeEnabledWhenProgrammingStateBaseAddressThenBindlessBaseAddressIsPassed, BindlessCommandQueueSBASupport) {
givenBindlessModeEnabledWhenProgrammingStateBaseAddressThenBindlessBaseAddressAndSizeAreSet, BindlessCommandQueueSBASupport) {
using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;
DebugManagerStateRestore dbgRestorer;
DebugManager.flags.UseBindlessMode.set(1);
@@ -525,7 +525,9 @@ HWTEST2_F(CommandQueueProgramSBATest,
auto cmdSba = genCmdCast<STATE_BASE_ADDRESS *>(*itor);
EXPECT_EQ(cmdSba->getBindlessSurfaceStateBaseAddressModifyEnable(), true);
EXPECT_EQ(cmdSba->getBindlessSurfaceStateBaseAddress(), neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()]->getBindlessHeapsHelper()->getGlobalHeapsBase());
EXPECT_EQ(cmdSba->getBindlessSurfaceStateSize(), MemoryConstants::sizeOf4GBinPageEntities);
auto surfaceStateCount = StateBaseAddressHelper<FamilyType>::getMaxBindlessSurfaceStates();
EXPECT_EQ(surfaceStateCount, cmdSba->getBindlessSurfaceStateSize());
commandQueue->destroy();
}