mirror of
https://github.com/intel/compute-runtime.git
synced 2025-11-15 10:14:56 +08:00
Add Non-Uniform AUB tests
Change-Id: Ie1944caa2ea9b7240dde9460bd817f8889fff3bb
This commit is contained in:
committed by
sys_ocldev
parent
02e1284aa8
commit
620708e510
@@ -183,33 +183,35 @@ function(neo_gen_kernels target platform_name suffix)
|
||||
set_target_properties(${target} PROPERTIES FOLDER "kernels/${suffix}")
|
||||
endfunction()
|
||||
|
||||
function(neo_gen_kernel_with_options target platform_name suffix filepath)
|
||||
get_filename_component(filename ${filepath} NAME)
|
||||
get_filename_component(basename ${filepath} NAME_WE)
|
||||
get_filename_component(base_workdir ${filepath} DIRECTORY)
|
||||
|
||||
set(outputdir "${TargetDir}/${suffix}/test_files/${NEO_ARCH}/")
|
||||
set(workdir "${CMAKE_CURRENT_SOURCE_DIR}/${base_workdir}/")
|
||||
|
||||
function(neo_gen_kernels_with_options target platform_name suffix filepath)
|
||||
set(results)
|
||||
foreach(arg ${ARGN})
|
||||
string(REPLACE " " "_" argwospaces ${arg})
|
||||
foreach(filearg ${filepath})
|
||||
get_filename_component(filename ${filearg} NAME)
|
||||
get_filename_component(basename ${filearg} NAME_WE)
|
||||
get_filename_component(base_workdir ${filearg} DIRECTORY)
|
||||
|
||||
set(outputpath_base "${outputdir}/${basename}_${suffix}")
|
||||
set(output_files
|
||||
${outputpath_base}${ir_extension}${argwospaces}
|
||||
${outputpath_base}.bin${argwospaces}
|
||||
${outputpath_base}.gen${argwospaces}
|
||||
)
|
||||
set(outputdir "${TargetDir}/${suffix}/test_files/${NEO_ARCH}/")
|
||||
set(workdir "${CMAKE_CURRENT_SOURCE_DIR}/${base_workdir}/")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${output_files}
|
||||
COMMAND ${cloc_cmd_prefix} -q -file ${filename} -device ${platform_name} -${NEO_BITS} -out_dir ${outputdir} -options ${arg} -options_name
|
||||
WORKING_DIRECTORY ${workdir}
|
||||
DEPENDS ${filepath} cloc
|
||||
)
|
||||
foreach(arg ${ARGN})
|
||||
string(REPLACE " " "_" argwospaces ${arg})
|
||||
|
||||
list(APPEND results ${output_files})
|
||||
set(outputpath_base "${outputdir}/${basename}_${suffix}")
|
||||
set(output_files
|
||||
${outputpath_base}${ir_extension}${argwospaces}
|
||||
${outputpath_base}.bin${argwospaces}
|
||||
${outputpath_base}.gen${argwospaces}
|
||||
)
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${output_files}
|
||||
COMMAND ${cloc_cmd_prefix} -q -file ${filename} -device ${platform_name} -${NEO_BITS} -out_dir ${outputdir} -options ${arg} -options_name
|
||||
WORKING_DIRECTORY ${workdir}
|
||||
DEPENDS ${filearg} cloc
|
||||
)
|
||||
|
||||
list(APPEND results ${output_files})
|
||||
endforeach()
|
||||
endforeach()
|
||||
add_custom_target(${target} DEPENDS ${results} copy_compiler_files)
|
||||
set_target_properties(${target} PROPERTIES FOLDER "kernels/${suffix}")
|
||||
@@ -293,6 +295,7 @@ set(TEST_KERNEL_2_0_options
|
||||
|
||||
set(TEST_KERNEL_2_0
|
||||
test_files/simple_block_kernel.cl
|
||||
test_files/simple_nonuniform.cl
|
||||
)
|
||||
|
||||
set(TEST_KERNEL_SIP_DEBUG_options
|
||||
@@ -306,6 +309,7 @@ set(TEST_KERNEL_SIP_DEBUG_LOCAL_options
|
||||
file(GLOB_RECURSE TEST_KERNELS test_files/*.cl)
|
||||
list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/test_files/shouldfail.cl")
|
||||
list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/test_files/simple_block_kernel.cl")
|
||||
list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/test_files/simple_nonuniform.cl")
|
||||
|
||||
function(gen_run_tests_with_appverifier product slices subslices eu_per_ss)
|
||||
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" AND "${IGDRCL_OPTION__BITS}" STREQUAL "64" AND APPVERIFIER_ALLOWED)
|
||||
@@ -373,7 +377,7 @@ macro(macro_for_each_gen)
|
||||
|
||||
if(MSVC OR CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
neo_gen_kernels(test_kernels_${family_name_with_type} ${PLATFORM_LOWER} ${family_name_with_type} ${TEST_KERNELS})
|
||||
neo_gen_kernel_with_options(test_kernel_${family_name_with_type} ${PLATFORM_LOWER} ${family_name_with_type} ${TEST_KERNEL} ${TEST_KERNEL_options})
|
||||
neo_gen_kernels_with_options(test_kernel_${family_name_with_type} ${PLATFORM_LOWER} ${family_name_with_type} ${TEST_KERNEL} ${TEST_KERNEL_options})
|
||||
|
||||
# Temporarily disabled debug kernel generation on gen8
|
||||
if(NOT ("${GEN_TYPE_LOWER}" STREQUAL "gen8"))
|
||||
@@ -399,7 +403,7 @@ macro(macro_for_each_gen)
|
||||
# add_dependencies(unit_tests test_kernel_sip_debug_${family_name_with_type})
|
||||
|
||||
if(PLATFORM_2_0_LOWER)
|
||||
neo_gen_kernel_with_options(test_kernel_2_0_${family_name_with_type} ${PLATFORM_2_0_LOWER} ${family_name_with_type} ${TEST_KERNEL_2_0} ${TEST_KERNEL_2_0_options})
|
||||
neo_gen_kernels_with_options(test_kernel_2_0_${family_name_with_type} ${PLATFORM_2_0_LOWER} ${family_name_with_type} "${TEST_KERNEL_2_0}" ${TEST_KERNEL_2_0_options})
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "runtime/command_queue/command_queue.h"
|
||||
#include "runtime/helpers/ptr_math.h"
|
||||
#include "gen_cmd_parse.h"
|
||||
#include "unit_tests/aub_tests/fixtures/aub_fixture.h"
|
||||
#include "unit_tests/aub_tests/fixtures/hello_world_fixture.h"
|
||||
#include "unit_tests/fixtures/hello_world_fixture.h"
|
||||
#include "unit_tests/fixtures/simple_arg_fixture.h"
|
||||
@@ -17,6 +18,8 @@
|
||||
|
||||
using namespace OCLRT;
|
||||
|
||||
extern const HardwareInfo **platformDevices;
|
||||
|
||||
struct TestParam {
|
||||
cl_uint globalWorkSizeX;
|
||||
cl_uint globalWorkSizeY;
|
||||
@@ -350,3 +353,485 @@ INSTANTIATE_TEST_CASE_P(
|
||||
::testing::ValuesIn(TestSimdTable),
|
||||
::testing::ValuesIn(TestParamTable)));
|
||||
} // namespace ULT
|
||||
|
||||
struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture<SimpleArgNonUniformKernelFixture> {
|
||||
void SetUp() override {
|
||||
deviceClVersionSupport = OCLRT::platformDevices[0]->capabilityTable.clVersionSupport;
|
||||
if (deviceClVersionSupport < 20) {
|
||||
return;
|
||||
}
|
||||
KernelAUBFixture<SimpleArgNonUniformKernelFixture>::SetUp();
|
||||
|
||||
argVal = static_cast<int>(0x22222222);
|
||||
|
||||
sizeWrittenMemory = 0;
|
||||
typeSize = sizeof(int);
|
||||
typeItems = 40 * 40 * 40;
|
||||
sizeUserMemory = alignUp(typeItems * typeSize, 64);
|
||||
|
||||
destMemory = alignedMalloc(sizeUserMemory, 4096);
|
||||
ASSERT_NE(nullptr, destMemory);
|
||||
for (uint32_t i = 0; i < typeItems; i++) {
|
||||
*(static_cast<int *>(destMemory) + i) = 0xdeadbeef;
|
||||
}
|
||||
|
||||
expectedMemory = alignedMalloc(sizeUserMemory, 4096);
|
||||
ASSERT_NE(nullptr, expectedMemory);
|
||||
|
||||
memset(expectedMemory, 0x0, sizeUserMemory);
|
||||
|
||||
kernel->setArgSvm(1, sizeUserMemory, destMemory);
|
||||
|
||||
outBuffer = csr->createAllocationAndHandleResidency(destMemory, sizeUserMemory);
|
||||
ASSERT_NE(nullptr, outBuffer);
|
||||
outBuffer->setAllocationType(GraphicsAllocation::AllocationType::BUFFER);
|
||||
outBuffer->setMemObjectsAllocationWithWritableFlags(true);
|
||||
}
|
||||
|
||||
void initializeExpectedMemory(size_t globalX, size_t globalY, size_t globalZ) {
|
||||
uint32_t id = 0;
|
||||
size_t testGlobalMax = globalX * globalY * globalZ;
|
||||
ASSERT_GT(typeItems, testGlobalMax);
|
||||
int maxId = static_cast<int>(testGlobalMax);
|
||||
|
||||
argVal = maxId;
|
||||
kernel->setArg(0, sizeof(int), &argVal);
|
||||
|
||||
int *expectedData = static_cast<int *>(expectedMemory);
|
||||
for (size_t z = 0; z < globalZ; z++) {
|
||||
for (size_t y = 0; y < globalY; y++) {
|
||||
for (size_t x = 0; x < globalX; x++) {
|
||||
*(expectedData + id) = id;
|
||||
++id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*(static_cast<int *>(destMemory) + maxId) = 0;
|
||||
*(expectedData + maxId) = maxId;
|
||||
|
||||
sizeWrittenMemory = maxId * typeSize;
|
||||
//add single int size for atomic sum of all work-items
|
||||
sizeWrittenMemory += typeSize;
|
||||
|
||||
sizeRemainderMemory = sizeUserMemory - sizeWrittenMemory;
|
||||
expectedRemainderMemory = alignedMalloc(sizeRemainderMemory, 4096);
|
||||
ASSERT_NE(nullptr, expectedRemainderMemory);
|
||||
int *expectedReminderData = static_cast<int *>(expectedRemainderMemory);
|
||||
size_t reminderElements = sizeRemainderMemory / typeSize;
|
||||
for (size_t i = 0; i < reminderElements; i++) {
|
||||
*(expectedReminderData + i) = 0xdeadbeef;
|
||||
}
|
||||
remainderDestMemory = static_cast<char *>(destMemory) + sizeWrittenMemory;
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
if (deviceClVersionSupport < 20) {
|
||||
return;
|
||||
}
|
||||
if (destMemory) {
|
||||
alignedFree(destMemory);
|
||||
destMemory = nullptr;
|
||||
}
|
||||
if (expectedMemory) {
|
||||
alignedFree(expectedMemory);
|
||||
expectedMemory = nullptr;
|
||||
}
|
||||
if (expectedRemainderMemory) {
|
||||
alignedFree(expectedRemainderMemory);
|
||||
expectedRemainderMemory = nullptr;
|
||||
}
|
||||
KernelAUBFixture<SimpleArgNonUniformKernelFixture>::TearDown();
|
||||
}
|
||||
unsigned int deviceClVersionSupport;
|
||||
|
||||
size_t typeSize;
|
||||
size_t typeItems;
|
||||
size_t sizeWrittenMemory;
|
||||
size_t sizeUserMemory;
|
||||
size_t sizeRemainderMemory;
|
||||
int argVal;
|
||||
void *destMemory = nullptr;
|
||||
void *expectedMemory = nullptr;
|
||||
void *expectedRemainderMemory = nullptr;
|
||||
char *remainderDestMemory = nullptr;
|
||||
GraphicsAllocation *outBuffer;
|
||||
|
||||
HardwareParse hwParser;
|
||||
};
|
||||
|
||||
using AUBSimpleArgNonUniformTest = Test<AUBSimpleArgNonUniformFixture>;
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork1DimNonUniformGroupThenExpectTwoWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 1;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {39, 1, 1};
|
||||
size_t localWorkSize[3] = {32, 1, 1};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(2u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork2DimNonUniformGroupInXDimensionThenExpectTwoWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 2;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {39, 32, 1};
|
||||
size_t localWorkSize[3] = {16, 16, 1};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(2u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork2DimNonUniformGroupInYDimensionThenExpectTwoWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 2;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {32, 39, 1};
|
||||
size_t localWorkSize[3] = {16, 16, 1};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(2u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork2DimNonUniformGroupInXandYDimensionThenExpectFourWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 2;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {39, 39, 1};
|
||||
size_t localWorkSize[3] = {16, 16, 1};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(4u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXDimensionThenExpectTwoWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 3;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {39, 32, 32};
|
||||
size_t localWorkSize[3] = {8, 8, 2};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(2u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInYDimensionThenExpectTwoWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 3;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {32, 39, 32};
|
||||
size_t localWorkSize[3] = {8, 8, 2};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(2u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInZDimensionThenExpectTwoWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 3;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {32, 32, 39};
|
||||
size_t localWorkSize[3] = {8, 2, 8};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(2u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXandYDimensionThenExpectFourWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 3;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {39, 39, 32};
|
||||
size_t localWorkSize[3] = {8, 8, 2};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(4u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXandZDimensionThenExpectFourWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 3;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {39, 32, 39};
|
||||
size_t localWorkSize[3] = {8, 2, 8};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(4u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInYandZDimensionThenExpectFourWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 3;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {32, 39, 39};
|
||||
size_t localWorkSize[3] = {2, 8, 8};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(4u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(AUBSimpleArgNonUniformTest, DISABLED_givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXandYandZDimensionThenExpectEightWalkers) {
|
||||
using WALKER_TYPE = WALKER_TYPE<FamilyType>;
|
||||
if (deviceClVersionSupport >= 20) {
|
||||
cl_uint workDim = 3;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize[3] = {39, 39, 39};
|
||||
size_t localWorkSize[3] = {8, 8, 2};
|
||||
cl_uint numEventsInWaitList = 0;
|
||||
cl_event *eventWaitList = nullptr;
|
||||
cl_event *event = nullptr;
|
||||
|
||||
initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
|
||||
auto retVal = this->pCmdQ->enqueueKernel(
|
||||
this->kernel,
|
||||
workDim,
|
||||
globalWorkOffset,
|
||||
globalWorkSize,
|
||||
localWorkSize,
|
||||
numEventsInWaitList,
|
||||
eventWaitList,
|
||||
event);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*pCmdQ);
|
||||
uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
|
||||
EXPECT_EQ(8u, walkerCount);
|
||||
|
||||
pCmdQ->flush();
|
||||
expectMemory<FamilyType>(this->destMemory, this->expectedMemory, sizeWrittenMemory);
|
||||
expectMemory<FamilyType>(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,9 +45,16 @@ void setupAUB(const OCLRT::Device *pDevice, OCLRT::EngineType engineType) {
|
||||
aubFile.writeMMIO(mmioBase + 0x229c, 0xffff8280);
|
||||
|
||||
const size_t sizeHWSP = 0x1000;
|
||||
const size_t alignHWSP = 0x1000;
|
||||
auto pGlobalHWStatusPage = alignedMalloc(sizeHWSP, alignHWSP);
|
||||
const size_t sizeRing = 0x4 * 0x1000;
|
||||
|
||||
const size_t sizeTotal = alignUp((sizeHWSP + sizeRing + csTraits.sizeLRCA), 0x1000);
|
||||
const size_t alignTotal = sizeTotal;
|
||||
|
||||
auto totalBuffer = alignedMalloc(sizeTotal, alignTotal);
|
||||
size_t totalBufferOffset = 0;
|
||||
|
||||
auto pGlobalHWStatusPage = totalBuffer;
|
||||
totalBufferOffset += sizeHWSP;
|
||||
uint32_t ggttGlobalHardwareStatusPage = (uint32_t)((uintptr_t)pGlobalHWStatusPage);
|
||||
AubGTTData data = {true, false};
|
||||
AUB::reserveAddressGGTT(aubFile, ggttGlobalHardwareStatusPage, sizeHWSP, physAddress, data);
|
||||
@@ -55,10 +62,9 @@ void setupAUB(const OCLRT::Device *pDevice, OCLRT::EngineType engineType) {
|
||||
|
||||
aubFile.writeMMIO(mmioBase + 0x2080, ggttGlobalHardwareStatusPage);
|
||||
|
||||
const size_t sizeRing = 0x4 * 0x1000;
|
||||
const size_t alignRing = 0x1000;
|
||||
size_t sizeCommands = 0;
|
||||
auto pRing = alignedMalloc(sizeRing, alignRing);
|
||||
auto pRing = ptrOffset<void *>(totalBuffer, totalBufferOffset);
|
||||
totalBufferOffset += sizeRing;
|
||||
|
||||
auto ggttRing = (uint32_t)(uintptr_t)pRing;
|
||||
auto physRing = physAddress;
|
||||
@@ -84,7 +90,8 @@ void setupAUB(const OCLRT::Device *pDevice, OCLRT::EngineType engineType) {
|
||||
AUB::addMemoryWrite(aubFile, physRing, pRing, sizeCommands, AubMemDump::AddressSpaceValues::TraceNonlocal, csTraits.aubHintCommandBuffer);
|
||||
|
||||
auto sizeLRCA = csTraits.sizeLRCA;
|
||||
auto pLRCABase = alignedMalloc(csTraits.sizeLRCA, csTraits.alignLRCA);
|
||||
auto pLRCABase = ptrOffset<void *>(totalBuffer, totalBufferOffset);
|
||||
totalBufferOffset += csTraits.sizeLRCA;
|
||||
|
||||
csTraits.initialize(pLRCABase);
|
||||
csTraits.setRingHead(pLRCABase, 0x0000);
|
||||
@@ -117,9 +124,7 @@ void setupAUB(const OCLRT::Device *pDevice, OCLRT::EngineType engineType) {
|
||||
aubFile.writeMMIO(mmioBase + 0x2230, contextDescriptor.ulData[1]);
|
||||
aubFile.writeMMIO(mmioBase + 0x2230, contextDescriptor.ulData[0]);
|
||||
|
||||
alignedFree(pRing);
|
||||
alignedFree(pLRCABase);
|
||||
alignedFree(pGlobalHWStatusPage);
|
||||
alignedFree(totalBuffer);
|
||||
|
||||
aubFile.fileHandle.close();
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ class AUBFixture : public CommandQueueHwFixture {
|
||||
|
||||
CommandQueueHwFixture::SetUp(AUBFixture::device.get(), cl_command_queue_properties(0));
|
||||
}
|
||||
void TearDown() {
|
||||
void TearDown() override {
|
||||
CommandQueueHwFixture::TearDown();
|
||||
}
|
||||
|
||||
@@ -65,6 +65,22 @@ class AUBFixture : public CommandQueueHwFixture {
|
||||
return aubCsr;
|
||||
}
|
||||
|
||||
template <typename FamilyType>
|
||||
void expectMemory(void *gfxAddress, const void *srcAddress, size_t length) {
|
||||
auto aubCsr = getAubCsr<FamilyType>();
|
||||
PageWalker walker = [&](uint64_t physAddress, size_t size, size_t offset, uint64_t entryBits) {
|
||||
if (offset > length)
|
||||
abort();
|
||||
|
||||
aubCsr->stream->expectMemory(physAddress,
|
||||
reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(srcAddress) + offset),
|
||||
size,
|
||||
aubCsr->getAddressSpaceFromPTEBits(entryBits));
|
||||
};
|
||||
|
||||
aubCsr->ppgtt->pageWalk(reinterpret_cast<uintptr_t>(gfxAddress), length, 0, PageTableEntry::nonValidBits, walker, MemoryBanks::BankNotSpecified);
|
||||
}
|
||||
|
||||
static void *getGpuPointer(GraphicsAllocation *allocation) {
|
||||
return reinterpret_cast<void *>(allocation->getGpuAddress());
|
||||
}
|
||||
@@ -78,4 +94,18 @@ class AUBFixture : public CommandQueueHwFixture {
|
||||
private:
|
||||
using CommandQueueHwFixture::SetUp;
|
||||
};
|
||||
|
||||
template <typename KernelFixture>
|
||||
struct KernelAUBFixture : public AUBFixture,
|
||||
public KernelFixture {
|
||||
void SetUp() {
|
||||
AUBFixture::SetUp(nullptr);
|
||||
KernelFixture::SetUp(device.get(), context);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
KernelFixture::TearDown();
|
||||
AUBFixture::TearDown();
|
||||
}
|
||||
};
|
||||
} // namespace OCLRT
|
||||
|
||||
@@ -46,7 +46,7 @@ struct SimpleArgFixture : public FixtureFactory::IndirectHeapFixture,
|
||||
using CommandStreamFixture::pCS;
|
||||
using IndirectHeapFixture::SetUp;
|
||||
using KernelFixture::pKernel;
|
||||
using SimpleArgKernelFixture::SetUp;
|
||||
using KernelFixture::SetUp;
|
||||
|
||||
SimpleArgFixture()
|
||||
: pDestMemory(nullptr), sizeUserMemory(128 * sizeof(float)) {
|
||||
@@ -64,7 +64,7 @@ struct SimpleArgFixture : public FixtureFactory::IndirectHeapFixture,
|
||||
KernelFixture::SetUp(pDevice);
|
||||
ASSERT_NE(nullptr, pKernel);
|
||||
|
||||
int argVal = (int)0x22222222;
|
||||
argVal = static_cast<int>(0x22222222);
|
||||
pDestMemory = alignedMalloc(sizeUserMemory, 4096);
|
||||
ASSERT_NE(nullptr, pDestMemory);
|
||||
|
||||
@@ -79,12 +79,21 @@ struct SimpleArgFixture : public FixtureFactory::IndirectHeapFixture,
|
||||
pKernel->setArgSvm(1, sizeUserMemory, pDestMemory);
|
||||
|
||||
auto &commandStreamReceiver = pDevice->getCommandStreamReceiver();
|
||||
commandStreamReceiver.createAllocationAndHandleResidency(pDestMemory, sizeUserMemory);
|
||||
outBuffer = commandStreamReceiver.createAllocationAndHandleResidency(pDestMemory, sizeUserMemory);
|
||||
ASSERT_NE(nullptr, outBuffer);
|
||||
outBuffer->setAllocationType(GraphicsAllocation::AllocationType::BUFFER);
|
||||
outBuffer->setMemObjectsAllocationWithWritableFlags(true);
|
||||
}
|
||||
|
||||
virtual void TearDown() {
|
||||
alignedFree(pExpectedMemory);
|
||||
alignedFree(pDestMemory);
|
||||
if (pExpectedMemory) {
|
||||
alignedFree(pExpectedMemory);
|
||||
pExpectedMemory = nullptr;
|
||||
}
|
||||
if (pDestMemory) {
|
||||
alignedFree(pDestMemory);
|
||||
pDestMemory = nullptr;
|
||||
}
|
||||
|
||||
KernelFixture::TearDown();
|
||||
IndirectHeapFixture::TearDown();
|
||||
@@ -93,8 +102,10 @@ struct SimpleArgFixture : public FixtureFactory::IndirectHeapFixture,
|
||||
DeviceFixture::TearDown();
|
||||
}
|
||||
|
||||
int argVal;
|
||||
void *pDestMemory;
|
||||
void *pExpectedMemory;
|
||||
size_t sizeUserMemory;
|
||||
GraphicsAllocation *outBuffer;
|
||||
};
|
||||
} // namespace OCLRT
|
||||
|
||||
@@ -117,8 +117,10 @@ class SimpleArgKernelFixture : public ProgramFixture {
|
||||
}
|
||||
|
||||
virtual void TearDown() {
|
||||
delete pKernel;
|
||||
pKernel = nullptr;
|
||||
if (pKernel) {
|
||||
delete pKernel;
|
||||
pKernel = nullptr;
|
||||
}
|
||||
|
||||
pContext->release();
|
||||
|
||||
@@ -129,4 +131,56 @@ class SimpleArgKernelFixture : public ProgramFixture {
|
||||
Kernel *pKernel;
|
||||
MockContext *pContext;
|
||||
};
|
||||
|
||||
class SimpleArgNonUniformKernelFixture : public ProgramFixture {
|
||||
public:
|
||||
using ProgramFixture::SetUp;
|
||||
SimpleArgNonUniformKernelFixture()
|
||||
: retVal(CL_SUCCESS), kernel(nullptr) {
|
||||
}
|
||||
|
||||
protected:
|
||||
void SetUp(Device *device, Context *context) {
|
||||
ProgramFixture::SetUp();
|
||||
|
||||
cl_device_id deviceId = device;
|
||||
cl_context clContext = context;
|
||||
|
||||
CreateProgramFromBinary<Program>(
|
||||
clContext,
|
||||
&deviceId,
|
||||
"simple_nonuniform",
|
||||
"-cl-std=CL2.0");
|
||||
ASSERT_NE(nullptr, pProgram);
|
||||
|
||||
retVal = pProgram->build(
|
||||
1,
|
||||
&deviceId,
|
||||
"-cl-std=CL2.0",
|
||||
nullptr,
|
||||
nullptr,
|
||||
false);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
kernel = Kernel::create<MockKernel>(
|
||||
pProgram,
|
||||
*pProgram->getKernelInfo("simpleNonUniform"),
|
||||
&retVal);
|
||||
ASSERT_NE(nullptr, kernel);
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
}
|
||||
|
||||
virtual void TearDown() {
|
||||
if (kernel) {
|
||||
delete kernel;
|
||||
kernel = nullptr;
|
||||
}
|
||||
|
||||
ProgramFixture::TearDown();
|
||||
}
|
||||
|
||||
cl_int retVal;
|
||||
Kernel *kernel;
|
||||
};
|
||||
|
||||
} // namespace OCLRT
|
||||
|
||||
@@ -163,6 +163,22 @@ struct HardwareParse {
|
||||
return numCommands;
|
||||
}
|
||||
|
||||
template <typename CmdType>
|
||||
uint32_t getCommandCount() {
|
||||
GenCmdList::iterator cmdItor = cmdList.begin();
|
||||
uint32_t cmdCount = 0;
|
||||
|
||||
do {
|
||||
cmdItor = find<CmdType *>(cmdItor, cmdList.end());
|
||||
if (cmdItor != cmdList.end()) {
|
||||
++cmdCount;
|
||||
++cmdItor;
|
||||
}
|
||||
} while (cmdItor != cmdList.end());
|
||||
|
||||
return cmdCount;
|
||||
}
|
||||
|
||||
// The starting point of parsing commandBuffers. This is important
|
||||
// because as buffers get reused, we only want to parse the deltas.
|
||||
LinearStream *previousCS;
|
||||
|
||||
14
unit_tests/test_files/simple_nonuniform.cl
Normal file
14
unit_tests/test_files/simple_nonuniform.cl
Normal file
@@ -0,0 +1,14 @@
|
||||
/*
|
||||
* Copyright (C) 2018 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
__kernel void simpleNonUniform(int atomicOffset, __global volatile int *dst) {
|
||||
int id = (int)(get_global_id(2) * (get_global_size(1) * get_global_size(0)) + get_global_id(1) * get_global_size(0) + get_global_id(0));
|
||||
dst[id] = id;
|
||||
|
||||
__global volatile atomic_int *atomic_dst = ( __global volatile atomic_int * )dst;
|
||||
atomic_fetch_add_explicit( &atomic_dst[atomicOffset], 1 , memory_order_relaxed, memory_scope_all_svm_devices );
|
||||
}
|
||||
Reference in New Issue
Block a user