From 332720920a588fea56d51ba9eaf259fcdb2d6407 Mon Sep 17 00:00:00 2001 From: Fabian Zwolinski Date: Mon, 4 Oct 2021 10:51:12 +0200 Subject: [PATCH] Expose clGetKernelSuggestedLocalWorkSizeKHR func Related-to: NEO-2922 Signed-off-by: Fabian Zwolinski --- opencl/source/api/api.cpp | 16 ++ opencl/source/api/api.h | 8 + opencl/source/platform/extensions.cpp | 3 +- opencl/test/unit_test/api/CMakeLists.txt | 1 + .../test/unit_test/api/api_tests_wrapper3.cpp | 3 +- .../api/cl_get_device_info_tests.inl | 3 +- ...on_function_address_for_platform_tests.inl | 5 + ...l_get_extension_function_address_tests.inl | 5 + ...el_suggested_local_work_size_khr_tests.inl | 150 ++++++++++++++++++ 9 files changed, 191 insertions(+), 3 deletions(-) create mode 100644 opencl/test/unit_test/api/cl_get_kernel_suggested_local_work_size_khr_tests.inl diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index 810c49f0d8..cb3759130b 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -4327,6 +4327,20 @@ cl_program CL_API_CALL clCreateProgramWithILKHR(cl_context context, return program; } +cl_int CL_API_CALL clGetKernelSuggestedLocalWorkSizeKHR(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + size_t *suggested_local_work_size) { + return clGetKernelSuggestedLocalWorkSizeINTEL(command_queue, + kernel, + work_dim, + global_work_offset, + global_work_size, + suggested_local_work_size); +} + #define RETURN_FUNC_PTR_IF_EXIST(name) \ { \ if (!strcmp(funcName, #name)) { \ @@ -4392,6 +4406,8 @@ void *CL_API_CALL clGetExtensionFunctionAddress(const char *funcName) { RETURN_FUNC_PTR_IF_EXIST(clSetProgramSpecializationConstant); + RETURN_FUNC_PTR_IF_EXIST(clGetKernelSuggestedLocalWorkSizeKHR); + ret = getAdditionalExtensionFunctionAddress(funcName); TRACING_EXIT(clGetExtensionFunctionAddress, &ret); return ret; diff --git a/opencl/source/api/api.h b/opencl/source/api/api.h index 
ea30f6385a..6fa18f68ae 100644 --- a/opencl/source/api/api.h +++ b/opencl/source/api/api.h @@ -927,6 +927,14 @@ extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithILKHR( size_t length, cl_int *errcodeRet) CL_API_SUFFIX__VERSION_1_2; +extern CL_API_ENTRY cl_int CL_API_CALL clGetKernelSuggestedLocalWorkSizeKHR( + cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + size_t *suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0; + void *clHostMemAllocINTEL( cl_context context, const cl_mem_properties_intel *properties, diff --git a/opencl/source/platform/extensions.cpp b/opencl/source/platform/extensions.cpp index 3e66c5c552..fc759ecb6a 100644 --- a/opencl/source/platform/extensions.cpp +++ b/opencl/source/platform/extensions.cpp @@ -42,7 +42,8 @@ const char *deviceExtensionsList = "cl_khr_byte_addressable_store " "cl_khr_subgroup_shuffle " "cl_khr_subgroup_shuffle_relative " "cl_khr_subgroup_clustered_reduce " - "cl_intel_device_attribute_query "; + "cl_intel_device_attribute_query " + "cl_khr_suggested_local_work_size "; std::string getExtensionsList(const HardwareInfo &hwInfo) { std::string allExtensionsList; diff --git a/opencl/test/unit_test/api/CMakeLists.txt b/opencl/test/unit_test/api/CMakeLists.txt index 694720050f..bef784186f 100644 --- a/opencl/test/unit_test/api/CMakeLists.txt +++ b/opencl/test/unit_test/api/CMakeLists.txt @@ -83,6 +83,7 @@ set(IGDRCL_SRCS_tests_api ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_sub_group_info_khr_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_sub_group_info_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_suggested_local_work_size_intel_tests.inl + ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_suggested_local_work_size_khr_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_work_group_info_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_mem_object_info_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_pipe_info_tests.inl diff --git 
a/opencl/test/unit_test/api/api_tests_wrapper3.cpp b/opencl/test/unit_test/api/api_tests_wrapper3.cpp index a5995f1509..28afee7576 100644 --- a/opencl/test/unit_test/api/api_tests_wrapper3.cpp +++ b/opencl/test/unit_test/api/api_tests_wrapper3.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,6 +7,7 @@ #include "opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl" #include "opencl/test/unit_test/api/cl_get_kernel_suggested_local_work_size_intel_tests.inl" +#include "opencl/test/unit_test/api/cl_get_kernel_suggested_local_work_size_khr_tests.inl" #include "opencl/test/unit_test/api/cl_get_kernel_work_group_info_tests.inl" #include "opencl/test/unit_test/api/cl_get_mem_object_info_tests.inl" #include "opencl/test/unit_test/api/cl_get_pipe_info_tests.inl" diff --git a/opencl/test/unit_test/api/cl_get_device_info_tests.inl b/opencl/test/unit_test/api/cl_get_device_info_tests.inl index 93c74711d2..45f6527d58 100644 --- a/opencl/test/unit_test/api/cl_get_device_info_tests.inl +++ b/opencl/test/unit_test/api/cl_get_device_info_tests.inl @@ -281,7 +281,8 @@ TEST_F(clGetDeviceInfoTests, GivenClDeviceExtensionsParamWhenGettingDeviceInfoTh "cl_khr_subgroup_shuffle ", "cl_khr_subgroup_shuffle_relative ", "cl_khr_subgroup_clustered_reduce " - "cl_intel_device_attribute_query "}; + "cl_intel_device_attribute_query " + "cl_khr_suggested_local_work_size "}; for (auto extension : supportedExtensions) { auto foundOffset = extensionString.find(extension); diff --git a/opencl/test/unit_test/api/cl_get_extension_function_address_for_platform_tests.inl b/opencl/test/unit_test/api/cl_get_extension_function_address_for_platform_tests.inl index afe49e6877..c98bb2c9d1 100644 --- a/opencl/test/unit_test/api/cl_get_extension_function_address_for_platform_tests.inl +++ b/opencl/test/unit_test/api/cl_get_extension_function_address_for_platform_tests.inl 
@@ -83,6 +83,11 @@ TEST_F(clGetExtensionFunctionAddressForPlatformTests, GivenClGetKernelSuggestedL EXPECT_EQ(retVal, reinterpret_cast<void *>(clGetKernelSuggestedLocalWorkSizeINTEL)); } +TEST_F(clGetExtensionFunctionAddressForPlatformTests, GivenClGetKernelSuggestedLocalWorkSizeKHRWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { + auto retVal = clGetExtensionFunctionAddressForPlatform(pPlatform, "clGetKernelSuggestedLocalWorkSizeKHR"); + EXPECT_EQ(retVal, reinterpret_cast<void *>(clGetKernelSuggestedLocalWorkSizeKHR)); +} + TEST_F(clGetExtensionFunctionAddressForPlatformTests, GivenClGetKernelMaxConcurrentWorkGroupCountINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { auto retVal = clGetExtensionFunctionAddressForPlatform(pPlatform, "clGetKernelMaxConcurrentWorkGroupCountINTEL"); EXPECT_EQ(retVal, reinterpret_cast<void *>(clGetKernelMaxConcurrentWorkGroupCountINTEL)); diff --git a/opencl/test/unit_test/api/cl_get_extension_function_address_tests.inl b/opencl/test/unit_test/api/cl_get_extension_function_address_tests.inl index 0572198531..80e5e3061c 100644 --- a/opencl/test/unit_test/api/cl_get_extension_function_address_tests.inl +++ b/opencl/test/unit_test/api/cl_get_extension_function_address_tests.inl @@ -173,6 +173,11 @@ TEST_F(clGetExtensionFunctionAddressTests, GivenClGetKernelSuggestedLocalWorkSiz EXPECT_EQ(retVal, reinterpret_cast<void *>(clGetKernelSuggestedLocalWorkSizeINTEL)); } +TEST_F(clGetExtensionFunctionAddressTests, GivenClGetKernelSuggestedLocalWorkSizeKHRWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { + auto retVal = clGetExtensionFunctionAddress("clGetKernelSuggestedLocalWorkSizeKHR"); + EXPECT_EQ(retVal, reinterpret_cast<void *>(clGetKernelSuggestedLocalWorkSizeKHR)); +} + TEST_F(clGetExtensionFunctionAddressTests, GivenClGetKernelMaxConcurrentWorkGroupCountINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { auto retVal = clGetExtensionFunctionAddress("clGetKernelMaxConcurrentWorkGroupCountINTEL"); EXPECT_EQ(retVal, 
reinterpret_cast<void *>(clGetKernelMaxConcurrentWorkGroupCountINTEL)); diff --git a/opencl/test/unit_test/api/cl_get_kernel_suggested_local_work_size_khr_tests.inl b/opencl/test/unit_test/api/cl_get_kernel_suggested_local_work_size_khr_tests.inl new file mode 100644 index 0000000000..ad64cb3568 --- /dev/null +++ b/opencl/test/unit_test/api/cl_get_kernel_suggested_local_work_size_khr_tests.inl @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2020-2021 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "opencl/source/command_queue/command_queue.h" +#include "opencl/source/command_queue/gpgpu_walker.h" +#include "opencl/test/unit_test/mocks/mock_kernel.h" + +#include "cl_api_tests.h" + +using namespace NEO; + +using clGetKernelSuggestedLocalWorkSizeKHRTests = api_tests; + +namespace ULT { + +TEST_F(clGetKernelSuggestedLocalWorkSizeKHRTests, GivenInvalidInputWhenCallingGetKernelSuggestedLocalWorkSizeThenErrorIsReturned) { + size_t globalWorkOffset[3] = {}; + size_t globalWorkSize[3] = {1, 1, 1}; + size_t suggestedLocalWorkSize[3]; + cl_uint workDim = 1; + + retVal = clGetKernelSuggestedLocalWorkSizeKHR(nullptr, pMultiDeviceKernel, workDim, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_COMMAND_QUEUE, retVal); + + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, nullptr, workDim, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_KERNEL, retVal); + + pKernel->isPatchedOverride = false; + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, workDim, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_KERNEL, retVal); + pKernel->isPatchedOverride = true; + + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, workDim, + globalWorkOffset, globalWorkSize, nullptr); + EXPECT_EQ(CL_INVALID_VALUE, retVal); + + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, 0, + 
globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_WORK_DIMENSION, retVal); + + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, 4, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_WORK_DIMENSION, retVal); + + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, workDim, + globalWorkOffset, nullptr, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_GLOBAL_WORK_SIZE, retVal); + + for (size_t i = 0; i < 3; ++i) { + globalWorkSize[i] = 0; + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, 3, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_GLOBAL_WORK_SIZE, retVal); + globalWorkSize[i] = 1; + } +} + +TEST_F(clGetKernelSuggestedLocalWorkSizeKHRTests, GivenVariousInputWhenGettingSuggestedLocalWorkSizeThenCorrectValuesAreReturned) { + size_t globalWorkOffset[] = {0, 0, 0}; + size_t globalWorkSize[] = {128, 128, 128}; + size_t suggestedLocalWorkSize[] = {0, 0, 0}; + + Vec3<size_t> elws{0, 0, 0}; + Vec3<size_t> gws{128, 128, 128}; + Vec3<size_t> offset{0, 0, 0}; + DispatchInfo dispatchInfo{pDevice, pKernel, 1, gws, elws, offset}; + auto expectedLws = computeWorkgroupSize(dispatchInfo); + EXPECT_GT(expectedLws.x, 1u); + + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, 1, globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedLws.x, suggestedLocalWorkSize[0]); + EXPECT_EQ(0u, suggestedLocalWorkSize[1]); + EXPECT_EQ(0u, suggestedLocalWorkSize[2]); + + dispatchInfo.setDim(2); + expectedLws = computeWorkgroupSize(dispatchInfo); + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, 2, globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedLws.x, suggestedLocalWorkSize[0]); + EXPECT_EQ(expectedLws.y, suggestedLocalWorkSize[1]); + 
EXPECT_EQ(0u, suggestedLocalWorkSize[2]); + + dispatchInfo.setDim(3); + expectedLws = computeWorkgroupSize(dispatchInfo); + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, 3, globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedLws.x, suggestedLocalWorkSize[0]); + EXPECT_EQ(expectedLws.y, suggestedLocalWorkSize[1]); + EXPECT_EQ(expectedLws.z, suggestedLocalWorkSize[2]); + + //null global work offset is fine + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, pMultiDeviceKernel, 3, nullptr, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedLws.x, suggestedLocalWorkSize[0]); + EXPECT_EQ(expectedLws.y, suggestedLocalWorkSize[1]); + EXPECT_EQ(expectedLws.z, suggestedLocalWorkSize[2]); +} + +TEST_F(clGetKernelSuggestedLocalWorkSizeKHRTests, GivenKernelWithReqdWorkGroupSizeWhenGettingSuggestedLocalWorkSizeThenRequiredWorkSizeIsReturned) { + size_t globalWorkOffset[] = {0, 0, 0}; + size_t globalWorkSize[] = {128, 128, 128}; + size_t suggestedLocalWorkSize[] = {0, 0, 0}; + uint16_t regdLocalWorkSize[] = {32, 32, 32}; + + MockKernelWithInternals mockKernel(*pDevice); + mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] = regdLocalWorkSize[0]; + mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1] = regdLocalWorkSize[1]; + mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2] = regdLocalWorkSize[2]; + + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, mockKernel.mockMultiDeviceKernel, 3, globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(regdLocalWorkSize[0], suggestedLocalWorkSize[0]); + EXPECT_EQ(regdLocalWorkSize[1], suggestedLocalWorkSize[1]); + EXPECT_EQ(regdLocalWorkSize[2], suggestedLocalWorkSize[2]); +} + +TEST_F(clGetKernelSuggestedLocalWorkSizeKHRTests, 
GivenKernelWithExecutionEnvironmentPatchedWhenGettingSuggestedLocalWorkSizeThenCorrectValuesAreReturned) { + auto pKernelWithExecutionEnvironmentPatch = MockKernel::create(pCommandQueue->getDevice(), pProgram); + auto kernelInfos = MockKernel::toKernelInfoContainer(pKernelWithExecutionEnvironmentPatch->getKernelInfo(), testedRootDeviceIndex); + MultiDeviceKernel multiDeviceKernelWithExecutionEnvironmentPatch(MockMultiDeviceKernel::toKernelVector(pKernelWithExecutionEnvironmentPatch), kernelInfos); + + size_t globalWorkOffset[] = {0, 0, 0}; + size_t globalWorkSize[] = {128, 128, 128}; + size_t suggestedLocalWorkSize[] = {0, 0, 0}; + cl_uint workDim = 3; + + Vec3<size_t> elws{0, 0, 0}; + Vec3<size_t> gws{128, 128, 128}; + Vec3<size_t> offset{0, 0, 0}; + const DispatchInfo dispatchInfo{pDevice, pKernelWithExecutionEnvironmentPatch, workDim, gws, elws, offset}; + auto expectedLws = computeWorkgroupSize(dispatchInfo); + EXPECT_GT(expectedLws.x * expectedLws.y * expectedLws.z, 1u); + + retVal = clGetKernelSuggestedLocalWorkSizeKHR(pCommandQueue, &multiDeviceKernelWithExecutionEnvironmentPatch, workDim, globalWorkOffset, + globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedLws.x, suggestedLocalWorkSize[0]); + EXPECT_EQ(expectedLws.y, suggestedLocalWorkSize[1]); + EXPECT_EQ(expectedLws.z, suggestedLocalWorkSize[2]); +} + +} // namespace ULT