diff --git a/runtime/api/api.cpp b/runtime/api/api.cpp index 298c08509a..91bf1f85b0 100644 --- a/runtime/api/api.cpp +++ b/runtime/api/api.cpp @@ -3977,6 +3977,7 @@ void *CL_API_CALL clGetExtensionFunctionAddress(const char *funcName) { RETURN_FUNC_PTR_IF_EXIST(clEnqueueMemAdviseINTEL); RETURN_FUNC_PTR_IF_EXIST(clGetDeviceFunctionPointerINTEL); RETURN_FUNC_PTR_IF_EXIST(clGetDeviceGlobalVariablePointerINTEL); + RETURN_FUNC_PTR_IF_EXIST(clGetKernelSuggestedLocalWorkSizeINTEL); RETURN_FUNC_PTR_IF_EXIST(clGetExecutionInfoINTEL); RETURN_FUNC_PTR_IF_EXIST(clEnqueueNDRangeKernelINTEL); @@ -5199,6 +5200,58 @@ cl_int CL_API_CALL clSetProgramSpecializationConstant(cl_program program, cl_uin return retVal; } +cl_int CL_API_CALL clGetKernelSuggestedLocalWorkSizeINTEL(cl_command_queue commandQueue, + cl_kernel kernel, + cl_uint workDim, + const size_t *globalWorkOffset, + const size_t *globalWorkSize, + size_t *suggestedLocalWorkSize) { + cl_int retVal = CL_SUCCESS; + API_ENTER(&retVal); + DBG_LOG_INPUTS("commandQueue", commandQueue, "cl_kernel", kernel, + "globalWorkOffset[0]", NEO::FileLoggerInstance().getInput(globalWorkOffset, 0), + "globalWorkOffset[1]", NEO::FileLoggerInstance().getInput(globalWorkOffset, 1), + "globalWorkOffset[2]", NEO::FileLoggerInstance().getInput(globalWorkOffset, 2), + "globalWorkSize", NEO::FileLoggerInstance().getSizes(globalWorkSize, workDim, true), + "suggestedLocalWorkSize", suggestedLocalWorkSize); + + retVal = validateObjects(commandQueue, kernel); + + if (CL_SUCCESS != retVal) { + return retVal; + } + + if ((workDim == 0) || (workDim > 3)) { + retVal = CL_INVALID_WORK_DIMENSION; + return retVal; + } + + if (globalWorkOffset == nullptr) { + retVal = CL_INVALID_GLOBAL_OFFSET; + return retVal; + } + + if (globalWorkSize == nullptr) { + retVal = CL_INVALID_GLOBAL_WORK_SIZE; + return retVal; + } + + auto pKernel = castToObjectOrAbort(kernel); + if (!pKernel->isPatched()) { + retVal = CL_INVALID_KERNEL; + return retVal; + } + + if (suggestedLocalWorkSize == nullptr) { + retVal = CL_INVALID_VALUE; + return retVal; + } + + pKernel->getSuggestedLocalWorkSize(workDim, globalWorkSize, globalWorkOffset, suggestedLocalWorkSize); + + return retVal; +} + cl_int CL_API_CALL clGetExecutionInfoINTEL(cl_command_queue commandQueue, cl_kernel kernel, cl_uint workDim, diff --git a/runtime/api/api.h b/runtime/api/api.h index c90f905ce7..efb65b4b34 100644 --- a/runtime/api/api.h +++ b/runtime/api/api.h @@ -1026,6 +1026,14 @@ cl_int CL_API_CALL clGetDeviceGlobalVariablePointerINTEL( size_t *globalVariableSizeRet, void **globalVariablePointerRet); +cl_int CL_API_CALL clGetKernelSuggestedLocalWorkSizeINTEL( + cl_command_queue commandQueue, + cl_kernel kernel, + cl_uint workDim, + const size_t *globalWorkOffset, + const size_t *globalWorkSize, + size_t *suggestedLocalWorkSize); + cl_int CL_API_CALL clGetExecutionInfoINTEL( cl_command_queue commandQueue, cl_kernel kernel, diff --git a/runtime/kernel/kernel.cpp b/runtime/kernel/kernel.cpp index 892d5f89e2..8674c822f3 100644 --- a/runtime/kernel/kernel.cpp +++ b/runtime/kernel/kernel.cpp @@ -21,11 +21,13 @@ #include "runtime/built_ins/built_ins.h" #include "runtime/built_ins/builtins_dispatch_builder.h" #include "runtime/command_queue/command_queue.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/context/context.h" #include "runtime/device_queue/device_queue.h" #include "runtime/execution_model/device_enqueue.h" #include "runtime/gtpin/gtpin_notify.h" +#include "runtime/helpers/dispatch_info.h" #include "runtime/helpers/get_info.h" #include "runtime/helpers/per_thread_data.h" #include "runtime/helpers/sampler_helpers.h" @@ -999,6 +1001,31 @@ cl_int Kernel::setKernelExecutionType(cl_execution_info_kernel_type_intel execut return CL_SUCCESS; } +void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset, + size_t *localWorkSize) { + UNRECOVERABLE_IF((workDim == 0) || (workDim > 3)); + UNRECOVERABLE_IF(globalWorkOffset == nullptr); + UNRECOVERABLE_IF(globalWorkSize == nullptr); + Vec3 elws{0, 0, 0}; + Vec3 gws{ + globalWorkSize[0], + (workDim > 1) ? globalWorkSize[1] : 0, + (workDim > 2) ? globalWorkSize[2] : 0}; + Vec3 offset{ + globalWorkOffset[0], + (workDim > 1) ? globalWorkOffset[1] : 0, + (workDim > 2) ? globalWorkOffset[2] : 0}; + + const DispatchInfo dispatchInfo{this, workDim, gws, elws, offset}; + auto suggestedLws = computeWorkgroupSize(dispatchInfo); + + localWorkSize[0] = suggestedLws.x; + if (workDim > 1) + localWorkSize[1] = suggestedLws.y; + if (workDim > 2) + localWorkSize[2] = suggestedLws.z; +} + uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const { auto &hardwareInfo = getDevice().getHardwareInfo(); auto executionEnvironment = kernelInfo.patchInfo.executionEnvironment; diff --git a/runtime/kernel/kernel.h b/runtime/kernel/kernel.h index ca45be8b6c..b838e791d4 100644 --- a/runtime/kernel/kernel.h +++ b/runtime/kernel/kernel.h @@ -401,6 +401,8 @@ class Kernel : public BaseObject<_cl_kernel> { void setThreadArbitrationPolicy(uint32_t policy) { this->threadArbitrationPolicy = policy; } + void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset, + size_t *localWorkSize); uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const; protected: diff --git a/runtime/program/kernel_info.cpp b/runtime/program/kernel_info.cpp index a34328bc75..6445032a43 100644 --- a/runtime/program/kernel_info.cpp +++ b/runtime/program/kernel_info.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2019 Intel Corporation + * Copyright (C) 2017-2020 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -156,7 +156,8 @@ WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t } WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) { this->maxWorkGroupSize = dispatchInfo.getKernel()->maxKernelWorkGroupSize; - this->hasBarriers = !!dispatchInfo.getKernel()->getKernelInfo().patchInfo.executionEnvironment->HasBarriers; + auto pExecutionEnvironment = dispatchInfo.getKernel()->getKernelInfo().patchInfo.executionEnvironment; + this->hasBarriers = (pExecutionEnvironment != nullptr) && (pExecutionEnvironment->HasBarriers); this->simdSize = (uint32_t)dispatchInfo.getKernel()->getKernelInfo().getMaxSimdSize(); this->slmTotalSize = (uint32_t)dispatchInfo.getKernel()->slmTotalSize; this->coreFamily = dispatchInfo.getKernel()->getDevice().getHardwareInfo().platform.eRenderCoreFamily; diff --git a/unit_tests/api/CMakeLists.txt b/unit_tests/api/CMakeLists.txt index 84be664b38..947874734a 100644 --- a/unit_tests/api/CMakeLists.txt +++ b/unit_tests/api/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2017-2019 Intel Corporation +# Copyright (C) 2017-2020 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -83,6 +83,7 @@ set(IGDRCL_SRCS_tests_api ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_info_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_sub_group_info_khr_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_sub_group_info_tests.inl + ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_suggested_local_work_size_intel_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_kernel_work_group_info_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_mem_object_info_tests.inl ${CMAKE_CURRENT_SOURCE_DIR}/cl_get_pipe_info_tests.inl diff --git a/unit_tests/api/api_tests_wrapper3.cpp b/unit_tests/api/api_tests_wrapper3.cpp index 4245f74574..70fcbc35e0 100644 --- a/unit_tests/api/api_tests_wrapper3.cpp +++ b/unit_tests/api/api_tests_wrapper3.cpp @@ -1,11 +1,12 @@ /* - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation * * SPDX-License-Identifier: MIT * */ #include "unit_tests/api/cl_get_execution_info_intel_tests.inl" +#include "unit_tests/api/cl_get_kernel_suggested_local_work_size_intel_tests.inl" #include "unit_tests/api/cl_get_kernel_work_group_info_tests.inl" #include "unit_tests/api/cl_get_mem_object_info_tests.inl" #include "unit_tests/api/cl_get_pipe_info_tests.inl" diff --git a/unit_tests/api/cl_get_extension_function_address_for_platform_tests.inl b/unit_tests/api/cl_get_extension_function_address_for_platform_tests.inl index 5ba0f228ab..edbf21c47b 100644 --- a/unit_tests/api/cl_get_extension_function_address_for_platform_tests.inl +++ b/unit_tests/api/cl_get_extension_function_address_for_platform_tests.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2019 Intel Corporation + * Copyright (C) 2017-2020 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -78,6 +78,11 @@ TEST_F(clGetExtensionFunctionAddressForPlatformTests, GivenClGetTracingStateINTE EXPECT_EQ(retVal, reinterpret_cast(clGetTracingStateINTEL)); } +TEST_F(clGetExtensionFunctionAddressForPlatformTests, GivenClGetKernelSuggestedLocalWorkSizeINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { + auto retVal = clGetExtensionFunctionAddressForPlatform(pPlatform, "clGetKernelSuggestedLocalWorkSizeINTEL"); + EXPECT_EQ(retVal, reinterpret_cast(clGetKernelSuggestedLocalWorkSizeINTEL)); +} + TEST_F(clGetExtensionFunctionAddressForPlatformTests, GivenClGetExecutionInfoINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { auto retVal = clGetExtensionFunctionAddressForPlatform(pPlatform, "clGetExecutionInfoINTEL"); EXPECT_EQ(retVal, reinterpret_cast(clGetExecutionInfoINTEL)); diff --git a/unit_tests/api/cl_get_extension_function_address_tests.inl b/unit_tests/api/cl_get_extension_function_address_tests.inl index 3326af0139..40de145602 100644 --- a/unit_tests/api/cl_get_extension_function_address_tests.inl +++ b/unit_tests/api/cl_get_extension_function_address_tests.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017-2019 Intel Corporation + * Copyright (C) 2017-2020 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -163,6 +163,11 @@ TEST_F(clGetExtensionFunctionAddressTests, GivenClGetDeviceFunctionPointerINTELW EXPECT_EQ(retVal, reinterpret_cast(clGetDeviceFunctionPointerINTEL)); } +TEST_F(clGetExtensionFunctionAddressTests, GivenClGetKernelSuggestedLocalWorkSizeINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { + auto retVal = clGetExtensionFunctionAddress("clGetKernelSuggestedLocalWorkSizeINTEL"); + EXPECT_EQ(retVal, reinterpret_cast(clGetKernelSuggestedLocalWorkSizeINTEL)); +} + TEST_F(clGetExtensionFunctionAddressTests, GivenClGetExecutionInfoINTELWhenGettingExtensionFunctionThenCorrectAddressIsReturned) { auto retVal = clGetExtensionFunctionAddress("clGetExecutionInfoINTEL"); EXPECT_EQ(retVal, reinterpret_cast(clGetExecutionInfoINTEL)); diff --git a/unit_tests/api/cl_get_kernel_suggested_local_work_size_intel_tests.inl b/unit_tests/api/cl_get_kernel_suggested_local_work_size_intel_tests.inl new file mode 100644 index 0000000000..a0647fb8dc --- /dev/null +++ b/unit_tests/api/cl_get_kernel_suggested_local_work_size_intel_tests.inl @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "runtime/command_queue/command_queue.h" +#include "runtime/command_queue/gpgpu_walker.h" +#include "unit_tests/mocks/mock_kernel.h" + +#include "cl_api_tests.h" + +using namespace NEO; + +using clGetKernelSuggestedLocalWorkSizeTests = api_tests; + +namespace ULT { + +TEST_F(clGetKernelSuggestedLocalWorkSizeTests, GivenInvalidInputWhenCallingGetKernelSuggestedLocalWorkSizeThenErrorIsReturned) { + size_t globalWorkOffset[3]; + size_t globalWorkSize[3]; + size_t suggestedLocalWorkSize[3]; + cl_uint workDim = 1; + + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(nullptr, pKernel, workDim, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_COMMAND_QUEUE, retVal); + + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, nullptr, workDim, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_KERNEL, retVal); + + pKernel->isPatchedOverride = false; + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, pKernel, workDim, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_KERNEL, retVal); + pKernel->isPatchedOverride = true; + + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, pKernel, workDim, + globalWorkOffset, globalWorkSize, nullptr); + EXPECT_EQ(CL_INVALID_VALUE, retVal); + + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, pKernel, 0, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_WORK_DIMENSION, retVal); + + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, pKernel, 4, + globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_WORK_DIMENSION, retVal); + + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, pKernel, workDim, + nullptr, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_GLOBAL_OFFSET, retVal); + + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, pKernel, workDim, + globalWorkOffset, nullptr, suggestedLocalWorkSize); + EXPECT_EQ(CL_INVALID_GLOBAL_WORK_SIZE, retVal); +} + +TEST_F(clGetKernelSuggestedLocalWorkSizeTests, GivenVariousInputWhenGettingSuggestedLocalWorkSizeThenCorrectValuesAreReturned) { + size_t globalWorkOffset[] = {0, 0, 0}; + size_t globalWorkSize[] = {128, 128, 128}; + size_t suggestedLocalWorkSize[] = {0, 0, 0}; + + Vec3 elws{0, 0, 0}; + Vec3 gws{128, 128, 128}; + Vec3 offset{0, 0, 0}; + DispatchInfo dispatchInfo{pKernel, 1, gws, elws, offset}; + auto expectedLws = computeWorkgroupSize(dispatchInfo); + EXPECT_GT(expectedLws.x, 1u); + + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, pKernel, 1, globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedLws.x, suggestedLocalWorkSize[0]); + EXPECT_EQ(0u, suggestedLocalWorkSize[1]); + EXPECT_EQ(0u, suggestedLocalWorkSize[2]); + + dispatchInfo.setDim(2); + expectedLws = computeWorkgroupSize(dispatchInfo); + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, pKernel, 2, globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedLws.x, suggestedLocalWorkSize[0]); + EXPECT_EQ(expectedLws.y, suggestedLocalWorkSize[1]); + EXPECT_EQ(0u, suggestedLocalWorkSize[2]); + + dispatchInfo.setDim(3); + expectedLws = computeWorkgroupSize(dispatchInfo); + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, pKernel, 3, globalWorkOffset, globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedLws.x, suggestedLocalWorkSize[0]); + EXPECT_EQ(expectedLws.y, suggestedLocalWorkSize[1]); + EXPECT_EQ(expectedLws.z, suggestedLocalWorkSize[2]); +} + +TEST_F(clGetKernelSuggestedLocalWorkSizeTests, GivenKernelWithExecutionEnvironmentPatchedWhenGettingSuggestedLocalWorkSizeThenCorrectValuesAreReturned) { + std::unique_ptr kernelWithExecutionEnvironmentPatch(MockKernel::create(pCommandQueue->getDevice(), pProgram)); + + size_t globalWorkOffset[] = {0, 0, 0}; + size_t globalWorkSize[] = {128, 128, 128}; + size_t suggestedLocalWorkSize[] = {0, 0, 0}; + cl_uint workDim = 3; + + Vec3 elws{0, 0, 0}; + Vec3 gws{128, 128, 128}; + Vec3 offset{0, 0, 0}; + const DispatchInfo dispatchInfo{kernelWithExecutionEnvironmentPatch.get(), workDim, gws, elws, offset}; + auto expectedLws = computeWorkgroupSize(dispatchInfo); + EXPECT_GT(expectedLws.x * expectedLws.y * expectedLws.z, 1u); + + retVal = clGetKernelSuggestedLocalWorkSizeINTEL(pCommandQueue, kernelWithExecutionEnvironmentPatch.get(), workDim, globalWorkOffset, + globalWorkSize, suggestedLocalWorkSize); + EXPECT_EQ(CL_SUCCESS, retVal); + EXPECT_EQ(expectedLws.x, suggestedLocalWorkSize[0]); + EXPECT_EQ(expectedLws.y, suggestedLocalWorkSize[1]); + EXPECT_EQ(expectedLws.z, suggestedLocalWorkSize[2]); +} + +} // namespace ULT diff --git a/unit_tests/command_queue/local_work_size_tests.cpp b/unit_tests/command_queue/local_work_size_tests.cpp index 2d4fab8454..2fd61d72b4 100644 --- a/unit_tests/command_queue/local_work_size_tests.cpp +++ b/unit_tests/command_queue/local_work_size_tests.cpp @@ -640,6 +640,24 @@ TEST(localWorkSizeTest, givenDispatchInfoWhenWorkSizeInfoIsCreatedThenItHasCorre WorkSizeInfo workSizeInfo(dispatchInfo); EXPECT_EQ(workSizeInfo.numThreadsPerSubSlice, threadsPerEu * euPerSubSlice); } + +TEST(localWorkSizeTest, givenDispatchInfoWhenWorkSizeInfoIsCreatedThenHasBarriersIsCorrectlySet) { + MockClDevice device{new MockDevice}; + MockKernelWithInternals kernel(device); + DispatchInfo dispatchInfo; + dispatchInfo.setKernel(kernel.mockKernel); + + kernel.kernelInfo.patchInfo.executionEnvironment = nullptr; + EXPECT_FALSE(WorkSizeInfo{dispatchInfo}.hasBarriers); + + kernel.executionEnvironment.HasBarriers = 0; + kernel.kernelInfo.patchInfo.executionEnvironment = &kernel.executionEnvironment; + EXPECT_FALSE(WorkSizeInfo{dispatchInfo}.hasBarriers); + + kernel.executionEnvironment.HasBarriers = 1; + EXPECT_TRUE(WorkSizeInfo{dispatchInfo}.hasBarriers); +} + TEST(localWorkSizeTest, givenMaxWorkgroupSizeEqualToSimdSizeWhenLwsIsCalculatedThenItIsDownsizedToMaxWorkgroupSize) { WorkSizeInfo wsInfo(32, 0u, 32, 0u, platformDevices[0]->platform.eRenderCoreFamily, 32u, 0u, false, false); uint32_t workDim = 2;