// Patch overview: unified git diff touching six files under opencl/. The line
// breaks inside the hunks are collapsed in this copy; the payload below is
// left exactly as found.
//
// - program.h / program.cpp: add the member `clDevicesInProgram` and the
//   accessor `Program::getDevicesInProgram()`, which returns `clDevices` while
//   `clDevicesInProgram` is empty and `clDevicesInProgram` otherwise.
// - program.cpp, `Program::setBuildStatusSuccess()`: for every device in
//   `deviceVector` and every entry of its `associatedSubDevices`, appends the
//   device to `clDevicesInProgram` (under `lockMutex`) when the stored
//   `programBinaryType` differs from `binaryType`, then stores `binaryType`.
// - api.cpp (`clCreateKernel`, `clCreateKernelsInProgram`),
//   multi_device_kernel.cpp (`MultiDeviceKernel::getDevices`) and
//   multi_device_kernel.h: use `getDevicesInProgram()` where `getDevices()`
//   was used before.
// - cl_build_program_tests.inl: adds four tests:
//     1. program created from one binary for six devices builds successfully;
//     2. build for pRootDevice0 only, then the kernel's first three devices
//        are pRootDevice0, pSubDevice00, pSubDevice01;
//     3. building for a second device while a kernel exists returns
//        CL_INVALID_OPERATION, and succeeds after the kernel is released;
//     4. separate builds for both root devices, then kernel->getDevices()
//        equals program->getDevices().
//
// NOTE(review): nothing in these hunks removes entries from
//   `clDevicesInProgram` or checks whether a device is already present. A
//   device whose binary type changes between two successful calls (presumably
//   e.g. compile followed by link; confirm against callers) would be appended
//   twice.
// NOTE(review): `lockMutex` is held only around each `push_back`. The
//   `deviceBuildInfos` reads/writes in the same loop and the reference returned
//   by `getDevicesInProgram()` are not covered; confirm the intended threading
//   contract.
// NOTE(review): test 4 compares against `program->getDevices()`, not
//   `getDevicesInProgram()`, so it relies on the append order matching the
//   order of `clDevices`; verify that this is intended.
// NOTE(review): template arguments look stripped in this copy (e.g.
//   `std::unique_ptr pBinary`, `reinterpret_cast(pBinary.get())`,
//   `castToObject(pKernel)`, `std::unique_lock lock(lockMutex)`,
//   `const std::vector &`); restore them from the original commit before use.
// NOTE(review): the program.cpp hunk header says `-498,9 +498,17`, but the
//   visible text shows ten context lines; re-generate the patch rather than
//   hand-editing it.
diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index 28b330bee5..3190a7a5c2 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -1731,7 +1731,7 @@ cl_kernel CL_API_CALL clCreateKernel(cl_program clProgram, KernelInfoContainer kernelInfos; kernelInfos.resize(pProgram->getMaxRootDeviceIndex() + 1); - for (const auto &pClDevice : pProgram->getDevices()) { + for (const auto &pClDevice : pProgram->getDevicesInProgram()) { auto rootDeviceIndex = pClDevice->getRootDeviceIndex(); auto pKernelInfo = pProgram->getKernelInfo(kernelName, rootDeviceIndex); if (pKernelInfo) { @@ -1786,7 +1786,7 @@ cl_int CL_API_CALL clCreateKernelsInProgram(cl_program clProgram, for (unsigned int i = 0; i < numKernelsInProgram; ++i) { KernelInfoContainer kernelInfos; kernelInfos.resize(pProgram->getMaxRootDeviceIndex() + 1); - for (const auto &pClDevice : pProgram->getDevices()) { + for (const auto &pClDevice : pProgram->getDevicesInProgram()) { auto rootDeviceIndex = pClDevice->getRootDeviceIndex(); auto kernelInfo = pProgram->getKernelInfo(i, rootDeviceIndex); DEBUG_BREAK_IF(kernelInfo == nullptr); diff --git a/opencl/source/kernel/multi_device_kernel.cpp b/opencl/source/kernel/multi_device_kernel.cpp index 4faa8909d8..255a178f8c 100644 --- a/opencl/source/kernel/multi_device_kernel.cpp +++ b/opencl/source/kernel/multi_device_kernel.cpp @@ -40,7 +40,7 @@ MultiDeviceKernel::MultiDeviceKernel(KernelVectorType kernelVector, const Kernel const std::vector &MultiDeviceKernel::getKernelArguments() const { return defaultKernel->getKernelArguments(); } cl_int MultiDeviceKernel::getInfo(cl_kernel_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { return defaultKernel->getInfo(paramName, paramValueSize, paramValue, paramValueSizeRet); } cl_int MultiDeviceKernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { return 
defaultKernel->getArgInfo(argIndx, paramName, paramValueSize, paramValue, paramValueSizeRet); } -const ClDeviceVector &MultiDeviceKernel::getDevices() const { return program->getDevices(); } +const ClDeviceVector &MultiDeviceKernel::getDevices() const { return program->getDevicesInProgram(); } size_t MultiDeviceKernel::getKernelArgsNumber() const { return defaultKernel->getKernelArgsNumber(); } Context &MultiDeviceKernel::getContext() const { return defaultKernel->getContext(); } bool MultiDeviceKernel::getHasIndirectAccess() const { return defaultKernel->getHasIndirectAccess(); } diff --git a/opencl/source/kernel/multi_device_kernel.h b/opencl/source/kernel/multi_device_kernel.h index bcef3cf1cd..a928dc7034 100644 --- a/opencl/source/kernel/multi_device_kernel.h +++ b/opencl/source/kernel/multi_device_kernel.h @@ -32,7 +32,7 @@ class MultiDeviceKernel : public BaseObject<_cl_kernel> { KernelVectorType kernels{}; kernels.resize(program->getMaxRootDeviceIndex() + 1); - for (auto &pDevice : program->getDevices()) { + for (auto &pDevice : program->getDevicesInProgram()) { auto rootDeviceIndex = pDevice->getRootDeviceIndex(); if (kernels[rootDeviceIndex]) { continue; diff --git a/opencl/source/program/program.cpp b/opencl/source/program/program.cpp index 9862d90cab..9760139742 100644 --- a/opencl/source/program/program.cpp +++ b/opencl/source/program/program.cpp @@ -498,9 +498,17 @@ void Program::setBuildStatus(cl_build_status status) { void Program::setBuildStatusSuccess(const ClDeviceVector &deviceVector, cl_program_binary_type binaryType) { for (const auto &device : deviceVector) { deviceBuildInfos[device].buildStatus = CL_BUILD_SUCCESS; + if (deviceBuildInfos[device].programBinaryType != binaryType) { + std::unique_lock lock(lockMutex); + clDevicesInProgram.push_back(device); + } deviceBuildInfos[device].programBinaryType = binaryType; for (const auto &subDevice : deviceBuildInfos[device].associatedSubDevices) { deviceBuildInfos[subDevice].buildStatus = 
CL_BUILD_SUCCESS; + if (deviceBuildInfos[subDevice].programBinaryType != binaryType) { + std::unique_lock lock(lockMutex); + clDevicesInProgram.push_back(subDevice); + } deviceBuildInfos[subDevice].programBinaryType = binaryType; } } } @@ -552,4 +560,12 @@ void Program::prependFilePathToOptions(const std::string &filename) { } } +const ClDeviceVector &Program::getDevicesInProgram() const { + if (clDevicesInProgram.empty()) { + return clDevices; + } else { + return clDevicesInProgram; + } +} + } // namespace NEO diff --git a/opencl/source/program/program.h b/opencl/source/program/program.h index 8a99d21928..0be58a4347 100644 --- a/opencl/source/program/program.h +++ b/opencl/source/program/program.h @@ -256,6 +256,7 @@ class Program : public BaseObject<_cl_program> { void invokeCallback(void(CL_CALLBACK *funcNotify)(cl_program program, void *userData), void *userData); const ClDeviceVector &getDevices() const { return clDevices; } + const ClDeviceVector &getDevicesInProgram() const; bool isDeviceAssociated(const ClDevice &clDevice) const; static cl_int processInputDevices(ClDeviceVector *&deviceVectorPtr, cl_uint numDevices, const cl_device_id *deviceList, const ClDeviceVector &allAvailableDevices); @@ -361,6 +362,7 @@ class Program : public BaseObject<_cl_program> { ExecutionEnvironment &executionEnvironment; Context *context = nullptr; ClDeviceVector clDevices; + ClDeviceVector clDevicesInProgram; bool isBuiltIn = false; bool kernelDebugEnabled = false; diff --git a/opencl/test/unit_test/api/cl_build_program_tests.inl b/opencl/test/unit_test/api/cl_build_program_tests.inl index a0a40dccd6..6f48eb37da 100644 --- a/opencl/test/unit_test/api/cl_build_program_tests.inl +++ b/opencl/test/unit_test/api/cl_build_program_tests.inl @@ -119,6 +119,57 @@ TEST_F(clBuildProgramTests, GivenBinaryAsInputWhenCreatingProgramWithSourceThenP EXPECT_EQ(CL_SUCCESS, retVal); } +TEST_F(clBuildProgramTests, 
GivenBinaryAsInputWhenCreatingProgramWithBinaryForMultipleDevicesThenProgramBuildSucceeds) { + MockUnrestrictiveContextMultiGPU context; + cl_program pProgram = nullptr; + cl_int binaryStatus = CL_SUCCESS; + std::unique_ptr pBinary = nullptr; + size_t binarySize = 0; + std::string testFile; + retrieveBinaryKernelFilename(testFile, "CopyBuffer_simd16_", ".bin"); + + pBinary = loadDataFromFile( + testFile.c_str(), + binarySize); + + ASSERT_NE(0u, binarySize); + ASSERT_NE(nullptr, pBinary); + + const size_t numBinaries = 6; + const unsigned char *binaries[numBinaries]; + std::fill(binaries, binaries + numBinaries, reinterpret_cast(pBinary.get())); + cl_device_id devicesForProgram[] = {context.pRootDevice0, context.pSubDevice00, context.pSubDevice01, context.pRootDevice1, context.pSubDevice10, context.pSubDevice11}; + size_t sizeBinaries[numBinaries]; + std::fill(sizeBinaries, sizeBinaries + numBinaries, binarySize); + + pProgram = clCreateProgramWithBinary( + &context, + numBinaries, + devicesForProgram, + sizeBinaries, + binaries, + &binaryStatus, + &retVal); + + pBinary.reset(); + + EXPECT_NE(nullptr, pProgram); + ASSERT_EQ(CL_SUCCESS, retVal); + + retVal = clBuildProgram( + pProgram, + 0, + nullptr, + nullptr, + nullptr, + nullptr); + + ASSERT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); +} + TEST_F(clBuildProgramTests, GivenProgramCreatedFromBinaryWhenBuildProgramWithOptionsIsCalledThenStoredOptionsAreUsed) { cl_program pProgram = nullptr; cl_int binaryStatus = CL_SUCCESS; @@ -533,4 +584,192 @@ TEST(clBuildProgramTest, givenMultiDeviceProgramWithCreatedKernelsWhenBuildingTh retVal = clReleaseProgram(pProgram); EXPECT_EQ(CL_SUCCESS, retVal); } + +TEST(clBuildProgramTest, givenMultiDeviceProgramWithProgramBuiltForSingleDeviceWhenCreatingKernelThenProgramAndKernelDevicesMatchAndSuccessIsReturned) { + MockUnrestrictiveContextMultiGPU context; + cl_program pProgram = nullptr; + size_t sourceSize = 0; + cl_int retVal 
= CL_INVALID_PROGRAM; + std::string testFile; + + testFile.append(clFiles); + testFile.append("copybuffer.cl"); + auto pSource = loadDataFromFile( + testFile.c_str(), + sourceSize); + + ASSERT_NE(0u, sourceSize); + ASSERT_NE(nullptr, pSource); + + const char *sources[1] = {pSource.get()}; + pProgram = clCreateProgramWithSource( + &context, + 1, + sources, + &sourceSize, + &retVal); + + EXPECT_NE(nullptr, pProgram); + ASSERT_EQ(CL_SUCCESS, retVal); + + cl_device_id firstDevice = context.pRootDevice0; + cl_device_id firstSubDevice = context.pSubDevice00; + cl_device_id secondSubDevice = context.pSubDevice01; + + retVal = clBuildProgram( + pProgram, + 1, + &firstDevice, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + cl_kernel pKernel = clCreateKernel(pProgram, "fullCopy", &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + + MultiDeviceKernel *kernel = castToObject(pKernel); + auto devs = kernel->getDevices(); + EXPECT_EQ(devs[0], firstDevice); + EXPECT_EQ(devs[1], firstSubDevice); + EXPECT_EQ(devs[2], secondSubDevice); + + retVal = clReleaseKernel(pKernel); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST(clBuildProgramTest, givenMultiDeviceProgramWithProgramBuiltForSingleDeviceWithCreatedKernelWhenBuildingProgramForSecondDeviceThenInvalidOperationReturned) { + MockUnrestrictiveContextMultiGPU context; + cl_program pProgram = nullptr; + size_t sourceSize = 0; + cl_int retVal = CL_INVALID_PROGRAM; + std::string testFile; + + testFile.append(clFiles); + testFile.append("copybuffer.cl"); + auto pSource = loadDataFromFile( + testFile.c_str(), + sourceSize); + + ASSERT_NE(0u, sourceSize); + ASSERT_NE(nullptr, pSource); + + const char *sources[1] = {pSource.get()}; + pProgram = clCreateProgramWithSource( + &context, + 1, + sources, + &sourceSize, + &retVal); + + EXPECT_NE(nullptr, pProgram); + ASSERT_EQ(CL_SUCCESS, retVal); + + cl_device_id firstDevice = context.pRootDevice0; + cl_device_id 
secondDevice = context.pRootDevice1; + + retVal = clBuildProgram( + pProgram, + 1, + &firstDevice, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + cl_kernel kernel = clCreateKernel(pProgram, "fullCopy", &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clBuildProgram( + pProgram, + 1, + &secondDevice, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_INVALID_OPERATION, retVal); + + retVal = clReleaseKernel(kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clBuildProgram( + pProgram, + 1, + &secondDevice, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + kernel = clCreateKernel(pProgram, "fullCopy", &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clReleaseKernel(kernel); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST(clBuildProgramTest, givenMultiDeviceProgramWithProgramBuiltForMultipleDevicesSeparatelyWithCreatedKernelThenProgramAndKernelDevicesMatch) { + MockUnrestrictiveContextMultiGPU context; + cl_program pProgram = nullptr; + size_t sourceSize = 0; + cl_int retVal = CL_INVALID_PROGRAM; + std::string testFile; + + testFile.append(clFiles); + testFile.append("copybuffer.cl"); + auto pSource = loadDataFromFile( + testFile.c_str(), + sourceSize); + + ASSERT_NE(0u, sourceSize); + ASSERT_NE(nullptr, pSource); + + const char *sources[1] = {pSource.get()}; + pProgram = clCreateProgramWithSource( + &context, + 1, + sources, + &sourceSize, + &retVal); + + EXPECT_NE(nullptr, pProgram); + ASSERT_EQ(CL_SUCCESS, retVal); + + cl_device_id firstDevice = context.pRootDevice0; + cl_device_id secondDevice = context.pRootDevice1; + + retVal = clBuildProgram( + pProgram, + 1, + &firstDevice, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + retVal = clBuildProgram( + pProgram, + 1, + &secondDevice, + nullptr, + nullptr, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + cl_kernel pKernel = clCreateKernel(pProgram, "fullCopy", 
&retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + + MultiDeviceKernel *kernel = castToObject(pKernel); + Program *program = castToObject(pProgram); + EXPECT_EQ(kernel->getDevices(), program->getDevices()); + + retVal = clReleaseKernel(pKernel); + EXPECT_EQ(CL_SUCCESS, retVal); + retVal = clReleaseProgram(pProgram); + EXPECT_EQ(CL_SUCCESS, retVal); +} } // namespace ULT