diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 082254007a..67b477cc3f 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -136,11 +136,11 @@ inline void patch(const SrcT &src, void *dst, uint32_t dstOffsetBytes) { } template -void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const PatchTokenT &patch) { +void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch) { uint32_t crossThreadDataOffset = patch.DataParamOffset; uint32_t pointerSize = patch.DataParamSize; uint32_t sshOffset = patch.SurfaceStateHeapOffset; - auto rootDeviceIndex = allocation.getRootDeviceIndex(); + auto rootDeviceIndex = device.getRootDeviceIndex(); void *crossThreadData = getCrossThreadData(rootDeviceIndex); void *ssh = getSurfaceStateHeap(rootDeviceIndex); if (crossThreadData != nullptr) { @@ -157,15 +157,15 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic auto surfaceState = ptrOffset(ssh, sshOffset); void *addressToPatch = reinterpret_cast(allocation.getGpuAddressToPatch()); size_t sizeToPatch = allocation.getUnderlyingBufferSize(); - Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, sizeToPatch, addressToPatch, 0, &allocation, 0, 0); + Buffer::setSurfaceState(&device, surfaceState, sizeToPatch, addressToPatch, 0, &allocation, 0, 0); } } -template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &patch); +template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &patch); -template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessPrivateSurface &patch); +template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessPrivateSurface &patch); -template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &patch); +template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &patch); cl_int Kernel::initialize() { std::bitset<64> isDeviceInitialized{}; @@ -316,14 +316,14 @@ cl_int Kernel::initialize() { return CL_OUT_OF_RESOURCES; } const auto &patch = patchInfo.pAllocateStatelessPrivateSurface; - patchWithImplicitSurface(reinterpret_cast(kernelDeviceInfo.privateSurface->getGpuAddressToPatch()), *kernelDeviceInfo.privateSurface, *patch); + patchWithImplicitSurface(reinterpret_cast(kernelDeviceInfo.privateSurface->getGpuAddressToPatch()), *kernelDeviceInfo.privateSurface, pClDevice->getDevice(), *patch); } if (patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization) { DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr); uintptr_t constMemory = isBuiltIn ? (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch(); const auto &patch = patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization; - patchWithImplicitSurface(reinterpret_cast(constMemory), *program->getConstantSurface(rootDeviceIndex), *patch); + patchWithImplicitSurface(reinterpret_cast(constMemory), *program->getConstantSurface(rootDeviceIndex), pClDevice->getDevice(), *patch); } if (patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization) { @@ -331,7 +331,7 @@ cl_int Kernel::initialize() { uintptr_t globalMemory = isBuiltIn ? (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch(); const auto &patch = patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization; - patchWithImplicitSurface(reinterpret_cast(globalMemory), *program->getGlobalSurface(rootDeviceIndex), *patch); + patchWithImplicitSurface(reinterpret_cast(globalMemory), *program->getGlobalSurface(rootDeviceIndex), pClDevice->getDevice(), *patch); } if (patchInfo.pAllocateStatelessEventPoolSurface) { @@ -1137,6 +1137,7 @@ inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceive if (image && image->isImageFromImage()) { commandStreamReceiver.setSamplerCacheFlushRequired(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushBefore); } + memObj->getMigrateableMultiGraphicsAllocation().ensureMemoryOnDevice(*executionEnvironment.memoryManager, commandStreamReceiver.getRootDeviceIndex()); commandStreamReceiver.makeResident(*memObj->getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex())); if (memObj->getMcsAllocation()) { commandStreamReceiver.makeResident(*memObj->getMcsAllocation()); diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h index e379dd319b..3061944ddb 100644 --- a/opencl/source/kernel/kernel.h +++ b/opencl/source/kernel/kernel.h @@ -482,7 +482,7 @@ class Kernel : public BaseObject<_cl_kernel> { // Sets-up both crossThreadData and ssh for given implicit (private/constant, etc.) allocation template - void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const PatchTokenT &patch); + void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch); void getParentObjectCounts(ObjectCounts &objectCount); Kernel(Program *programArg, const KernelInfoContainer &kernelInfsoArg, bool schedulerKernel = false); diff --git a/opencl/source/program/process_device_binary.cpp b/opencl/source/program/process_device_binary.cpp index 34e7547839..20c1124fde 100644 --- a/opencl/source/program/process_device_binary.cpp +++ b/opencl/source/program/process_device_binary.cpp @@ -131,17 +131,17 @@ cl_int Program::processGenBinary(const ClDevice &clDevice) { } cleanCurrentKernelInfo(rootDeviceIndex); - for (auto &buildInfo : buildInfos) { - if (buildInfo.constantSurface || buildInfo.globalSurface) { - clDevice.getMemoryManager()->freeGraphicsMemory(buildInfo.constantSurface); - clDevice.getMemoryManager()->freeGraphicsMemory(buildInfo.globalSurface); - buildInfo.constantSurface = nullptr; - buildInfo.globalSurface = nullptr; - } + auto &buildInfo = buildInfos[rootDeviceIndex]; + + if (buildInfo.constantSurface || buildInfo.globalSurface) { + clDevice.getMemoryManager()->freeGraphicsMemory(buildInfo.constantSurface); + clDevice.getMemoryManager()->freeGraphicsMemory(buildInfo.globalSurface); + buildInfo.constantSurface = nullptr; + buildInfo.globalSurface = nullptr; } ProgramInfo programInfo; - auto blob = ArrayRef(reinterpret_cast(this->buildInfos[rootDeviceIndex].unpackedDeviceBinary.get()), this->buildInfos[rootDeviceIndex].unpackedDeviceBinarySize); + auto blob = ArrayRef(reinterpret_cast(buildInfo.unpackedDeviceBinary.get()), buildInfo.unpackedDeviceBinarySize); SingleDeviceBinary binary = {}; binary.deviceBinary = blob; std::string decodeErrors; diff --git a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp index aaa6758e52..3e9d3d20e1 100644 --- a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp @@ -130,6 +130,30 @@ TEST_F(MultiDeviceKernelArgBufferTest, GivenValidBufferWhenSettingKernelArgThenB } } +TEST_F(MultiDeviceKernelArgBufferTest, WhenMakingKernelArgResidentThenMemoryIsTransferredToProperDevice) { + + auto pKernel = std::unique_ptr(Kernel::create(pProgram.get(), kernelInfos, nullptr)); + + EXPECT_NE(nullptr, pKernel); + cl_mem val = pBuffer.get(); + auto pVal = &val; + + auto retVal = pKernel->setArg(0, sizeof(cl_mem *), pVal); + EXPECT_EQ(CL_SUCCESS, retVal); + + auto csr1 = deviceFactory.rootDevices[1]->getDefaultEngine().commandStreamReceiver; + auto csr2 = deviceFactory.rootDevices[2]->getDefaultEngine().commandStreamReceiver; + + pKernel->makeResident(*csr1); + EXPECT_EQ(1u, pBuffer->getMultiGraphicsAllocation().getLastUsedRootDeviceIndex()); + + pKernel->makeResident(*csr2); + EXPECT_EQ(2u, pBuffer->getMultiGraphicsAllocation().getLastUsedRootDeviceIndex()); + + pKernel->makeResident(*csr1); + EXPECT_EQ(1u, pBuffer->getMultiGraphicsAllocation().getLastUsedRootDeviceIndex()); +} + TEST_F(KernelArgBufferTest, GivenSvmPtrStatelessWhenSettingKernelArgThenArgumentsAreSetCorrectly) { Buffer *buffer = new MockBuffer(); diff --git a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp index c537abc2a6..daf19cf13e 100644 --- a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp @@ -259,7 +259,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) { RENDER_SURFACE_STATE *surfState = reinterpret_cast(pKernel->getSurfaceStateHeap(rootDeviceIndex)); memset(surfState, 0, rendSurfSize); - pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch); + pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, *pDevice, patch); // verify cross thread data was properly patched EXPECT_EQ(ptrToPatch, *reinterpret_cast(pKernel->getCrossThreadData(rootDeviceIndex))); @@ -280,7 +280,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) { // when cross thread and ssh data is not available then should not do anything pKernel->setCrossThreadData(nullptr, 0); pKernel->setSshLocal(nullptr, 0, rootDeviceIndex); - pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch); + pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, *pDevice, patch); } } diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index 04c36b0938..5cf026732f 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -2830,7 +2830,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionEnabledWhenPatchWithImplicitS SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization patchToken{}; uint64_t crossThreadData = 0; EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size()); - kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, patchToken); + kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, device->getDevice(), patchToken); EXPECT_EQ(1u, kernel.mockKernel->getPatchInfoDataList().size()); } @@ -2841,7 +2841,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionDisabledWhenPatchWithImplicit SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization patchToken{}; uint64_t crossThreadData = 0; EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size()); - kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, patchToken); + kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, device->getDevice(), patchToken); EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size()); } diff --git a/opencl/test/unit_test/program/program_tests.cpp b/opencl/test/unit_test/program/program_tests.cpp index 5512c128fd..c4ee780205 100644 --- a/opencl/test/unit_test/program/program_tests.cpp +++ b/opencl/test/unit_test/program/program_tests.cpp @@ -1936,6 +1936,58 @@ TEST_F(ProgramTests, givenProgramFromGenBinaryWhenSLMSizeIsBiggerThenDeviceLimit EXPECT_EQ(CL_OUT_OF_RESOURCES, retVal); } +TEST_F(ProgramTests, givenExistingConstantSurfacesWhenProcessGenBinaryThenCleanupTheSurfaceOnlyForSpecificDevice) { + PatchTokensTestData::ValidProgramWithKernelUsingSlm patchtokensProgram; + + auto program = std::make_unique(nullptr, false, toClDeviceVector(*pClDevice)); + + program->buildInfos.resize(2); + program->buildInfos[0].constantSurface = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties({rootDeviceIndex, MemoryConstants::cacheLineSize, + GraphicsAllocation::AllocationType::CONSTANT_SURFACE, pDevice->getDeviceBitfield()}); + program->buildInfos[1].constantSurface = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties({rootDeviceIndex, MemoryConstants::cacheLineSize, + GraphicsAllocation::AllocationType::CONSTANT_SURFACE, pDevice->getDeviceBitfield()}); + program->buildInfos[rootDeviceIndex].unpackedDeviceBinary = makeCopy(patchtokensProgram.storage.data(), patchtokensProgram.storage.size()); + program->buildInfos[rootDeviceIndex].unpackedDeviceBinarySize = patchtokensProgram.storage.size(); + + auto constantSurface0 = program->buildInfos[0].constantSurface; + EXPECT_NE(nullptr, constantSurface0); + auto constantSurface1 = program->buildInfos[1].constantSurface; + EXPECT_NE(nullptr, constantSurface1); + + auto retVal = program->processGenBinary(*pClDevice); + + EXPECT_EQ(nullptr, program->buildInfos[0].constantSurface); + EXPECT_EQ(constantSurface1, program->buildInfos[1].constantSurface); + + EXPECT_EQ(CL_SUCCESS, retVal); +} + +TEST_F(ProgramTests, givenExistingGlobalSurfacesWhenProcessGenBinaryThenCleanupTheSurfaceOnlyForSpecificDevice) { + PatchTokensTestData::ValidProgramWithKernelUsingSlm patchtokensProgram; + + auto program = std::make_unique(nullptr, false, toClDeviceVector(*pClDevice)); + + program->buildInfos.resize(2); + program->buildInfos[0].globalSurface = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties({rootDeviceIndex, MemoryConstants::cacheLineSize, + GraphicsAllocation::AllocationType::GLOBAL_SURFACE, pDevice->getDeviceBitfield()}); + program->buildInfos[1].globalSurface = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties({rootDeviceIndex, MemoryConstants::cacheLineSize, + GraphicsAllocation::AllocationType::GLOBAL_SURFACE, pDevice->getDeviceBitfield()}); + program->buildInfos[rootDeviceIndex].unpackedDeviceBinary = makeCopy(patchtokensProgram.storage.data(), patchtokensProgram.storage.size()); + program->buildInfos[rootDeviceIndex].unpackedDeviceBinarySize = patchtokensProgram.storage.size(); + + auto globaltSurface0 = program->buildInfos[0].globalSurface; + EXPECT_NE(nullptr, globaltSurface0); + auto globalSurface1 = program->buildInfos[1].globalSurface; + EXPECT_NE(nullptr, globalSurface1); + + auto retVal = program->processGenBinary(*pClDevice); + + EXPECT_EQ(nullptr, program->buildInfos[0].globalSurface); + EXPECT_EQ(globalSurface1, program->buildInfos[1].globalSurface); + + EXPECT_EQ(CL_SUCCESS, retVal); +} + TEST_F(ProgramTests, GivenNoCompilerInterfaceRootDeviceEnvironmentWhenRebuildingBinaryThenOutOfHostMemoryErrorIsReturned) { auto pDevice = pContext->getDevice(0); auto executionEnvironment = pDevice->getExecutionEnvironment();