Handle SVM allocations from multi root device contexts

Related-To: NEO-5001, NEO-3691
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2021-03-23 10:16:23 +00:00
committed by Compute-Runtime-Automation
parent 56b2686f0d
commit d6bbe48175
7 changed files with 144 additions and 48 deletions

View File

@ -4828,7 +4828,7 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel,
}
}
GraphicsAllocation *pSvmAlloc = nullptr;
MultiGraphicsAllocation *pSvmAllocs = nullptr;
if (argValue != nullptr) {
auto svmManager = pMultiDeviceKernel->getContext().getSVMAllocsManager();
auto svmData = svmManager->getSVMAlloc(argValue);
@ -4841,11 +4841,11 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel,
}
}
} else {
pSvmAlloc = svmData->gpuAllocations.getGraphicsAllocation(pMultiDeviceKernel->getDevices()[0]->getRootDeviceIndex());
pSvmAllocs = &svmData->gpuAllocations;
}
}
retVal = pMultiDeviceKernel->setArgSvmAlloc(argIndex, const_cast<void *>(argValue), pSvmAlloc);
retVal = pMultiDeviceKernel->setArgSvmAlloc(argIndex, const_cast<void *>(argValue), pSvmAllocs);
TRACING_EXIT(clSetKernelArgSVMPointer, &retVal);
return retVal;
}
@ -4916,12 +4916,12 @@ cl_int CL_API_CALL clSetKernelExecInfo(cl_kernel kernel,
TRACING_EXIT(clSetKernelExecInfo, &retVal);
return retVal;
}
GraphicsAllocation *svmAlloc = svmData->gpuAllocations.getGraphicsAllocation(pMultiDeviceKernel->getDevices()[0]->getRootDeviceIndex());
auto &svmAllocs = svmData->gpuAllocations;
if (paramName == CL_KERNEL_EXEC_INFO_SVM_PTRS) {
pMultiDeviceKernel->setSvmKernelExecInfo(svmAlloc);
pMultiDeviceKernel->setSvmKernelExecInfo(svmAllocs);
} else {
pMultiDeviceKernel->setUnifiedMemoryExecInfo(svmAlloc);
pMultiDeviceKernel->setUnifiedMemoryExecInfo(svmAllocs);
}
}
break;

View File

@ -467,9 +467,12 @@ cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
}
// copy additional information other than argument values set to source kernel with clSetKernelExecInfo
for (auto gfxAlloc : pSourceKernel->kernelSvmGfxAllocations) {
for (auto &gfxAlloc : pSourceKernel->kernelSvmGfxAllocations) {
kernelSvmGfxAllocations.push_back(gfxAlloc);
}
for (auto &gfxAlloc : pSourceKernel->kernelUnifiedMemoryGfxAllocations) {
kernelUnifiedMemoryGfxAllocations.push_back(gfxAlloc);
}
this->isBuiltIn = pSourceKernel->isBuiltIn;

View File

@ -48,11 +48,8 @@ bool MultiDeviceKernel::getHasIndirectAccess() const { return defaultKernel->get
cl_int MultiDeviceKernel::checkCorrectImageAccessQualifier(cl_uint argIndex, size_t argSize, const void *argValue) const { return getResultFromEachKernel(&Kernel::checkCorrectImageAccessQualifier, argIndex, argSize, argValue); }
void MultiDeviceKernel::unsetArg(uint32_t argIndex) { callOnEachKernel(&Kernel::unsetArg, argIndex); }
cl_int MultiDeviceKernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) { return getResultFromEachKernel(&Kernel::setArgument, argIndex, argSize, argVal); }
cl_int MultiDeviceKernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocation *svmAlloc) { return getResultFromEachKernel(&Kernel::setArgSvmAlloc, argIndex, svmPtr, svmAlloc); }
void MultiDeviceKernel::setUnifiedMemoryProperty(cl_kernel_exec_info infoType, bool infoValue) { callOnEachKernel(&Kernel::setUnifiedMemoryProperty, infoType, infoValue); }
void MultiDeviceKernel::setSvmKernelExecInfo(GraphicsAllocation *argValue) { callOnEachKernel(&Kernel::setSvmKernelExecInfo, argValue); }
void MultiDeviceKernel::clearSvmKernelExecInfo() { callOnEachKernel(&Kernel::clearSvmKernelExecInfo); }
void MultiDeviceKernel::setUnifiedMemoryExecInfo(GraphicsAllocation *argValue) { callOnEachKernel(&Kernel::setUnifiedMemoryExecInfo, argValue); }
void MultiDeviceKernel::clearUnifiedMemoryExecInfo() { callOnEachKernel(&Kernel::clearUnifiedMemoryExecInfo); }
int MultiDeviceKernel::setKernelThreadArbitrationPolicy(uint32_t propertyValue) { return getResultFromEachKernel(&Kernel::setKernelThreadArbitrationPolicy, propertyValue); }
cl_int MultiDeviceKernel::setKernelExecutionType(cl_execution_info_kernel_type_intel executionType) { return getResultFromEachKernel(&Kernel::setKernelExecutionType, executionType); }
@ -68,4 +65,30 @@ cl_int MultiDeviceKernel::cloneKernel(MultiDeviceKernel *pSourceMultiDeviceKerne
}
return CL_SUCCESS;
}
// Sets argument `argIndex` to the SVM pointer `svmPtr` on the kernel of every root device.
// Each kernel receives the graphics allocation that `svmAllocs` holds for its own root
// device index; when `svmAllocs` is nullptr every kernel receives a null allocation.
// All kernels are always visited. Returns the first non-CL_SUCCESS code reported by a
// per-device kernel, or CL_SUCCESS when every call succeeded.
cl_int MultiDeviceKernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, MultiGraphicsAllocation *svmAllocs) {
    cl_int result = CL_SUCCESS;
    for (auto rootDeviceIndex = 0u; rootDeviceIndex < kernels.size(); rootDeviceIndex++) {
        auto pKernel = getKernel(rootDeviceIndex);
        if (pKernel) {
            auto svmAlloc = svmAllocs ? svmAllocs->getGraphicsAllocation(rootDeviceIndex) : nullptr;
            const cl_int kernelResult = pKernel->setArgSvmAlloc(argIndex, svmPtr, svmAlloc);
            // Keep only the first failure so the API caller sees the original error cause.
            if (result == CL_SUCCESS && kernelResult != CL_SUCCESS) {
                result = kernelResult;
            }
        }
    }
    return result;
}
void MultiDeviceKernel::setSvmKernelExecInfo(const MultiGraphicsAllocation &argValue) {
for (auto rootDeviceIndex = 0u; rootDeviceIndex < kernels.size(); rootDeviceIndex++) {
auto pKernel = getKernel(rootDeviceIndex);
if (pKernel) {
pKernel->setSvmKernelExecInfo(argValue.getGraphicsAllocation(rootDeviceIndex));
}
}
}
void MultiDeviceKernel::setUnifiedMemoryExecInfo(const MultiGraphicsAllocation &argValue) {
for (auto rootDeviceIndex = 0u; rootDeviceIndex < kernels.size(); rootDeviceIndex++) {
auto pKernel = getKernel(rootDeviceIndex);
if (pKernel) {
pKernel->setUnifiedMemoryExecInfo(argValue.getGraphicsAllocation(rootDeviceIndex));
}
}
}
} // namespace NEO

View File

@ -54,12 +54,12 @@ class MultiDeviceKernel : public BaseObject<_cl_kernel> {
const ClDeviceVector &getDevices() const;
size_t getKernelArgsNumber() const;
Context &getContext() const;
cl_int setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocation *svmAlloc);
cl_int setArgSvmAlloc(uint32_t argIndex, void *svmPtr, MultiGraphicsAllocation *svmAllocs);
bool getHasIndirectAccess() const;
void setUnifiedMemoryProperty(cl_kernel_exec_info infoType, bool infoValue);
void setSvmKernelExecInfo(GraphicsAllocation *argValue);
void setSvmKernelExecInfo(const MultiGraphicsAllocation &argValue);
void clearSvmKernelExecInfo();
void setUnifiedMemoryExecInfo(GraphicsAllocation *argValue);
void setUnifiedMemoryExecInfo(const MultiGraphicsAllocation &argValue);
void clearUnifiedMemoryExecInfo();
int setKernelThreadArbitrationPolicy(uint32_t propertyValue);
cl_int setKernelExecutionType(cl_execution_info_kernel_type_intel executionType);

View File

@ -494,35 +494,42 @@ TEST_F(CloneKernelTest, GivenArgSvmWhenCloningKernelThenKernelInfoIsCorrect) {
}
TEST_F(CloneKernelTest, GivenArgSvmAllocWhenCloningKernelThenKernelInfoIsCorrect) {
char *svmPtr = new char[256];
MockGraphicsAllocation svmAlloc(svmPtr, 256);
char memory[100] = {};
MultiGraphicsAllocation multiGraphicsAllocation(3);
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
auto svmAlloc = new MockGraphicsAllocation(rootDeviceIndex, memory, 100);
multiGraphicsAllocation.addAllocation(svmAlloc);
}
auto rootDeviceIndex = *context->getRootDeviceIndices().begin();
retVal = pSourceMultiDeviceKernel->setArgSvmAlloc(0, svmPtr, &svmAlloc);
retVal = pSourceMultiDeviceKernel->setArgSvmAlloc(0, memory, &multiGraphicsAllocation);
ASSERT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(1u, pSourceKernel[rootDeviceIndex]->getKernelArguments().size());
EXPECT_EQ(Kernel::SVM_ALLOC_OBJ, pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).type);
EXPECT_NE(0u, pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).size);
EXPECT_EQ(1u, pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum());
EXPECT_TRUE(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
EXPECT_EQ(1u, pSourceKernel[rootDeviceIndex]->getKernelArguments().size());
EXPECT_EQ(multiGraphicsAllocation.getGraphicsAllocation(rootDeviceIndex), pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).object);
EXPECT_EQ(Kernel::SVM_ALLOC_OBJ, pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).type);
EXPECT_NE(0u, pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).size);
EXPECT_EQ(1u, pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum());
EXPECT_TRUE(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
}
retVal = pClonedMultiDeviceKernel->cloneKernel(pSourceMultiDeviceKernel.get());
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArguments().size(), pClonedKernel[rootDeviceIndex]->getKernelArguments().size());
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).type, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).type);
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).object, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).object);
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).value, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).value);
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).size, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).size);
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArguments().size(), pClonedKernel[rootDeviceIndex]->getKernelArguments().size());
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).type, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).type);
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).object, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).object);
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).value, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).value);
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).size, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).size);
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
auto pKernelArg = (void **)(pClonedKernel[rootDeviceIndex]->getCrossThreadData() +
pClonedKernel[rootDeviceIndex]->getKernelInfo().kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
EXPECT_EQ(svmPtr, *pKernelArg);
delete[] svmPtr;
auto pKernelArg = (void **)(pClonedKernel[rootDeviceIndex]->getCrossThreadData() +
pClonedKernel[rootDeviceIndex]->getKernelInfo().kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
EXPECT_EQ(memory, *pKernelArg);
delete multiGraphicsAllocation.getGraphicsAllocation(rootDeviceIndex);
}
}
TEST_F(CloneKernelTest, GivenArgImmediateWhenCloningKernelThenKernelInfoIsCorrect) {
@ -565,13 +572,14 @@ TEST_F(CloneKernelTest, GivenExecInfoWhenCloningKernelThenSvmAllocationIsCorrect
auto svmData = context->getSVMAllocsManager()->getSVMAlloc(ptrSVM);
ASSERT_NE(nullptr, svmData);
GraphicsAllocation *pSvmAlloc = svmData->gpuAllocations.getGraphicsAllocation(device1->getRootDeviceIndex());
ASSERT_NE(nullptr, pSvmAlloc);
auto &pSvmAllocs = svmData->gpuAllocations;
pSourceMultiDeviceKernel->setSvmKernelExecInfo(pSvmAlloc);
pSourceMultiDeviceKernel->setSvmKernelExecInfo(pSvmAllocs);
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
EXPECT_EQ(1u, pSourceKernel[rootDeviceIndex]->kernelSvmGfxAllocations.size());
EXPECT_NE(nullptr, pSourceKernel[rootDeviceIndex]->kernelSvmGfxAllocations.at(0));
EXPECT_EQ(pSvmAllocs.getGraphicsAllocation(rootDeviceIndex), pSourceKernel[rootDeviceIndex]->kernelSvmGfxAllocations.at(0));
}
retVal = pClonedMultiDeviceKernel->cloneKernel(pSourceMultiDeviceKernel.get());
@ -585,6 +593,34 @@ TEST_F(CloneKernelTest, GivenExecInfoWhenCloningKernelThenSvmAllocationIsCorrect
context->getSVMAllocsManager()->freeSVMAlloc(ptrSVM);
}
// Checks that a unified-memory exec-info allocation set on the source multi-device kernel
// lands on each root device's kernel as that device's own allocation, and that
// cloneKernel reproduces the same list on the cloned kernels.
TEST_F(CloneKernelTest, GivenUnifiedMemoryExecInfoWhenCloningKernelThenUnifiedMemoryAllocationIsCorrect) {
    REQUIRE_SVM_OR_SKIP(device1);

    // One SVM allocation spanning every root device of the context.
    void *ptrSVM = context->getSVMAllocsManager()->createSVMAlloc(256, {}, context->getRootDeviceIndices(), context->getDeviceBitfields());
    ASSERT_NE(nullptr, ptrSVM);

    auto svmData = context->getSVMAllocsManager()->getSVMAlloc(ptrSVM);
    ASSERT_NE(nullptr, svmData);
    auto &perDeviceAllocs = svmData->gpuAllocations;

    pSourceMultiDeviceKernel->setUnifiedMemoryExecInfo(perDeviceAllocs);

    // Source side: exactly one entry per kernel, matching the allocation of its root device.
    for (auto &rootIdx : this->context->getRootDeviceIndices()) {
        auto &sourceAllocs = pSourceKernel[rootIdx]->kernelUnifiedMemoryGfxAllocations;
        EXPECT_EQ(1u, sourceAllocs.size());
        EXPECT_NE(nullptr, sourceAllocs.at(0));
        EXPECT_EQ(perDeviceAllocs.getGraphicsAllocation(rootIdx), sourceAllocs.at(0));
    }

    retVal = pClonedMultiDeviceKernel->cloneKernel(pSourceMultiDeviceKernel.get());
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Cloned side: same count and same first allocation as the source kernel.
    for (auto &rootIdx : this->context->getRootDeviceIndices()) {
        auto &sourceAllocs = pSourceKernel[rootIdx]->kernelUnifiedMemoryGfxAllocations;
        auto &clonedAllocs = pClonedKernel[rootIdx]->kernelUnifiedMemoryGfxAllocations;
        EXPECT_EQ(sourceAllocs.size(), clonedAllocs.size());
        EXPECT_EQ(sourceAllocs.at(0), clonedAllocs.at(0));
    }

    context->getSVMAllocsManager()->freeSVMAlloc(ptrSVM);
}
TEST_F(CloneKernelTest, givenBuiltinSourceKernelWhenCloningThenSetBuiltinFlagToClonedKernel) {
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
pSourceKernel[rootDeviceIndex]->isBuiltIn = true;

View File

@ -16,6 +16,7 @@
#include "opencl/source/api/api.h"
#include "opencl/source/mem_obj/mem_obj_helper.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h"
#include "opencl/test/unit_test/mocks/mock_buffer.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
@ -72,6 +73,29 @@ TEST_F(SVMMemoryAllocatorTest, whenRequestSVMAllocsThenReturnNonNullptr) {
EXPECT_NE(svmAllocs, nullptr);
}
using MultiDeviceSVMMemoryAllocatorTest = MultiRootDeviceWithSubDevicesFixture;
// Checks that a single SVM allocation created for all root device indices of the context
// is tracked as one entry in the manager, exposes a non-null SVM_ZERO_COPY, non-coherent
// graphics allocation for each root device index, and is no longer tracked after being freed.
TEST_F(MultiDeviceSVMMemoryAllocatorTest, givenMultipleDevicesWhenCreatingSVMAllocThenCreateOneGraphicsAllocationPerRootDeviceIndex) {
    REQUIRE_SVM_OR_SKIP(device1);

    auto svmManager = std::make_unique<MockSVMAllocsManager>(device1->getMemoryManager(), false);

    auto ptr = svmManager->createSVMAlloc(MemoryConstants::pageSize, {}, context->getRootDeviceIndices(), context->getDeviceBitfields());
    EXPECT_NE(nullptr, ptr);

    auto svmData = svmManager->getSVMAlloc(ptr);
    EXPECT_EQ(1u, svmManager->SVMAllocs.getNumAllocs());
    ASSERT_NE(nullptr, svmData);

    for (auto &rootIdx : context->getRootDeviceIndices()) {
        // Look the allocation data up again for each device, as the check is per root device index.
        auto &allocsForPtr = svmManager->getSVMAlloc(ptr)->gpuAllocations;
        auto allocationForDevice = allocsForPtr.getGraphicsAllocation(rootIdx);
        EXPECT_NE(nullptr, allocationForDevice);
        EXPECT_EQ(GraphicsAllocation::AllocationType::SVM_ZERO_COPY, allocationForDevice->getAllocationType());
        EXPECT_FALSE(allocationForDevice->isCoherent());
    }

    svmManager->freeSVMAlloc(ptr);
    EXPECT_EQ(nullptr, svmManager->getSVMAlloc(ptr));
    EXPECT_EQ(0u, svmManager->SVMAllocs.getNumAllocs());
}
TEST_F(SVMMemoryAllocatorTest, whenSVMAllocationIsFreedThenCannotBeGotAgain) {
auto ptr = svmManager->createSVMAlloc(MemoryConstants::pageSize, {}, rootDeviceIndices, deviceBitfields);
EXPECT_NE(nullptr, ptr);

View File

@ -112,6 +112,9 @@ void *SVMAllocsManager::createSVMAlloc(size_t size, const SvmAllocationPropertie
if (size == 0)
return nullptr;
if (rootDeviceIndices.size() > 1) {
return createZeroCopySvmAllocation(size, svmProperties, rootDeviceIndices, subdeviceBitfields);
}
if (!memoryManager->isLocalMemorySupported(*rootDeviceIndices.begin())) {
return createZeroCopySvmAllocation(size, svmProperties, rootDeviceIndices, subdeviceBitfields);
} else {
@ -349,20 +352,26 @@ void *SVMAllocsManager::createZeroCopySvmAllocation(size_t size, const SvmAlloca
false, // isMultiStorageAllocation
deviceBitfield};
MemoryPropertiesHelper::fillCachePolicyInProperties(properties, false, svmProperties.readOnly, false, properties.cacheRegion);
GraphicsAllocation *allocation = memoryManager->allocateGraphicsMemoryWithProperties(properties);
if (!allocation) {
std::vector<uint32_t> rootDeviceIndicesVector(rootDeviceIndices.begin(), rootDeviceIndices.end());
auto maxRootDeviceIndex = *std::max_element(rootDeviceIndices.begin(), rootDeviceIndices.end(), std::less<uint32_t const>());
SvmAllocationData allocData(maxRootDeviceIndex);
void *usmPtr = memoryManager->createMultiGraphicsAllocationInSystemMemoryPool(rootDeviceIndicesVector, properties, allocData.gpuAllocations);
if (!usmPtr) {
return nullptr;
}
allocation->setMemObjectsAllocationWithWritableFlags(!svmProperties.readOnly && !svmProperties.hostPtrReadOnly);
allocation->setCoherent(svmProperties.coherent);
SvmAllocationData allocData(rootDeviceIndex);
allocData.gpuAllocations.addAllocation(allocation);
for (const auto &rootDeviceIndex : rootDeviceIndices) {
auto allocation = allocData.gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
allocation->setMemObjectsAllocationWithWritableFlags(!svmProperties.readOnly && !svmProperties.hostPtrReadOnly);
allocation->setCoherent(svmProperties.coherent);
}
allocData.size = size;
std::unique_lock<SpinLock> lock(mtx);
this->SVMAllocs.insert(allocData);
return allocation->getUnderlyingBuffer();
return usmPtr;
}
void *SVMAllocsManager::createUnifiedAllocationWithDeviceStorage(size_t size, const SvmAllocationProperties &svmProperties, const UnifiedMemoryProperties &unifiedMemoryProperties) {
@ -415,10 +424,11 @@ void *SVMAllocsManager::createUnifiedAllocationWithDeviceStorage(size_t size, co
}
void SVMAllocsManager::freeZeroCopySvmAllocation(SvmAllocationData *svmData) {
GraphicsAllocation *gpuAllocation = svmData->gpuAllocations.getDefaultGraphicsAllocation();
auto gpuAllocations = svmData->gpuAllocations;
SVMAllocs.remove(*svmData);
memoryManager->freeGraphicsMemory(gpuAllocation);
for (const auto &graphicsAllocation : gpuAllocations.getGraphicsAllocations()) {
memoryManager->freeGraphicsMemory(graphicsAllocation);
}
}
void SVMAllocsManager::freeSvmAllocationWithDeviceStorage(SvmAllocationData *svmData) {