Limit system memory flag in builtin kernels to destination argument

Related-To: NEO-6959

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2022-07-15 19:47:17 +00:00 committed by Compute-Runtime-Automation
parent f89ea1685e
commit 3f8c19eec9
13 changed files with 341 additions and 30 deletions

View File

@ -61,14 +61,16 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
auto middleSizeEls = middleSizeBytes / middleElSize; // num work items in middle walker
uint32_t rootDeviceIndex = clDevice.getRootDeviceIndex();
// Set-up ISA
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover->getKernel(clDevice.getRootDeviceIndex()));
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover->getKernel(rootDeviceIndex));
if (isSrcMisaligned) {
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddleMisaligned->getKernel(clDevice.getRootDeviceIndex()));
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddleMisaligned->getKernel(rootDeviceIndex));
} else {
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle->getKernel(clDevice.getRootDeviceIndex()));
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle->getKernel(rootDeviceIndex));
}
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover->getKernel(clDevice.getRootDeviceIndex()));
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover->getKernel(rootDeviceIndex));
// Set-up common kernel args
if (operationParams.srcSvmAlloc) {
@ -78,13 +80,19 @@ class BuiltInOp<EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBui
} else {
kernelSplit1DBuilder.setArgSvm(0, operationParams.size.x + operationParams.srcOffset.x, operationParams.srcPtr, nullptr, CL_MEM_READ_ONLY);
}
bool isDestinationInSystem = false;
if (operationParams.dstSvmAlloc) {
kernelSplit1DBuilder.setArgSvmAlloc(1, operationParams.dstPtr, operationParams.dstSvmAlloc);
isDestinationInSystem = Kernel::graphicsAllocationTypeUseSystemMemory(operationParams.dstSvmAlloc->getAllocationType());
} else if (operationParams.dstMemObj) {
kernelSplit1DBuilder.setArg(1, operationParams.dstMemObj);
isDestinationInSystem = Kernel::graphicsAllocationTypeUseSystemMemory(operationParams.dstMemObj->getGraphicsAllocation(rootDeviceIndex)->getAllocationType());
} else {
kernelSplit1DBuilder.setArgSvm(1, operationParams.size.x + operationParams.dstOffset.x, operationParams.dstPtr, nullptr, 0u);
isDestinationInSystem = operationParams.dstPtr != nullptr;
}
kernelSplit1DBuilder.setKernelDestinationArgumentInSystem(isDestinationInSystem);
kernelSplit1DBuilder.setUnifiedMemorySyncRequirement(operationParams.unifiedMemoryArgsRequireMemSync);
@ -185,9 +193,11 @@ class BuiltInOp<EBuiltInOps::CopyBufferRect> : public BuiltinDispatchInfoBuilder
}
}
uint32_t rootDeviceIndex = clDevice.getRootDeviceIndex();
// Set-up ISA
int dimensions = is3D ? 3 : 2;
kernelNoSplit3DBuilder.setKernel(kernelBytes[dimensions - 1]->getKernel(clDevice.getRootDeviceIndex()));
kernelNoSplit3DBuilder.setKernel(kernelBytes[dimensions - 1]->getKernel(rootDeviceIndex));
size_t srcOffsetFromAlignedPtr = 0;
size_t dstOffsetFromAlignedPtr = 0;
@ -205,9 +215,11 @@ class BuiltInOp<EBuiltInOps::CopyBufferRect> : public BuiltinDispatchInfoBuilder
kernelNoSplit3DBuilder.setArgSvm(0, hostPtrSize, srcPtrToSet, nullptr, CL_MEM_READ_ONLY);
}
bool isDestinationInSystem = false;
// arg1 = dst
if (operationParams.dstMemObj) {
kernelNoSplit3DBuilder.setArg(1, operationParams.dstMemObj);
isDestinationInSystem = Kernel::graphicsAllocationTypeUseSystemMemory(operationParams.dstMemObj->getGraphicsAllocation(rootDeviceIndex)->getAllocationType());
} else {
void *dstPtrToSet = operationParams.dstPtr;
if (!is3D) {
@ -216,7 +228,9 @@ class BuiltInOp<EBuiltInOps::CopyBufferRect> : public BuiltinDispatchInfoBuilder
dstOffsetFromAlignedPtr = ptrDiff(dstPtr, dstPtrToSet);
}
kernelNoSplit3DBuilder.setArgSvm(1, hostPtrSize, dstPtrToSet, nullptr, 0u);
isDestinationInSystem = operationParams.dstPtr != nullptr;
}
kernelNoSplit3DBuilder.setKernelDestinationArgumentInSystem(isDestinationInSystem);
// arg2 = srcOrigin
OffsetType kSrcOrigin[4] = {static_cast<OffsetType>(operationParams.srcOffset.x + srcOffsetFromAlignedPtr), static_cast<OffsetType>(operationParams.srcOffset.y), static_cast<OffsetType>(operationParams.srcOffset.z), 0};
@ -302,20 +316,26 @@ class BuiltInOp<EBuiltInOps::FillBuffer> : public BuiltinDispatchInfoBuilder {
auto middleSizeEls = middleSizeBytes / middleElSize; // num work items in middle walker
uint32_t rootDeviceIndex = clDevice.getRootDeviceIndex();
// Set-up ISA
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover->getKernel(clDevice.getRootDeviceIndex()));
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle->getKernel(clDevice.getRootDeviceIndex()));
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover->getKernel(clDevice.getRootDeviceIndex()));
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover->getKernel(rootDeviceIndex));
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle->getKernel(rootDeviceIndex));
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover->getKernel(rootDeviceIndex));
DEBUG_BREAK_IF((operationParams.srcMemObj == nullptr) || (operationParams.srcOffset != 0));
DEBUG_BREAK_IF((operationParams.dstMemObj == nullptr) && (operationParams.dstSvmAlloc == nullptr));
bool isDestinationInSystem = false;
// Set-up dstMemObj with buffer
if (operationParams.dstSvmAlloc) {
kernelSplit1DBuilder.setArgSvmAlloc(0, operationParams.dstPtr, operationParams.dstSvmAlloc);
isDestinationInSystem = Kernel::graphicsAllocationTypeUseSystemMemory(operationParams.dstSvmAlloc->getAllocationType());
} else {
kernelSplit1DBuilder.setArg(0, operationParams.dstMemObj);
isDestinationInSystem = Kernel::graphicsAllocationTypeUseSystemMemory(operationParams.dstMemObj->getGraphicsAllocation(rootDeviceIndex)->getAllocationType());
}
kernelSplit1DBuilder.setKernelDestinationArgumentInSystem(isDestinationInSystem);
// Set-up dstOffset
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Left, 1, static_cast<OffsetType>(operationParams.dstOffset.x));
@ -545,20 +565,26 @@ class BuiltInOp<EBuiltInOps::CopyImage3dToBuffer> : public BuiltinDispatchInfoBu
size_t hostPtrSize = operationParams.dstPtr ? Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, srcImage->getImageDesc().image_type) : 0;
hostPtrSize += operationParams.dstOffset.x;
uint32_t rootDeviceIndex = clDevice.getRootDeviceIndex();
// Set-up ISA
auto bytesExponent = Math::log2(bytesPerPixel);
DEBUG_BREAK_IF(bytesExponent >= 5);
kernelNoSplit3DBuilder.setKernel(kernelBytes[bytesExponent]->getKernel(clDevice.getRootDeviceIndex()));
kernelNoSplit3DBuilder.setKernel(kernelBytes[bytesExponent]->getKernel(rootDeviceIndex));
// Set-up source image
kernelNoSplit3DBuilder.setArg(0, srcImageRedescribed, operationParams.srcMipLevel);
bool isDestinationInSystem = false;
// Set-up destination host ptr / buffer
if (operationParams.dstPtr) {
kernelNoSplit3DBuilder.setArgSvm(1, hostPtrSize, operationParams.dstPtr, nullptr, 0u);
isDestinationInSystem = operationParams.dstPtr != nullptr;
} else {
kernelNoSplit3DBuilder.setArg(1, operationParams.dstMemObj);
isDestinationInSystem = Kernel::graphicsAllocationTypeUseSystemMemory(operationParams.dstMemObj->getGraphicsAllocation(rootDeviceIndex)->getAllocationType());
}
kernelNoSplit3DBuilder.setKernelDestinationArgumentInSystem(isDestinationInSystem);
// Set-up srcOrigin
{

View File

@ -109,8 +109,13 @@ inline void HardwareInterface<GfxFamily>::programWalker(
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);
bool requiredSystemFence = kernel.isAnyKernelArgumentUsingSystemMemory() && walkerArgs.event != nullptr;
bool kernelSystemAllocation = false;
if (kernel.isBuiltIn) {
kernelSystemAllocation = kernel.getDestinationAllocationInSystemMemory();
} else {
kernelSystemAllocation = kernel.isAnyKernelArgumentUsingSystemMemory();
}
bool requiredSystemFence = kernelSystemAllocation && walkerArgs.event != nullptr;
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), requiredSystemFence};
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(hwInfo, walkerCmd, encodeWalkerArgs);

View File

@ -293,6 +293,12 @@ class DispatchInfoBuilder {
}
}
void setKernelDestinationArgumentInSystem(bool value) {
for (auto &dispatchInfo : dispatchInfos) {
dispatchInfo.getKernel()->setDestinationAllocationInSystemMemory(value);
}
}
DispatchInfo &getDispatchInfo(size_t index) { return dispatchInfos[index]; }
static constexpr size_t getMaxNumDispatches() { return numDispatches; }

View File

@ -881,7 +881,7 @@ cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, G
patchedArgumentsNum++;
kernelArguments[argIndex].isPatched = true;
}
if (svmPtr != nullptr) {
if (svmPtr != nullptr && isBuiltIn == false) {
this->anyKernelArgumentUsingSystemMemory |= true;
}
addAllocationToCacheFlushVector(argIndex, svmAlloc);
@ -940,9 +940,9 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
patchedArgumentsNum++;
kernelArgInfo.isPatched = true;
}
if (!kernelArgInfo.isSetToNullptr) {
if (!kernelArgInfo.isSetToNullptr && isBuiltIn == false) {
if (svmAlloc != nullptr) {
this->anyKernelArgumentUsingSystemMemory |= graphicsAllocationTypeUseSystemMemory(svmAlloc->getAllocationType());
this->anyKernelArgumentUsingSystemMemory |= Kernel::graphicsAllocationTypeUseSystemMemory(svmAlloc->getAllocationType());
} else {
this->anyKernelArgumentUsingSystemMemory |= true;
}
@ -1406,7 +1406,9 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
}
auto gfxAllocationType = buffer->getGraphicsAllocation(rootDeviceIndex)->getAllocationType();
this->anyKernelArgumentUsingSystemMemory |= graphicsAllocationTypeUseSystemMemory(gfxAllocationType);
if (!isBuiltIn) {
this->anyKernelArgumentUsingSystemMemory |= Kernel::graphicsAllocationTypeUseSystemMemory(gfxAllocationType);
}
if (buffer->peekSharingHandler()) {
usingSharedObjArgs = true;

View File

@ -409,6 +409,14 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
return anyKernelArgumentUsingSystemMemory;
}
static bool graphicsAllocationTypeUseSystemMemory(AllocationType type);
void setDestinationAllocationInSystemMemory(bool value) {
isDestinationAllocationInSystemMemory = value;
}
bool getDestinationAllocationInSystemMemory() const {
return isDestinationAllocationInSystemMemory;
}
protected:
struct KernelConfig {
Vec3<size_t> gws;
@ -473,7 +481,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
bool hasTunningFinished(KernelSubmissionData &submissionData);
bool hasRunFinished(TimestampPacketContainer *timestampContainer);
bool graphicsAllocationTypeUseSystemMemory(AllocationType type);
UnifiedMemoryControls unifiedMemoryControls{};
@ -537,6 +544,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
bool singleSubdevicePreferredInCurrentEnqueue = false;
bool kernelHasIndirectAccess = true;
bool anyKernelArgumentUsingSystemMemory = false;
bool isDestinationAllocationInSystemMemory = false;
};
} // namespace NEO

View File

@ -271,7 +271,7 @@ TEST_F(BuiltInTests, WhenBuildingListOfBuiltinsThenBuiltinsHaveBeenGenerated) {
}
}
TEST_F(BuiltInTests, GivenCopyBufferToBufferWhenDispatchInfoIsCreatedThenParamsAreCorrect) {
TEST_F(BuiltInTests, GivenCopyBufferToSystemMemoryBufferWhenDispatchInfoIsCreatedThenParamsAreCorrect) {
BuiltinDispatchInfoBuilder &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer, *pClDevice);
MockBuffer *srcPtr = new MockBuffer();
@ -280,6 +280,9 @@ TEST_F(BuiltInTests, GivenCopyBufferToBufferWhenDispatchInfoIsCreatedThenParamsA
MockBuffer &src = *srcPtr;
MockBuffer &dst = *dstPtr;
srcPtr->mockGfxAllocation.setAllocationType(AllocationType::BUFFER);
dstPtr->mockGfxAllocation.setAllocationType(AllocationType::BUFFER_HOST_MEMORY);
BuiltinOpParams builtinOpsParams;
builtinOpsParams.srcMemObj = &src;
@ -328,12 +331,44 @@ TEST_F(BuiltInTests, GivenCopyBufferToBufferWhenDispatchInfoIsCreatedThenParamsA
EXPECT_EQ(Vec3<size_t>(rightSize, 1, 1), dispatchInfo.getGWS());
}
i++;
EXPECT_TRUE(dispatchInfo.getKernel()->getDestinationAllocationInSystemMemory());
}
EXPECT_TRUE(compareBuiltinOpParams(multiDispatchInfo.peekBuiltinOpParams(), builtinOpsParams));
delete srcPtr;
delete dstPtr;
}
// Checks that the CopyBufferToBuffer builder leaves the kernel's
// destination-in-system-memory flag cleared when the destination buffer uses
// AllocationType::BUFFER, even though the source uses BUFFER_HOST_MEMORY.
TEST_F(BuiltInTests, GivenCopyBufferToLocalMemoryBufferWhenDispatchInfoIsCreatedThenParamsAreCorrect) {
    BuiltinDispatchInfoBuilder &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer, *pClDevice);

    // Owning pointers instead of new/delete: ASSERT_TRUE below returns from the
    // test body on failure, which would skip manual deletes and leak both buffers.
    auto srcPtr = std::make_unique<MockBuffer>();
    auto dstPtr = std::make_unique<MockBuffer>();

    MockBuffer &src = *srcPtr;
    MockBuffer &dst = *dstPtr;

    // Only the destination allocation type is expected to drive the flag.
    srcPtr->mockGfxAllocation.setAllocationType(AllocationType::BUFFER_HOST_MEMORY);
    dstPtr->mockGfxAllocation.setAllocationType(AllocationType::BUFFER);

    BuiltinOpParams builtinOpsParams;
    builtinOpsParams.srcMemObj = &src;
    builtinOpsParams.dstMemObj = &dst;
    builtinOpsParams.srcPtr = src.getCpuAddress();
    builtinOpsParams.dstPtr = dst.getCpuAddress();
    builtinOpsParams.size = {dst.getSize(), 0, 0};

    MultiDispatchInfo multiDispatchInfo(builtinOpsParams);
    ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));

    for (auto &dispatchInfo : multiDispatchInfo) {
        EXPECT_FALSE(dispatchInfo.getKernel()->getDestinationAllocationInSystemMemory());
    }
}
HWTEST2_P(AuxBuiltInTests, givenInputBufferWhenBuildingNonAuxDispatchInfoForAuxTranslationThenPickAndSetupCorrectKernels, AuxBuiltinsMatcher) {
BuiltinDispatchInfoBuilder &baseBuilder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, *pClDevice);
auto &builder = static_cast<BuiltInOp<EBuiltInOps::AuxTranslation> &>(baseBuilder);
@ -857,8 +892,7 @@ TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToBufferStateless
EXPECT_TRUE(compareBuiltinOpParams(multiDispatchInfo.peekBuiltinOpParams(), builtinOpsParams));
}
TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToBufferRectStatelessIsUsedThenParamsAreCorrect) {
TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToSystemBufferRectStatelessIsUsedThenParamsAreCorrect) {
if (is32bit) {
GTEST_SKIP();
}
@ -874,6 +908,9 @@ TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToBufferRectState
MockBuffer dstBuffer;
dstBuffer.size = static_cast<size_t>(bigSize);
srcBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER);
dstBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER_HOST_MEMORY);
BuiltinOpParams dc;
dc.srcMemObj = &srcBuffer;
dc.dstMemObj = &dstBuffer;
@ -889,10 +926,53 @@ TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToBufferRectState
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
EXPECT_TRUE(compareBuiltinOpParams(multiDispatchInfo.peekBuiltinOpParams(), dc));
for (auto &dispatchInfo : multiDispatchInfo) {
EXPECT_TRUE(dispatchInfo.getKernel()->getDestinationAllocationInSystemMemory());
}
}
TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderFillBufferStatelessIsUsedThenParamsAreCorrect) {
// Builds CopyBufferRectStateless dispatch infos for 10 GB buffers with a 4 GB source
// offset, where the destination allocation type is BUFFER and the source is
// BUFFER_HOST_MEMORY, and expects the destination-in-system-memory flag to be false.
TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToLocalBufferRectStatelessIsUsedThenParamsAreCorrect) {
    if (is32bit) {
        GTEST_SKIP();
    }

    auto &rectBuilder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferRectStateless, *pClDevice);

    const uint64_t bufferBytes = 10ull * MemoryConstants::gigaByte;
    const uint64_t sourceOffsetBytes = 4ull * MemoryConstants::gigaByte;
    const uint64_t copyBytes = 4ull * MemoryConstants::gigaByte;

    MockBuffer srcBuffer;
    srcBuffer.size = static_cast<size_t>(bufferBytes);
    MockBuffer dstBuffer;
    dstBuffer.size = static_cast<size_t>(bufferBytes);

    srcBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER_HOST_MEMORY);
    dstBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER);

    // Row pitch equals the copied width; slice pitch is unused (zero).
    const auto rowPitch = static_cast<size_t>(copyBytes);

    BuiltinOpParams opParams;
    opParams.srcMemObj = &srcBuffer;
    opParams.dstMemObj = &dstBuffer;
    opParams.srcOffset = {static_cast<size_t>(sourceOffsetBytes), 0, 0};
    opParams.dstOffset = {0, 0, 0};
    opParams.size = {static_cast<size_t>(copyBytes), 1, 1};
    opParams.srcRowPitch = rowPitch;
    opParams.srcSlicePitch = 0;
    opParams.dstRowPitch = rowPitch;
    opParams.dstSlicePitch = 0;

    MultiDispatchInfo dispatches(opParams);
    ASSERT_TRUE(rectBuilder.buildDispatchInfos(dispatches));
    EXPECT_EQ(1u, dispatches.size());
    EXPECT_TRUE(compareBuiltinOpParams(dispatches.peekBuiltinOpParams(), opParams));

    for (auto &info : dispatches) {
        EXPECT_FALSE(info.getKernel()->getDestinationAllocationInSystemMemory());
    }
}
TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderFillSystemBufferStatelessIsUsedThenParamsAreCorrect) {
if (is32bit) {
GTEST_SKIP();
}
@ -908,6 +988,9 @@ TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderFillBufferStatelessIsUsedTh
MockBuffer dstBuffer;
dstBuffer.size = static_cast<size_t>(bigSize);
srcBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER);
dstBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER_HOST_MEMORY);
BuiltinOpParams dc;
dc.srcMemObj = &srcBuffer;
dc.dstMemObj = &dstBuffer;
@ -918,6 +1001,45 @@ TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderFillBufferStatelessIsUsedTh
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
EXPECT_TRUE(compareBuiltinOpParams(multiDispatchInfo.peekBuiltinOpParams(), dc));
for (auto &dispatchInfo : multiDispatchInfo) {
EXPECT_TRUE(dispatchInfo.getKernel()->getDestinationAllocationInSystemMemory());
}
}
// Builds FillBufferStateless dispatch infos for 10 GB buffers with a 4 GB destination
// offset, where the destination allocation type is BUFFER and the source is
// BUFFER_HOST_MEMORY, and expects the destination-in-system-memory flag to be false.
TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderFillLocalBufferStatelessIsUsedThenParamsAreCorrect) {
    if (is32bit) {
        GTEST_SKIP();
    }

    auto &fillBuilder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::FillBufferStateless, *pClDevice);

    const uint64_t bufferBytes = 10ull * MemoryConstants::gigaByte;
    const uint64_t destinationOffsetBytes = 4ull * MemoryConstants::gigaByte;
    const uint64_t fillBytes = 4ull * MemoryConstants::gigaByte;

    MockBuffer srcBuffer;
    srcBuffer.size = static_cast<size_t>(bufferBytes);
    MockBuffer dstBuffer;
    dstBuffer.size = static_cast<size_t>(bufferBytes);

    srcBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER_HOST_MEMORY);
    dstBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER);

    BuiltinOpParams opParams;
    opParams.srcMemObj = &srcBuffer;
    opParams.dstMemObj = &dstBuffer;
    opParams.dstOffset = {static_cast<size_t>(destinationOffsetBytes), 0, 0};
    opParams.size = {static_cast<size_t>(fillBytes), 0, 0};

    MultiDispatchInfo dispatches(opParams);
    ASSERT_TRUE(fillBuilder.buildDispatchInfos(dispatches));
    EXPECT_EQ(1u, dispatches.size());
    EXPECT_TRUE(compareBuiltinOpParams(dispatches.peekBuiltinOpParams(), opParams));

    for (auto &info : dispatches) {
        EXPECT_FALSE(info.getKernel()->getDestinationAllocationInSystemMemory());
    }
}
HWTEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToImageStatelessIsUsedThenParamsAreCorrect) {
@ -954,8 +1076,7 @@ HWTEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToImageStateles
EXPECT_FALSE(kernel->getKernelInfo().getArgDescriptorAt(0).as<ArgDescPointer>().isPureStateful());
}
HWTEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyImageToBufferStatelessIsUsedThenParamsAreCorrect) {
HWTEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyImageToSystemBufferStatelessIsUsedThenParamsAreCorrect) {
if (is32bit) {
GTEST_SKIP();
}
@ -965,6 +1086,7 @@ HWTEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyImageToBufferStateles
MockBuffer dstBuffer;
dstBuffer.size = static_cast<size_t>(bigSize);
dstBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER_HOST_MEMORY);
std ::unique_ptr<Image> pSrcImage(Image2dHelper<>::create(pContext));
ASSERT_NE(nullptr, pSrcImage.get());
@ -985,6 +1107,48 @@ HWTEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyImageToBufferStateles
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelAttributes.supportsBuffersBiggerThan4Gb());
for (auto &dispatchInfo : multiDispatchInfo) {
EXPECT_TRUE(dispatchInfo.getKernel()->getDestinationAllocationInSystemMemory());
}
}
// Builds CopyImage3dToBufferStateless dispatch infos that copy a 2D image into a
// 10 GB destination buffer (allocation type BUFFER) at a 4 GB offset. Expects a kernel
// supporting buffers bigger than 4 GB and a cleared destination-in-system-memory flag.
HWTEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyImageToLocalBufferStatelessIsUsedThenParamsAreCorrect) {
    if (is32bit) {
        GTEST_SKIP();
    }

    const uint64_t bufferBytes = 10ull * MemoryConstants::gigaByte;
    const uint64_t destinationOffsetBytes = 4ull * MemoryConstants::gigaByte;

    MockBuffer dstBuffer;
    dstBuffer.size = static_cast<size_t>(bufferBytes);
    dstBuffer.mockGfxAllocation.setAllocationType(AllocationType::BUFFER);

    std::unique_ptr<Image> sourceImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, sourceImage.get());

    auto &imageBuilder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImage3dToBufferStateless, *pClDevice);

    BuiltinOpParams opParams;
    opParams.srcMemObj = sourceImage.get();
    opParams.dstMemObj = &dstBuffer;
    opParams.srcOffset = {0, 0, 0};
    opParams.dstOffset = {static_cast<size_t>(destinationOffsetBytes), 0, 0};
    opParams.size = {1, 1, 1};

    MultiDispatchInfo dispatches(opParams);
    ASSERT_TRUE(imageBuilder.buildDispatchInfos(dispatches));
    EXPECT_EQ(1u, dispatches.size());
    EXPECT_TRUE(compareBuiltinOpParams(dispatches.peekBuiltinOpParams(), opParams));

    auto firstKernel = dispatches.begin()->getKernel();
    ASSERT_NE(nullptr, firstKernel);
    EXPECT_TRUE(firstKernel->getKernelInfo().kernelDescriptor.kernelAttributes.supportsBuffersBiggerThan4Gb());

    for (auto &info : dispatches) {
        EXPECT_FALSE(info.getKernel()->getDestinationAllocationInSystemMemory());
    }
}
TEST_F(BuiltInTests, GivenUnalignedCopyBufferToBufferWhenDispatchInfoIsCreatedThenParamsAreCorrect) {
@ -1048,6 +1212,10 @@ TEST_F(BuiltInTests, GivenReadBufferAlignedWhenDispatchInfoIsCreatedThenParamsAr
size_t middleSize = size / middleElSize;
EXPECT_EQ(Vec3<size_t>(middleSize, 1, 1), dispatchInfo->getGWS());
EXPECT_TRUE(compareBuiltinOpParams(multiDispatchInfo.peekBuiltinOpParams(), builtinOpsParams));
for (auto &dispatchInfo : multiDispatchInfo) {
EXPECT_TRUE(dispatchInfo.getKernel()->getDestinationAllocationInSystemMemory());
}
alignedFree(dstPtr);
}

View File

@ -106,7 +106,18 @@ struct CommandQueueStateless : public CommandQueueHw<FamilyType> {
if (kernel->getKernelInfo().getArgDescriptorAt(0).is<ArgDescriptor::ArgTPointer>()) {
EXPECT_FALSE(kernel->getKernelInfo().getArgDescriptorAt(0).as<ArgDescPointer>().isPureStateful());
}
if (validateKernelSystemMemory) {
if (expectedKernelSystemMemory) {
EXPECT_TRUE(kernel->getDestinationAllocationInSystemMemory());
} else {
EXPECT_FALSE(kernel->getDestinationAllocationInSystemMemory());
}
}
}
bool validateKernelSystemMemory = false;
bool expectedKernelSystemMemory = false;
};
template <typename FamilyType>
@ -120,7 +131,18 @@ struct CommandQueueStateful : public CommandQueueHw<FamilyType> {
if (HwHelperHw<FamilyType>::get().isStatelessToStatefulWithOffsetSupported()) {
EXPECT_TRUE(kernel->allBufferArgsStateful);
}
if (validateKernelSystemMemory) {
if (expectedKernelSystemMemory) {
EXPECT_TRUE(kernel->getDestinationAllocationInSystemMemory());
} else {
EXPECT_FALSE(kernel->getDestinationAllocationInSystemMemory());
}
}
}
bool validateKernelSystemMemory = false;
bool expectedKernelSystemMemory = false;
};
} // namespace NEO

View File

@ -542,7 +542,7 @@ HWTEST_F(EnqueueSvmMemCopyTest, givenEnqueueSvmMemcpyWhenSvmZeroCopyThenBuiltinK
EXPECT_EQ(Vec3<size_t>(256 / middleElSize, 1, 1), di->getGWS());
auto kernel = mdi->begin()->getKernel();
EXPECT_TRUE(kernel->isAnyKernelArgumentUsingSystemMemory());
EXPECT_TRUE(kernel->getDestinationAllocationInSystemMemory());
}
HWTEST_F(EnqueueSvmMemCopyTest, givenEnqueueSvmMemcpyWhenSvmGpuThenBuiltinKernelNotUsesSystemMemory) {
@ -616,5 +616,5 @@ HWTEST_F(EnqueueSvmMemCopyTest, givenEnqueueSvmMemcpyWhenSvmGpuThenBuiltinKernel
EXPECT_EQ(Vec3<size_t>(256 / middleElSize, 1, 1), di->getGWS());
auto kernel = mdi->begin()->getKernel();
EXPECT_FALSE(kernel->isAnyKernelArgumentUsingSystemMemory());
EXPECT_FALSE(kernel->getDestinationAllocationInSystemMemory());
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -179,10 +179,14 @@ struct EnqueueSvmMemFillHw : public ::testing::Test {
using EnqueueSvmMemFillHwTest = EnqueueSvmMemFillHw;
HWTEST_F(EnqueueSvmMemFillHwTest, givenEnqueueSVMMemFillWhenUsingCopyBufferToBufferStatelessBuilderThenSuccessIsReturned) {
HWTEST_F(EnqueueSvmMemFillHwTest, givenEnqueueSVMMemFillWhenUsingCopyBufferToSystemBufferStatelessBuilderThenSuccessIsReturned) {
auto cmdQ = std::make_unique<CommandQueueStateless<FamilyType>>(context.get(), device.get());
auto svmData = context->getSVMAllocsManager()->getSVMAlloc(svmPtr);
svmData->size = static_cast<size_t>(bigSize);
svmData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex())->setAllocationType(AllocationType::SVM_ZERO_COPY);
cmdQ->validateKernelSystemMemory = true;
cmdQ->expectedKernelSystemMemory = true;
auto retVal = cmdQ->enqueueSVMMemFill(
svmPtr, // void *svm_ptr
@ -196,8 +200,14 @@ HWTEST_F(EnqueueSvmMemFillHwTest, givenEnqueueSVMMemFillWhenUsingCopyBufferToBuf
EXPECT_EQ(CL_SUCCESS, retVal);
}
HWTEST_F(EnqueueSvmMemFillHwTest, givenEnqueueSVMMemFillWhenUsingCopyBufferToBufferStatefulBuilderThenSuccessIsReturned) {
HWTEST_F(EnqueueSvmMemFillHwTest, givenEnqueueSVMMemFillWhenUsingCopyBufferToLocalBufferStatefulBuilderThenSuccessIsReturned) {
auto cmdQ = std::make_unique<CommandQueueStateful<FamilyType>>(context.get(), device.get());
auto svmData = context->getSVMAllocsManager()->getSVMAlloc(svmPtr);
svmData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex())->setAllocationType(AllocationType::SVM_GPU);
cmdQ->validateKernelSystemMemory = true;
cmdQ->expectedKernelSystemMemory = false;
auto retVal = cmdQ->enqueueSVMMemFill(
svmPtr, // void *svm_ptr
pattern, // const void *pattern

View File

@ -93,6 +93,9 @@ struct EnqueueThreadingFixture : public ClDeviceFixture {
return new MyCommandQueue<FamilyType>(context, device, properties);
}
bool validateKernelSystemMemory = false;
bool expectedKernelSystemMemory = false;
protected:
~MyCommandQueue() override {
if (kernel) {
@ -103,6 +106,14 @@ struct EnqueueThreadingFixture : public ClDeviceFixture {
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
EXPECT_TRUE(kernel.getMultiDeviceKernel()->hasOwnership());
if (validateKernelSystemMemory) {
if (expectedKernelSystemMemory) {
EXPECT_TRUE(kernel.getDestinationAllocationInSystemMemory());
} else {
EXPECT_FALSE(kernel.getDestinationAllocationInSystemMemory());
}
}
}
}
@ -349,6 +360,8 @@ HWTEST_F(EnqueueThreadingImage, WhenEnqueuingFillImageThenKernelHasOwnership) {
HWTEST_F(EnqueueThreading, WhenEnqueuingReadBufferRectThenKernelHasOwnership) {
createCQ<FamilyType>();
cl_int retVal;
static_cast<MyCommandQueue<FamilyType> *>(pCmdQ)->validateKernelSystemMemory = true;
static_cast<MyCommandQueue<FamilyType> *>(pCmdQ)->expectedKernelSystemMemory = true;
std::unique_ptr<Buffer> buffer(Buffer::create(context, CL_MEM_READ_WRITE, 1024u, nullptr, retVal));
ASSERT_NE(nullptr, buffer.get());
@ -368,6 +381,8 @@ HWTEST_F(EnqueueThreading, WhenEnqueuingReadBufferRectThenKernelHasOwnership) {
HWTEST_F(EnqueueThreadingImage, WhenEnqueuingReadImageThenKernelHasOwnership) {
createCQ<FamilyType>();
cl_int retVal;
static_cast<MyCommandQueue<FamilyType> *>(pCmdQ)->validateKernelSystemMemory = true;
static_cast<MyCommandQueue<FamilyType> *>(pCmdQ)->expectedKernelSystemMemory = true;
cl_image_format imageFormat;
imageFormat.image_channel_data_type = CL_UNORM_INT8;

View File

@ -3269,7 +3269,7 @@ TEST_F(KernelTests, GivenCorrectAllocationTypeThenFunctionCheckingSystemMemoryRe
allocationTypeIndex < static_cast<uint32_t>(NEO::AllocationType::COUNT);
allocationTypeIndex++) {
auto currentAllocationType = static_cast<NEO::AllocationType>(allocationTypeIndex);
bool ret = kernel->graphicsAllocationTypeUseSystemMemory(currentAllocationType);
bool ret = Kernel::graphicsAllocationTypeUseSystemMemory(currentAllocationType);
if (std::find(systemMemoryAllocationType.begin(),
systemMemoryAllocationType.end(),
currentAllocationType) != systemMemoryAllocationType.end()) {

View File

@ -469,3 +469,51 @@ XE_HPC_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTest,
auto event = castToObject<Event>(kernelEvent);
event->release();
}
// Enqueues a kernel that is flagged as built-in and whose destination allocation is
// marked as system memory, passing an output event. Then parses the command queue and
// checks for three commands: STATE_SYSTEM_MEM_FENCE_ADDRESS pointing at the command
// stream receiver's global fence allocation, a COMPUTE_WALKER whose post-sync data
// requests a system memory fence, and an MI_MEM_FENCE of release type.
XE_HPC_CORETEST_F(SystemMemoryFenceInDefaultConfigurationTest,
givenEventProvidedWhenEnqueueBuiltinKernelUsingSystemMemoryInDestinationArgumentThenPostSyncFenceRequestDispatched) {
using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE;
// Override the default platform revision id with 0x3 before the device is created.
// NOTE(review): VariableBackup presumably restores the old value on scope exit - confirm.
VariableBackup<unsigned short> revisionId(&defaultHwInfo->platform.usRevId);
constexpr unsigned short revision = 0x3;
revisionId = revision;
UltClDeviceFactory ultClDeviceFactory{1, 0};
auto &clDevice = *ultClDeviceFactory.rootDevices[0];
MockKernelWithInternals kernel(clDevice);
MockContext context(&clDevice);
MockCommandQueueHw<FamilyType> commandQueue(&context, &clDevice, nullptr);
auto &commandStreamReceiver = clDevice.getUltCommandStreamReceiver<FamilyType>();
size_t globalWorkSize[3] = {1, 1, 1};
cl_event kernelEvent{};
// Mark the mock kernel as a built-in whose destination allocation is in system memory.
kernel.mockKernel->isBuiltIn = true;
kernel.mockKernel->setDestinationAllocationInSystemMemory(true);
// Enqueue with an output event (&kernelEvent); the return code is not checked here.
commandQueue.enqueueKernel(kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, &kernelEvent);
ClHardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandQueue);
// The fence address command must point at the CSR's global fence allocation.
auto itorSystemMemFenceAddress = find<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorSystemMemFenceAddress);
auto systemMemFenceAddressCmd = genCmdCast<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(*itorSystemMemFenceAddress);
EXPECT_EQ(commandStreamReceiver.globalFenceAllocation->getGpuAddress(), systemMemFenceAddressCmd->getSystemMemoryFenceAddress());
// The walker's post-sync data must request a system memory fence.
auto itorComputeWalker = find<COMPUTE_WALKER *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorComputeWalker);
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*itorComputeWalker);
auto &postSyncData = walkerCmd->getPostSync();
EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest());
// An MI_MEM_FENCE of release type must be present in the command list.
auto itorMiMemFence = find<MI_MEM_FENCE *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
ASSERT_NE(hwParser.cmdList.end(), itorMiMemFence);
auto fenceCmd = genCmdCast<MI_MEM_FENCE *>(*itorMiMemFence);
ASSERT_NE(nullptr, fenceCmd);
EXPECT_EQ(MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE, fenceCmd->getFenceType());
// Release the event object obtained from the enqueue.
auto event = castToObject<Event>(kernelEvent);
event->release();
}

View File

@ -15,7 +15,8 @@ HWTEST_EXCLUDE_PRODUCT(QueueFamilyNameTest, givenRcsWhenGettingQueueFamilyNameTh
HWTEST_EXCLUDE_PRODUCT(PipeControlHelperTests, givenHwHelperwhenAskingForDcFlushThenReturnTrue, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(EnqueueCopyBufferToImageStatelessTest, givenBigBufferWhenCopyingBufferToImageStatelessThenSuccessIsReturned, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(EnqueueCopyImageToBufferHwStatelessTest, givenBigBufferWhenCopyingImageToBufferStatelessThenSuccessIsReturned, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyImageToBufferStatelessIsUsedThenParamsAreCorrect, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyImageToSystemBufferStatelessIsUsedThenParamsAreCorrect, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyImageToLocalBufferStatelessIsUsedThenParamsAreCorrect, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(ClDeviceHelperTests, givenDeviceWithoutClosBasedCacheReservationSupportWhenQueryingNumCacheClosDeviceInfoThenReturnZeroCacheClosRegions, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(HwHelperTest, whenGettingNumberOfCacheRegionsThenReturnZero, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(LocalWorkSizeTest, givenDispatchInfoWhenWorkSizeInfoIsCreatedThenTestEuFusionFtr, IGFX_XE_HPC_CORE);
@ -41,7 +42,7 @@ HWTEST_EXCLUDE_PRODUCT(BuiltInSharedTest, GivenBuiltinTypeBinaryWhenGettingBuilt
HWTEST_EXCLUDE_PRODUCT(EnqueueReadBufferRectStatefulTest, WhenReadingBufferRectStatefulThenSuccessIsReturned, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(EnqueueCopyBufferRectStateful, GivenValidParametersWhenCopyingBufferRectStatefulThenSuccessIsReturned, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(EnqueueSvmMemCopyHwTest, givenEnqueueSVMMemCopyWhenUsingCopyBufferToBufferStatefulBuilderThenSuccessIsReturned, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(EnqueueSvmMemFillHwTest, givenEnqueueSVMMemFillWhenUsingCopyBufferToBufferStatefulBuilderThenSuccessIsReturned, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(EnqueueSvmMemFillHwTest, givenEnqueueSVMMemFillWhenUsingCopyBufferToLocalBufferStatefulBuilderThenSuccessIsReturned, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(EnqueueFillBufferStatefulTest, givenBuffersWhenFillingBufferStatefulThenSuccessIsReturned, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(EnqueueCopyBufferStatefulTest, givenBuffersWhenCopyingBufferStatefulThenSuccessIsReturned, IGFX_XE_HPC_CORE);
HWTEST_EXCLUDE_PRODUCT(EnqueueWriteBufferStatefulTest, WhenWritingBufferStatefulThenSuccessIsReturned, IGFX_XE_HPC_CORE);