feature: optimize img2buf and buf2img builtins

Related-To: NEO-16652

Signed-off-by: Narendra Bagria <narendra.bagria@intel.com>
This commit is contained in:
Narendra Bagria
2025-11-14 12:10:22 +00:00
committed by Compute-Runtime-Automation
parent 4d4da44aff
commit f460980d14
17 changed files with 1678 additions and 172 deletions

View File

@@ -56,8 +56,11 @@ enum class Builtin : uint32_t {
enum class ImageBuiltin : uint32_t {
copyBufferToImage3d16Bytes = 0u,
copyBufferToImage3d16BytesAligned,
copyBufferToImage3d16BytesStateless,
copyBufferToImage3d16BytesAlignedStateless,
copyBufferToImage3d16BytesHeapless,
copyBufferToImage3d16BytesAlignedHeapless,
copyBufferToImage3d2Bytes,
copyBufferToImage3d2BytesStateless,
copyBufferToImage3d2BytesHeapless,
@@ -77,8 +80,11 @@ enum class ImageBuiltin : uint32_t {
copyBufferToImage3dBytesStateless,
copyBufferToImage3dBytesHeapless,
copyImage3dToBuffer16Bytes,
copyImage3dToBuffer16BytesAligned,
copyImage3dToBuffer16BytesStateless,
copyImage3dToBuffer16BytesAlignedStateless,
copyImage3dToBuffer16BytesHeapless,
copyImage3dToBuffer16BytesAlignedHeapless,
copyImage3dToBuffer2Bytes,
copyImage3dToBuffer2BytesStateless,
copyImage3dToBuffer2BytesHeapless,
@@ -252,6 +258,7 @@ constexpr ImageBuiltin adjustImageBuiltinType(const bool isStateless, const bool
}
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d16Bytes);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d16BytesAligned);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d2Bytes);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d4Bytes);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d3To4Bytes);
@@ -259,6 +266,7 @@ DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d8Bytes);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d6To8Bytes);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3dBytes);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer16Bytes);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer16BytesAligned);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer2Bytes);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer3Bytes);
DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer4Bytes);

View File

@@ -177,14 +177,26 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtinName = "CopyBufferToImage3d16Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
break;
case ImageBuiltin::copyBufferToImage3d16BytesAligned:
builtinName = "CopyBufferToImage3d16BytesAligned";
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
break;
case ImageBuiltin::copyBufferToImage3d16BytesStateless:
builtinName = "CopyBufferToImage3d16BytesStateless";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3d16BytesAlignedStateless:
builtinName = "CopyBufferToImage3d16BytesAlignedStateless";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3d16BytesHeapless:
builtinName = "CopyBufferToImage3d16BytesStateless";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyBufferToImage3d16BytesAlignedHeapless:
builtinName = "CopyBufferToImage3d16BytesAlignedStateless";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyBufferToImage3d2Bytes:
builtinName = "CopyBufferToImage3d2Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
@@ -261,14 +273,26 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtinName = "CopyImage3dToBuffer16Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer16BytesAligned:
builtinName = "CopyImage3dToBuffer16BytesAligned";
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer16BytesStateless:
builtinName = "CopyImage3dToBuffer16BytesStateless";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer16BytesAlignedStateless:
builtinName = "CopyImage3dToBuffer16BytesAlignedStateless";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer16BytesHeapless:
builtinName = "CopyImage3dToBuffer16BytesStateless";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer16BytesAlignedHeapless:
builtinName = "CopyImage3dToBuffer16BytesAlignedStateless";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer2Bytes:
builtinName = "CopyImage3dToBuffer2Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;

View File

@@ -878,9 +878,16 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendImageCopyFromMemoryExt(z
builtInType = BuiltinTypeHelper::adjustImageBuiltinType<ImageBuiltin::copyBufferToImage3d8Bytes>(isStateless, isHeaplessEnabled);
}
break;
case 16u:
builtInType = BuiltinTypeHelper::adjustImageBuiltinType<ImageBuiltin::copyBufferToImage3d16Bytes>(isStateless, isHeaplessEnabled);
case 16u: {
bool isSrc16BytesAligned = isAligned<16>(allocationStruct.alignedAllocationPtr, allocationStruct.offset,
srcRowPitch, srcSlicePitch);
if (isSrc16BytesAligned) {
builtInType = BuiltinTypeHelper::adjustImageBuiltinType<ImageBuiltin::copyBufferToImage3d16BytesAligned>(isStateless, isHeaplessEnabled);
} else {
builtInType = BuiltinTypeHelper::adjustImageBuiltinType<ImageBuiltin::copyBufferToImage3d16Bytes>(isStateless, isHeaplessEnabled);
}
break;
}
default:
UNRECOVERABLE_IF(true);
break;
@@ -1089,9 +1096,17 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendImageCopyToMemoryExt(voi
builtInType = BuiltinTypeHelper::adjustImageBuiltinType<ImageBuiltin::copyImage3dToBuffer8Bytes>(isStateless, isHeaplessEnabled);
}
break;
case 16u:
builtInType = BuiltinTypeHelper::adjustImageBuiltinType<ImageBuiltin::copyImage3dToBuffer16Bytes>(isStateless, isHeaplessEnabled);
case 16u: {
bool isDst16BytesAligned = isAligned<16>(allocationStruct.alignedAllocationPtr, allocationStruct.offset,
destRowPitch, destSlicePitch);
if (isDst16BytesAligned) {
builtInType = BuiltinTypeHelper::adjustImageBuiltinType<ImageBuiltin::copyImage3dToBuffer16BytesAligned>(isStateless, isHeaplessEnabled);
} else {
builtInType = BuiltinTypeHelper::adjustImageBuiltinType<ImageBuiltin::copyImage3dToBuffer16Bytes>(isStateless, isHeaplessEnabled);
}
break;
}
default: {
CREATE_DEBUG_STRING(str, "Invalid bytesPerPixel of size: %u\n", bytesPerPixel);
driverHandle->setErrorDescription(std::string(str.get()));

View File

@@ -321,6 +321,34 @@ HWTEST_F(TestBuiltinFunctionsLibImpl, givenStatelessImageBuiltinsWhenInitBuiltin
EXPECT_STREQ("CopyImage3dToBuffer6BytesStateless", lib.kernelNamePassed.c_str());
}
HWTEST_F(TestBuiltinFunctionsLibImpl, givenAlignedImageBuiltinsWhenInitBuiltinKernelThenCorrectArgumentsArePassed) {
MockCheckPassedArgumentsBuiltinFunctionsLibImpl lib(device, device->getNEODevice()->getBuiltIns());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d16BytesAligned);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3d, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d16BytesAligned", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d16BytesAlignedStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dStateless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d16BytesAlignedStateless", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d16BytesAlignedHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d16BytesAlignedStateless", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer16BytesAligned);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBuffer, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer16BytesAligned", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer16BytesAlignedStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferStateless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer16BytesAlignedStateless", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer16BytesAlignedHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer16BytesAlignedStateless", lib.kernelNamePassed.c_str());
}
HWTEST_F(TestBuiltinFunctionsLibImpl, givenCompilerInterfaceWhenCreateDeviceAndImageSupportedThenBuiltinsImageFunctionsAreLoaded) {
ze_result_t returnValue = ZE_RESULT_SUCCESS;
neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()]->compilerInterface.reset(new NEO::MockCompilerInterfaceSpirv());

View File

@@ -1190,6 +1190,516 @@ HWTEST2_F(CommandListTest, givenHeaplessWhenAppendImageCopyToMemoryThenCorrectRo
}
}
HWTEST2_F(CommandListTest, givenAlignedBufferWhenAppendImageCopyFromMemoryWith16BytesPixelThenUseAlignedBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *srcBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 16u, &srcBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
bool isHeapless = device->getNEODevice()->getCompilerProductHelper().isHeaplessModeEnabled(device->getHwInfo());
bool isStateless = device->getNEODevice()->getCompilerProductHelper().isForceToStatelessRequired();
ImageBuiltin expectedBuiltin;
if (isHeapless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesAlignedHeapless;
} else if (isStateless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesAlignedStateless;
} else {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesAligned;
}
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(expectedBuiltin);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
auto bytesPerPixel = static_cast<uint32_t>(imageHw->getImageInfo().surfaceFormat->imageElementSizeInBytes);
EXPECT_EQ(16u, bytesPerPixel);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t dstRegion = {0, 0, 0, 4, 2, 2};
commandList->appendImageCopyFromMemory(imageHw->toHandle(), srcBuffer, &dstRegion, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(srcBuffer);
}
HWTEST2_F(CommandListTest, givenUnalignedBufferWhenAppendImageCopyFromMemoryWith16BytesPixelThenUseRegularBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *srcBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 8u, &srcBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
bool isHeapless = device->getNEODevice()->getCompilerProductHelper().isHeaplessModeEnabled(device->getHwInfo());
bool isStateless = device->getNEODevice()->getCompilerProductHelper().isForceToStatelessRequired();
ImageBuiltin expectedBuiltin;
if (isHeapless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesHeapless;
} else if (isStateless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesStateless;
} else {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16Bytes;
}
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(expectedBuiltin);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
auto bytesPerPixel = static_cast<uint32_t>(imageHw->getImageInfo().surfaceFormat->imageElementSizeInBytes);
EXPECT_EQ(16u, bytesPerPixel);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t dstRegion = {0, 0, 0, 4, 2, 2};
commandList->appendImageCopyFromMemory(imageHw->toHandle(), srcBuffer, &dstRegion, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(srcBuffer);
}
HWTEST2_F(CommandListTest, givenHeaplessAndAlignedBufferWhenAppendImageCopyFromMemoryWithNon16BytesPixelThenUse8BytesHeaplessBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *srcBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 16u, &srcBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::copyBufferToImage3d8BytesHeapless);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
commandList->heaplessModeEnabled = true;
commandList->scratchAddressPatchingEnabled = true;
commandList->statelessBuiltinsEnabled = false;
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32; // Only 8 bytes per pixel
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
auto bytesPerPixel = static_cast<uint32_t>(imageHw->getImageInfo().surfaceFormat->imageElementSizeInBytes);
EXPECT_EQ(8u, bytesPerPixel);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t dstRegion = {0, 0, 0, 4, 2, 2};
commandList->appendImageCopyFromMemory(imageHw->toHandle(), srcBuffer, &dstRegion, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(srcBuffer);
}
HWTEST2_F(CommandListTest, givenAlignedBufferWhenAppendImageCopyToMemoryWith16BytesPixelThenUseAlignedBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *dstBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 16u, &dstBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
bool isHeapless = device->getNEODevice()->getCompilerProductHelper().isHeaplessModeEnabled(device->getHwInfo());
bool isStateless = device->getNEODevice()->getCompilerProductHelper().isForceToStatelessRequired();
ImageBuiltin expectedBuiltin;
if (isHeapless) {
expectedBuiltin = ImageBuiltin::copyImage3dToBuffer16BytesAlignedHeapless;
} else if (isStateless) {
expectedBuiltin = ImageBuiltin::copyImage3dToBuffer16BytesAlignedStateless;
} else {
expectedBuiltin = ImageBuiltin::copyImage3dToBuffer16BytesAligned;
}
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(expectedBuiltin);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
auto bytesPerPixel = static_cast<uint32_t>(imageHw->getImageInfo().surfaceFormat->imageElementSizeInBytes);
EXPECT_EQ(16u, bytesPerPixel);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t srcRegion = {0, 0, 0, 4, 2, 2};
commandList->appendImageCopyToMemory(dstBuffer, imageHw->toHandle(), &srcRegion, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(dstBuffer);
}
HWTEST2_F(CommandListTest, givenUnalignedBufferWhenAppendImageCopyToMemoryWith16BytesPixelThenUseRegularBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *dstBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 8u, &dstBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
bool isHeapless = device->getNEODevice()->getCompilerProductHelper().isHeaplessModeEnabled(device->getHwInfo());
bool isStateless = device->getNEODevice()->getCompilerProductHelper().isForceToStatelessRequired();
ImageBuiltin expectedBuiltin;
if (isHeapless) {
expectedBuiltin = ImageBuiltin::copyImage3dToBuffer16BytesHeapless;
} else if (isStateless) {
expectedBuiltin = ImageBuiltin::copyImage3dToBuffer16BytesStateless;
} else {
expectedBuiltin = ImageBuiltin::copyImage3dToBuffer16Bytes;
}
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(expectedBuiltin);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
auto bytesPerPixel = static_cast<uint32_t>(imageHw->getImageInfo().surfaceFormat->imageElementSizeInBytes);
EXPECT_EQ(16u, bytesPerPixel);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t srcRegion = {0, 0, 0, 4, 2, 2};
commandList->appendImageCopyToMemory(dstBuffer, imageHw->toHandle(), &srcRegion, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(dstBuffer);
}
HWTEST2_F(CommandListTest, givenHeaplessAndAlignedBufferWhenAppendImageCopyToMemoryWithNon16BytesPixelThenUse8BytesHeaplessBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *dstBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 16u, &dstBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::copyImage3dToBuffer8BytesHeapless);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
commandList->heaplessModeEnabled = true;
commandList->scratchAddressPatchingEnabled = true;
commandList->statelessBuiltinsEnabled = false;
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32;
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
auto bytesPerPixel = static_cast<uint32_t>(imageHw->getImageInfo().surfaceFormat->imageElementSizeInBytes);
EXPECT_EQ(8u, bytesPerPixel);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t srcRegion = {0, 0, 0, 4, 2, 2};
commandList->appendImageCopyToMemory(dstBuffer, imageHw->toHandle(), &srcRegion, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(dstBuffer);
}
HWTEST2_F(CommandListTest, givenAddressMisalignedWhenAppendImageCopyFromMemoryWith16BytesPixelThenUseRegularBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *srcBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 8u, &srcBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
bool isHeapless = device->getNEODevice()->getCompilerProductHelper().isHeaplessModeEnabled(device->getHwInfo());
bool isStateless = device->getNEODevice()->getCompilerProductHelper().isForceToStatelessRequired();
ImageBuiltin expectedBuiltin;
if (isHeapless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesHeapless;
} else if (isStateless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesStateless;
} else {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16Bytes;
}
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(expectedBuiltin);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t dstRegion = {0, 0, 0, 4, 2, 2};
commandList->appendImageCopyFromMemory(imageHw->toHandle(), srcBuffer, &dstRegion, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(srcBuffer);
}
HWTEST2_F(CommandListTest, givenOffsetMisalignedWhenAppendImageCopyFromMemoryWith16BytesPixelThenUseRegularBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *srcBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 16u, &srcBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
bool isHeapless = device->getNEODevice()->getCompilerProductHelper().isHeaplessModeEnabled(device->getHwInfo());
bool isStateless = device->getNEODevice()->getCompilerProductHelper().isForceToStatelessRequired();
ImageBuiltin expectedBuiltin;
if (isHeapless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesHeapless;
} else if (isStateless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesStateless;
} else {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16Bytes;
}
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(expectedBuiltin);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t dstRegion = {8, 0, 0, 4, 2, 2};
commandList->appendImageCopyFromMemoryExt(imageHw->toHandle(), srcBuffer, &dstRegion, 0, 0, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(srcBuffer);
}
HWTEST2_F(CommandListTest, givenRowPitchMisalignedWhenAppendImageCopyToMemoryWith16BytesPixelThenUseRegularBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *dstBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 16u, &dstBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
bool isHeapless = device->getNEODevice()->getCompilerProductHelper().isHeaplessModeEnabled(device->getHwInfo());
bool isStateless = device->getNEODevice()->getCompilerProductHelper().isForceToStatelessRequired();
ImageBuiltin expectedBuiltin;
if (isHeapless) {
expectedBuiltin = ImageBuiltin::copyImage3dToBuffer16BytesHeapless;
} else if (isStateless) {
expectedBuiltin = ImageBuiltin::copyImage3dToBuffer16BytesStateless;
} else {
expectedBuiltin = ImageBuiltin::copyImage3dToBuffer16Bytes;
}
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(expectedBuiltin);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t srcRegion = {0, 0, 0, 4, 2, 2};
uint32_t misalignedRowPitch = 68;
uint32_t misalignedSlicePitch = misalignedRowPitch * 2;
commandList->appendImageCopyToMemoryExt(dstBuffer, imageHw->toHandle(), &srcRegion, misalignedRowPitch, misalignedSlicePitch, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(dstBuffer);
}
HWTEST2_F(CommandListTest, givenSlicePitchMisalignedWhenAppendImageCopyFromMemoryWith16BytesPixelThenUseRegularBuiltin, HeaplessSupport) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
constexpr size_t allocSize = 4096;
void *srcBuffer = nullptr;
ze_host_mem_alloc_desc_t hostDesc = {};
auto result = context->allocHostMem(&hostDesc, allocSize, 16u, &srcBuffer);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
bool isHeapless = device->getNEODevice()->getCompilerProductHelper().isHeaplessModeEnabled(device->getHwInfo());
bool isStateless = device->getNEODevice()->getCompilerProductHelper().isForceToStatelessRequired();
ImageBuiltin expectedBuiltin;
if (isHeapless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesHeapless;
} else if (isStateless) {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16BytesStateless;
} else {
expectedBuiltin = ImageBuiltin::copyBufferToImage3d16Bytes;
}
auto expectedKernel = device->getBuiltinFunctionsLib()->getImageFunction(expectedBuiltin);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(expectedKernel);
mockBuiltinKernel->setArgRedescribedImageCallBase = false;
mockBuiltinKernel->setArgRedescribedImageCalled = 0u;
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<FamilyType::gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ze_image_desc_t zeDesc = {};
zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC;
zeDesc.type = ZE_IMAGE_TYPE_3D;
zeDesc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
zeDesc.format.type = ZE_IMAGE_FORMAT_TYPE_FLOAT;
zeDesc.width = 4;
zeDesc.height = 2;
zeDesc.depth = 2;
auto imageHw = std::make_unique<WhiteBox<::L0::ImageCoreFamily<FamilyType::gfxCoreFamily>>>();
imageHw->initialize(device, &zeDesc);
CmdListMemoryCopyParams copyParams = {};
ze_image_region_t dstRegion = {0, 0, 0, 4, 2, 2};
uint32_t alignedRowPitch = 64;
uint32_t misalignedSlicePitch = 132;
commandList->appendImageCopyFromMemoryExt(imageHw->toHandle(), srcBuffer, &dstRegion, alignedRowPitch, misalignedSlicePitch, nullptr, 0, nullptr, copyParams);
EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
EXPECT_EQ(1u, mockBuiltinKernel->setArgRedescribedImageCalled);
context->freeMem(srcBuffer);
}
HWTEST_F(CommandListTest, givenStatelessWhenAppendImageCopyFromMemoryThenCorrectRowAndSlicePitchArePassed) {
auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::copyImage3dToBufferBytes);
auto mockBuiltinKernel = static_cast<Mock<::L0::KernelImp> *>(kernel);

View File

@@ -41,7 +41,7 @@ components:
dest_dir: kernels_bin
type: git
branch: kernels_bin
revision: 3764-5777
revision: 3764-5782
level_zero:
asset_name: level_zero
dest_dir: level_zero

View File

@@ -26,6 +26,20 @@
#include <cstdint>
namespace NEO {
// Helper function to get buffer address for alignment checking
static inline uintptr_t getBufferAddress(const void *ptr, MemObj *memObj, uint32_t rootDeviceIndex) {
if (ptr) {
return reinterpret_cast<uintptr_t>(ptr);
} else if (memObj) {
auto buffer = castToObject<Buffer>(memObj);
if (buffer) {
return static_cast<uintptr_t>(buffer->getBufferAddress(rootDeviceIndex));
}
}
return 0;
}
template <>
class BuiltInOp<EBuiltInOps::copyBufferToBuffer> : public BuiltinDispatchInfoBuilder {
public:
@@ -589,6 +603,7 @@ class BuiltInOp<EBuiltInOps::copyBufferToImage3d> : public BuiltinDispatchInfoBu
protected:
MultiDeviceKernel *kernelBytes[5] = {nullptr};
MultiDeviceKernel *kernel16BytesAligned = nullptr;
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels)
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
if (populateKernels) {
@@ -598,7 +613,8 @@ class BuiltInOp<EBuiltInOps::copyBufferToImage3d> : public BuiltinDispatchInfoBu
"CopyBufferToImage3d2Bytes", kernelBytes[1],
"CopyBufferToImage3d4Bytes", kernelBytes[2],
"CopyBufferToImage3d8Bytes", kernelBytes[3],
"CopyBufferToImage3d16Bytes", kernelBytes[4]);
"CopyBufferToImage3d16Bytes", kernelBytes[4],
"CopyBufferToImage3d16BytesAligned", kernel16BytesAligned);
}
}
@@ -632,7 +648,18 @@ class BuiltInOp<EBuiltInOps::copyBufferToImage3d> : public BuiltinDispatchInfoBu
// Set-up kernel
auto bytesExponent = Math::log2(bytesPerPixel);
DEBUG_BREAK_IF(bytesExponent >= 5);
kernelNoSplit3DBuilder.setKernel(kernelBytes[bytesExponent]->getKernel(clDevice.getRootDeviceIndex()));
MultiDeviceKernel *selectedKernel = nullptr;
// Determine address, offset, rowPitch and slicePitch are 16-byte aligned
uintptr_t srcAddress = getBufferAddress(operationParams.srcPtr, operationParams.srcMemObj, clDevice.getRootDeviceIndex());
if (bytesExponent == 4 && srcAddress &&
isAligned<16>(srcAddress, operationParams.srcOffset.x, srcRowPitch, srcSlicePitch)) {
selectedKernel = kernel16BytesAligned;
} else {
selectedKernel = kernelBytes[bytesExponent];
}
kernelNoSplit3DBuilder.setKernel(selectedKernel->getKernel(clDevice.getRootDeviceIndex()));
// Set-up source host ptr / buffer
if (operationParams.srcPtr) {
@@ -684,7 +711,8 @@ class BuiltInOp<EBuiltInOps::copyBufferToImage3dStateless> : public BuiltInOp<EB
"CopyBufferToImage3d2BytesStateless", kernelBytes[1],
"CopyBufferToImage3d4BytesStateless", kernelBytes[2],
"CopyBufferToImage3d8BytesStateless", kernelBytes[3],
"CopyBufferToImage3d16BytesStateless", kernelBytes[4]);
"CopyBufferToImage3d16BytesStateless", kernelBytes[4],
"CopyBufferToImage3d16BytesAlignedStateless", kernel16BytesAligned);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
@@ -703,7 +731,8 @@ class BuiltInOp<EBuiltInOps::copyBufferToImage3dHeapless> : public BuiltInOp<EBu
"CopyBufferToImage3d2BytesStateless", kernelBytes[1],
"CopyBufferToImage3d4BytesStateless", kernelBytes[2],
"CopyBufferToImage3d8BytesStateless", kernelBytes[3],
"CopyBufferToImage3d16BytesStateless", kernelBytes[4]);
"CopyBufferToImage3d16BytesStateless", kernelBytes[4],
"CopyBufferToImage3d16BytesAlignedStateless", kernel16BytesAligned);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
@@ -723,6 +752,7 @@ class BuiltInOp<EBuiltInOps::copyImage3dToBuffer> : public BuiltinDispatchInfoBu
protected:
MultiDeviceKernel *kernelBytes[5] = {nullptr};
MultiDeviceKernel *kernel16BytesAligned = nullptr;
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels)
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
@@ -733,7 +763,8 @@ class BuiltInOp<EBuiltInOps::copyImage3dToBuffer> : public BuiltinDispatchInfoBu
"CopyImage3dToBuffer2Bytes", kernelBytes[1],
"CopyImage3dToBuffer4Bytes", kernelBytes[2],
"CopyImage3dToBuffer8Bytes", kernelBytes[3],
"CopyImage3dToBuffer16Bytes", kernelBytes[4]);
"CopyImage3dToBuffer16Bytes", kernelBytes[4],
"CopyImage3dToBuffer16BytesAligned", kernel16BytesAligned);
}
}
@@ -769,7 +800,18 @@ class BuiltInOp<EBuiltInOps::copyImage3dToBuffer> : public BuiltinDispatchInfoBu
// Set-up ISA
auto bytesExponent = Math::log2(bytesPerPixel);
DEBUG_BREAK_IF(bytesExponent >= 5);
kernelNoSplit3DBuilder.setKernel(kernelBytes[bytesExponent]->getKernel(rootDeviceIndex));
MultiDeviceKernel *selectedKernel = nullptr;
// Determine address, offset, rowPitch and slicePitch are 16-byte aligned
uintptr_t dstAddress = getBufferAddress(operationParams.dstPtr, operationParams.dstMemObj, rootDeviceIndex);
if (bytesExponent == 4 && dstAddress &&
isAligned<16>(dstAddress, operationParams.dstOffset.x, dstRowPitch, dstSlicePitch)) {
selectedKernel = kernel16BytesAligned;
} else {
selectedKernel = kernelBytes[bytesExponent];
}
kernelNoSplit3DBuilder.setKernel(selectedKernel->getKernel(rootDeviceIndex));
// Set-up source image
kernelNoSplit3DBuilder.setArg(0, srcImageRedescribed, operationParams.srcMipLevel);
@@ -825,7 +867,8 @@ class BuiltInOp<EBuiltInOps::copyImage3dToBufferStateless> : public BuiltInOp<EB
"CopyImage3dToBuffer2BytesStateless", kernelBytes[1],
"CopyImage3dToBuffer4BytesStateless", kernelBytes[2],
"CopyImage3dToBuffer8BytesStateless", kernelBytes[3],
"CopyImage3dToBuffer16BytesStateless", kernelBytes[4]);
"CopyImage3dToBuffer16BytesStateless", kernelBytes[4],
"CopyImage3dToBuffer16BytesAlignedStateless", kernel16BytesAligned);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
@@ -844,7 +887,8 @@ class BuiltInOp<EBuiltInOps::copyImage3dToBufferHeapless> : public BuiltInOp<EBu
"CopyImage3dToBuffer2BytesStateless", kernelBytes[1],
"CopyImage3dToBuffer4BytesStateless", kernelBytes[2],
"CopyImage3dToBuffer8BytesStateless", kernelBytes[3],
"CopyImage3dToBuffer16BytesStateless", kernelBytes[4]);
"CopyImage3dToBuffer16BytesStateless", kernelBytes[4],
"CopyImage3dToBuffer16BytesAlignedStateless", kernel16BytesAligned);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {

View File

@@ -1085,7 +1085,6 @@ TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderFillLocalBufferStatelessIsU
}
HWTEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToImageStatelessIsUsedThenParamsAreCorrect) {
REQUIRE_64BIT_OR_SKIP();
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
uint64_t bigSize = 10ull * MemoryConstants::gigaByte;
@@ -2139,3 +2138,802 @@ HWTEST2_F(BuiltInTests, whenBuilderFillLocalBufferStatelessHeaplessIsUsedThenPar
EXPECT_FALSE(dispatchInfo.getKernel()->getDestinationAllocationInSystemMemory());
}
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesAlignedSrcPtrWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *srcPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(srcPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesAlignedSrcMemObjWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
MockBuffer srcBuffer;
srcBuffer.size = 1024;
srcBuffer.mockGfxAllocation.gpuAddress = 0x1000;
ASSERT_EQ(0u, srcBuffer.mockGfxAllocation.gpuAddress & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = &srcBuffer;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithAddressMisalignedSrcPtrWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
uint8_t buffer[bufferSize + 16];
void *srcPtr = buffer + 1;
ASSERT_NE(0u, reinterpret_cast<uintptr_t>(srcPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesAlignedDstPtrWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *dstPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(dstPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstPtr = dstPtr;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesAlignedDstMemObjWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
MockBuffer dstBuffer;
dstBuffer.size = 1024;
dstBuffer.mockGfxAllocation.gpuAddress = 0x2000;
ASSERT_EQ(0u, dstBuffer.mockGfxAllocation.gpuAddress & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstMemObj = &dstBuffer;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesWithAddressMisalignedDstPtrWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
uint8_t buffer[bufferSize + 16];
void *dstPtr = buffer + 1;
ASSERT_NE(0u, reinterpret_cast<uintptr_t>(dstPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstPtr = dstPtr;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBufferNon16BytesFormatWhenBuilderIsUsedThenAlignmentCheckIsSkipped) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_R, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *dstPtr = alignedBuffer;
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstPtr = dstPtr;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("4Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") == std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImageNon16BytesFormatWhenBuilderIsUsedThenAlignmentCheckIsSkipped) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_R, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *srcPtr = alignedBuffer;
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("4Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") == std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithImageAsSrcMemObjWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto dstImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, dstImage);
auto srcImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, srcImage);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcImage;
dc.dstMemObj = dstImage;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete dstImage;
delete srcImage;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithAlignedBufferWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
cl_int retVal = CL_SUCCESS;
auto srcBuffer = Buffer::create(pContext, 0, bufferSize, nullptr, retVal);
ASSERT_NE(nullptr, srcBuffer);
ASSERT_EQ(CL_SUCCESS, retVal);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcBuffer;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
delete srcBuffer;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithOffsetMisalignedBufferWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
cl_int retVal = CL_SUCCESS;
auto srcBuffer = Buffer::create(pContext, 0, bufferSize, nullptr, retVal);
ASSERT_NE(nullptr, srcBuffer);
ASSERT_EQ(CL_SUCCESS, retVal);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcBuffer;
dc.dstMemObj = image;
dc.srcOffset = {1, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
delete srcBuffer;
}
HWTEST_F(BuiltInTests, givenCopyImageToBuffer16BytesWithImageAsDstMemObjWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto srcImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, srcImage);
auto dstImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, dstImage);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcImage;
dc.dstMemObj = dstImage;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete srcImage;
delete dstImage;
}
HWTEST_F(BuiltInTests, givenCopyImageToBuffer16BytesWithAlignedBufferWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto srcImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, srcImage);
constexpr size_t bufferSize = 1024;
cl_int retVal = CL_SUCCESS;
auto dstBuffer = Buffer::create(pContext, 0, bufferSize, nullptr, retVal);
ASSERT_NE(nullptr, dstBuffer);
ASSERT_EQ(CL_SUCCESS, retVal);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcImage;
dc.dstMemObj = dstBuffer;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete srcImage;
delete dstBuffer;
}
HWTEST_F(BuiltInTests, givenCopyImageToBuffer16BytesWithOffsetMisalignedBufferWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto srcImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, srcImage);
constexpr size_t bufferSize = 1024;
cl_int retVal = CL_SUCCESS;
auto dstBuffer = Buffer::create(pContext, 0, bufferSize, nullptr, retVal);
ASSERT_NE(nullptr, dstBuffer);
ASSERT_EQ(CL_SUCCESS, retVal);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcImage;
dc.dstMemObj = dstBuffer;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {1, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete srcImage;
delete dstBuffer;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithRowPitchMisalignedWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *srcPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(srcPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 68; // 16-byte misaligned row pitch
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesWithSlicePitchMisalignedWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *dstPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(dstPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstPtr = dstPtr;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 132; // 16-byte misaligned slice pitch
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithOffsetMisalignedWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *srcPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(srcPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {8, 0, 0}; // 16-byte misaligned offset
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesWithNullAddressWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
MockBuffer dstBuffer;
dstBuffer.size = 1024;
dstBuffer.mockGfxAllocation.gpuAddress = 0;
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstMemObj = &dstBuffer;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithNullPtrAndNullMemObjWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = nullptr;
dc.srcMemObj = nullptr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}

View File

@@ -442,9 +442,9 @@ HWTEST2_TEMPLATED_F(ClDrmMemoryManagerTest, givenDrmMemoryManagerWhenTiledImageI
mock->ioctlExpected.execbuffer2 = 1;
// builtins kernels
mock->ioctlExpected.gemUserptr += 7;
mock->ioctlExpected.gemClose += 7; // builtins cleaned up in ClDevice destructor
mock->ioctlExpected.gemWait += 7;
mock->ioctlExpected.gemUserptr += 8;
mock->ioctlExpected.gemClose += 8; // builtins cleaned up in ClDevice destructor
mock->ioctlExpected.gemWait += 8;
// command buffers
mock->ioctlExpected.gemUserptr += 2;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -182,35 +182,47 @@ __kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
uint4 c = (uint4)(0, 0, 0, 0);
if(( ulong )(src + srcOffset) & 0x0000000f){
uint upper2 = *((__global uchar*)(src + LOffset + x * 16 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 16 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 16 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 16));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 7));
upper = *((__global uchar*)(src + LOffset + x * 16 + 6));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 5));
lower = *((__global uchar*)(src + LOffset + x * 16 + 4));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.y = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 11));
upper = *((__global uchar*)(src + LOffset + x * 16 + 10));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 9));
lower = *((__global uchar*)(src + LOffset + x * 16 + 8));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.z = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 15));
upper = *((__global uchar*)(src + LOffset + x * 16 + 14));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 13));
lower = *((__global uchar*)(src + LOffset + x * 16 + 12));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.w = combined;
}
else{
c = *((__global uint4 *)(src + LOffset + x * 16));
}
uint upper2 = *((__global uchar*)(src + LOffset + x * 16 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 16 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 16 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 16));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 7));
upper = *((__global uchar*)(src + LOffset + x * 16 + 6));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 5));
lower = *((__global uchar*)(src + LOffset + x * 16 + 4));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.y = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 11));
upper = *((__global uchar*)(src + LOffset + x * 16 + 10));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 9));
lower = *((__global uchar*)(src + LOffset + x * 16 + 8));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.z = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 15));
upper = *((__global uchar*)(src + LOffset + x * 16 + 14));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 13));
lower = *((__global uchar*)(src + LOffset + x * 16 + 12));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.w = combined;
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d16BytesAligned(__global uint4 *src,
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = src[(LOffset >> 4) + x];
write_imageui(output, dstCoord, c);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -181,35 +181,47 @@ __kernel void CopyBufferToImage3d16BytesStateless(__global uchar *src,
uint4 c = (uint4)(0, 0, 0, 0);
if(( ulong )(src + srcOffset) & 0x0000000f){
uint upper2 = *((__global uchar*)(src + LOffset + x * 16 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 16 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 16 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 16));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 7));
upper = *((__global uchar*)(src + LOffset + x * 16 + 6));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 5));
lower = *((__global uchar*)(src + LOffset + x * 16 + 4));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.y = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 11));
upper = *((__global uchar*)(src + LOffset + x * 16 + 10));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 9));
lower = *((__global uchar*)(src + LOffset + x * 16 + 8));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.z = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 15));
upper = *((__global uchar*)(src + LOffset + x * 16 + 14));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 13));
lower = *((__global uchar*)(src + LOffset + x * 16 + 12));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.w = combined;
}
else{
c = *((__global uint4 *)(src + LOffset + x * 16));
}
uint upper2 = *((__global uchar*)(src + LOffset + x * 16 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 16 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 16 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 16));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 7));
upper = *((__global uchar*)(src + LOffset + x * 16 + 6));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 5));
lower = *((__global uchar*)(src + LOffset + x * 16 + 4));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.y = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 11));
upper = *((__global uchar*)(src + LOffset + x * 16 + 10));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 9));
lower = *((__global uchar*)(src + LOffset + x * 16 + 8));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.z = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 15));
upper = *((__global uchar*)(src + LOffset + x * 16 + 14));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 13));
lower = *((__global uchar*)(src + LOffset + x * 16 + 12));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.w = combined;
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d16BytesAlignedStateless(__global uint4 *src,
__write_only image3d_t output,
ulong srcOffset,
int4 dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = src[(LOffset >> 4) + x];
write_imageui(output, dstCoord, c);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -199,26 +199,38 @@ __kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
const uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x0000000f){
*((__global uchar*)(dst + DstOffset + x * 16 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 4)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 11)) = convert_uchar_sat((c.z >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 10)) = convert_uchar_sat((c.z >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 9)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 8)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 15)) = convert_uchar_sat((c.w >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 14)) = convert_uchar_sat((c.w >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 13)) = convert_uchar_sat((c.w >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 12)) = convert_uchar_sat(c.w & 0xff);
}
else{
*(__global uint4*)(dst + DstOffset + x * 16) = c;
}
*((__global uchar*)(dst + DstOffset + x * 16 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 4)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 11)) = convert_uchar_sat((c.z >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 10)) = convert_uchar_sat((c.z >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 9)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 8)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 15)) = convert_uchar_sat((c.w >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 14)) = convert_uchar_sat((c.w >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 13)) = convert_uchar_sat((c.w >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 12)) = convert_uchar_sat(c.w & 0xff);
}
__kernel void CopyImage3dToBuffer16BytesAligned(__read_only image3d_t input,
__global uint4 *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
const uint4 c = read_imageui(input, srcCoord);
dst[(DstOffset >> 4) + x] = c;
}
)==="

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -201,26 +201,38 @@ __kernel void CopyImage3dToBuffer16BytesStateless(__read_only image3d_t input,
const uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x0000000f){
*((__global uchar*)(dst + DstOffset + x * 16 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 4)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 11)) = convert_uchar_sat((c.z >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 10)) = convert_uchar_sat((c.z >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 9)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 8)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 15)) = convert_uchar_sat((c.w >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 14)) = convert_uchar_sat((c.w >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 13)) = convert_uchar_sat((c.w >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 12)) = convert_uchar_sat(c.w & 0xff);
}
else{
*(__global uint4*)(dst + DstOffset + x * 16) = c;
}
*((__global uchar*)(dst + DstOffset + x * 16 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 4)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 11)) = convert_uchar_sat((c.z >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 10)) = convert_uchar_sat((c.z >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 9)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 8)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 15)) = convert_uchar_sat((c.w >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 14)) = convert_uchar_sat((c.w >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 13)) = convert_uchar_sat((c.w >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 12)) = convert_uchar_sat(c.w & 0xff);
}
__kernel void CopyImage3dToBuffer16BytesAlignedStateless(__read_only image3d_t input,
__global uint4 *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = (dstOffset + (y * Pitch.x) + (z * Pitch.y));
const uint4 c = read_imageui(input, srcCoord);
dst[(DstOffset >> 4) + x] = c;
}
)==="

View File

@@ -119,6 +119,13 @@ inline bool isAligned(T *ptr) {
// alignment requirement (returned by alignof) is always a power of 2
return (reinterpret_cast<uintptr_t>(ptr) & (alignof(T) - 1)) == 0;
}
// Variadic template to check if all values are aligned
template <size_t alignment, typename... Ts>
inline constexpr bool isAligned(Ts... vals) {
return (isAligned<alignment>(vals) && ...);
}
inline auto allocateAlignedMemory(size_t bytes, size_t alignment) {
return std::unique_ptr<void, std::function<decltype(alignedFree)>>(alignedMalloc(bytes, alignment), alignedFree);
}

View File

@@ -8,4 +8,4 @@
#include "shared/test/common/helpers/kernel_binary_helper.h"
const std::string KernelBinaryHelper::BUILT_INS("6133084427540774618");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("15342443153856668610_images");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("10619261412647190096_images");

View File

@@ -709,35 +709,47 @@ __kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
uint4 c = (uint4)(0, 0, 0, 0);
if(( ulong )(src + srcOffset) & 0x0000000f){
uint upper2 = *((__global uchar*)(src + LOffset + x * 16 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 16 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 16 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 16));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 7));
upper = *((__global uchar*)(src + LOffset + x * 16 + 6));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 5));
lower = *((__global uchar*)(src + LOffset + x * 16 + 4));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.y = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 11));
upper = *((__global uchar*)(src + LOffset + x * 16 + 10));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 9));
lower = *((__global uchar*)(src + LOffset + x * 16 + 8));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.z = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 15));
upper = *((__global uchar*)(src + LOffset + x * 16 + 14));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 13));
lower = *((__global uchar*)(src + LOffset + x * 16 + 12));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.w = combined;
}
else{
c = *((__global uint4 *)(src + LOffset + x * 16));
}
uint upper2 = *((__global uchar*)(src + LOffset + x * 16 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 16 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 16 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 16));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 7));
upper = *((__global uchar*)(src + LOffset + x * 16 + 6));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 5));
lower = *((__global uchar*)(src + LOffset + x * 16 + 4));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.y = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 11));
upper = *((__global uchar*)(src + LOffset + x * 16 + 10));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 9));
lower = *((__global uchar*)(src + LOffset + x * 16 + 8));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.z = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 15));
upper = *((__global uchar*)(src + LOffset + x * 16 + 14));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 13));
lower = *((__global uchar*)(src + LOffset + x * 16 + 12));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.w = combined;
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d16BytesAligned(__global uint4 *src,
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = src[(LOffset >> 4) + x];
write_imageui(output, dstCoord, c);
}
@@ -935,27 +947,39 @@ __kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
const uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x0000000f){
*((__global uchar*)(dst + DstOffset + x * 16 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 4)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 11)) = convert_uchar_sat((c.z >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 10)) = convert_uchar_sat((c.z >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 9)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 8)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 15)) = convert_uchar_sat((c.w >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 14)) = convert_uchar_sat((c.w >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 13)) = convert_uchar_sat((c.w >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 12)) = convert_uchar_sat(c.w & 0xff);
}
else{
*(__global uint4*)(dst + DstOffset + x * 16) = c;
}
*((__global uchar*)(dst + DstOffset + x * 16 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 4)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 11)) = convert_uchar_sat((c.z >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 10)) = convert_uchar_sat((c.z >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 9)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 8)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 15)) = convert_uchar_sat((c.w >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 14)) = convert_uchar_sat((c.w >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 13)) = convert_uchar_sat((c.w >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 12)) = convert_uchar_sat(c.w & 0xff);
}
__kernel void CopyImage3dToBuffer16BytesAligned(__read_only image3d_t input,
__global uint4 *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
const uint4 c = read_imageui(input, srcCoord);
dst[(DstOffset >> 4) + x] = c;
}
__kernel void FillImage1dBuffer(