From badbecb12637ede642d79242281a99a77d3e48a1 Mon Sep 17 00:00:00 2001 From: "Kulkarni, Ashwin Kumar" Date: Tue, 31 Dec 2024 18:37:27 +0000 Subject: [PATCH] feature: adds support for 3 channel (2/N) Allows user to create L0 images with 3 channel Related-To: NEO-12887 Signed-off-by: Kulkarni, Ashwin Kumar --- .../source/builtin/builtin_functions_lib.h | 12 + .../builtin/builtin_functions_lib_impl.cpp | 32 + level_zero/core/source/cmdlist/cmdlist_hw.inl | 57 +- level_zero/core/source/image/image.h | 1 + level_zero/core/source/image/image_imp.cpp | 62 +- level_zero/core/source/image/image_imp.h | 9 + .../unit_tests/sources/image/test_image.cpp | 131 +++ manifests/manifest.yml | 2 +- .../linux/cl_drm_memory_manager_tests.cpp | 2 +- .../copy_buffer_to_image3d.builtin_kernel | 58 +- ...buffer_to_image3d_stateless.builtin_kernel | 55 ++ .../copy_image3d_to_buffer.builtin_kernel | 42 + ...image3d_to_buffer_stateless.builtin_kernel | 44 + .../kernel_binary_helper_hash_value.cpp | 2 +- .../test_files/11207026507167120706_images.cl | 790 --------------- .../test_files/16235226214855072632_images.cl | 914 ++++++++++++++++++ ...> 16235226214855072632_images_options.txt} | 0 17 files changed, 1409 insertions(+), 804 deletions(-) delete mode 100644 shared/test/common/test_files/11207026507167120706_images.cl create mode 100644 shared/test/common/test_files/16235226214855072632_images.cl rename shared/test/common/test_files/{11207026507167120706_images_options.txt => 16235226214855072632_images_options.txt} (100%) diff --git a/level_zero/core/source/builtin/builtin_functions_lib.h b/level_zero/core/source/builtin/builtin_functions_lib.h index a0bd0f67bf..dccf787df1 100644 --- a/level_zero/core/source/builtin/builtin_functions_lib.h +++ b/level_zero/core/source/builtin/builtin_functions_lib.h @@ -57,8 +57,12 @@ enum class ImageBuiltin : uint32_t { copyBufferToImage3d2BytesHeapless, copyBufferToImage3d4Bytes, copyBufferToImage3d4BytesHeapless, + copyBufferToImage3d3To4Bytes, + copyBufferToImage3d3To4BytesHeapless, copyBufferToImage3d8Bytes, copyBufferToImage3d8BytesHeapless, + copyBufferToImage3d6To8Bytes, + copyBufferToImage3d6To8BytesHeapless, copyBufferToImage3dBytes, copyBufferToImage3dBytesHeapless, copyImage3dToBuffer16Bytes, @@ -69,10 +73,14 @@ enum class ImageBuiltin : uint32_t { copyImage3dToBuffer3BytesHeapless, copyImage3dToBuffer4Bytes, copyImage3dToBuffer4BytesHeapless, + copyImage3dToBuffer4To3Bytes, + copyImage3dToBuffer4To3BytesHeapless, copyImage3dToBuffer6Bytes, copyImage3dToBuffer6BytesHeapless, copyImage3dToBuffer8Bytes, copyImage3dToBuffer8BytesHeapless, + copyImage3dToBuffer8To6Bytes, + copyImage3dToBuffer8To6BytesHeapless, copyImage3dToBufferBytes, copyImage3dToBufferBytesHeapless, copyImageRegion, @@ -200,14 +208,18 @@ constexpr ImageBuiltin adjustImageBuiltinType(const bool isHeapless) { DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d16Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d2Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d4Bytes); +DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d3To4Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d8Bytes); +DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3d6To8Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyBufferToImage3dBytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer16Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer2Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer3Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer4Bytes); +DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer4To3Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer6Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer8Bytes); +DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBuffer8To6Bytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImage3dToBufferBytes); DEFINE_ADJUST_IMAGE_BUILTIN_TYPE(ImageBuiltin::copyImageRegion); diff --git a/level_zero/core/source/builtin/builtin_functions_lib_impl.cpp b/level_zero/core/source/builtin/builtin_functions_lib_impl.cpp index 29c9a91246..7f814939e0 100644 --- a/level_zero/core/source/builtin/builtin_functions_lib_impl.cpp +++ b/level_zero/core/source/builtin/builtin_functions_lib_impl.cpp @@ -180,6 +180,14 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) { builtinName = "CopyBufferToImage3d4BytesStateless"; builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless; break; + case ImageBuiltin::copyBufferToImage3d3To4Bytes: + builtinName = "CopyBufferToImage3d3To4Bytes"; + builtin = NEO::EBuiltInOps::copyBufferToImage3d; + break; + case ImageBuiltin::copyBufferToImage3d3To4BytesHeapless: + builtinName = "CopyBufferToImage3d3To4BytesStateless"; + builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless; + break; case ImageBuiltin::copyBufferToImage3d8Bytes: builtinName = "CopyBufferToImage3d8Bytes"; builtin = NEO::EBuiltInOps::copyBufferToImage3d; @@ -188,6 +196,14 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) { builtinName = "CopyBufferToImage3d8BytesStateless"; builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless; break; + case ImageBuiltin::copyBufferToImage3d6To8Bytes: + builtinName = "CopyBufferToImage3d6To8Bytes"; + builtin = NEO::EBuiltInOps::copyBufferToImage3d; + break; + case ImageBuiltin::copyBufferToImage3d6To8BytesHeapless: + builtinName = "CopyBufferToImage3d6To8BytesStateless"; + builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless; + break; case ImageBuiltin::copyBufferToImage3dBytes: builtinName = "CopyBufferToImage3dBytes"; builtin = NEO::EBuiltInOps::copyBufferToImage3d; @@ -228,6 +244,14 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) { builtinName = "CopyImage3dToBuffer4BytesStateless"; builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless; break; + case ImageBuiltin::copyImage3dToBuffer4To3Bytes: + builtinName = "CopyImage3dToBuffer4To3Bytes"; + builtin = NEO::EBuiltInOps::copyImage3dToBuffer; + break; + case ImageBuiltin::copyImage3dToBuffer4To3BytesHeapless: + builtinName = "CopyImage3dToBuffer4To3BytesStateless"; + builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless; + break; case ImageBuiltin::copyImage3dToBuffer6Bytes: builtinName = "CopyImage3dToBuffer6Bytes"; builtin = NEO::EBuiltInOps::copyImage3dToBuffer; @@ -244,6 +268,14 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) { builtinName = "CopyImage3dToBuffer8BytesStateless"; builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless; break; + case ImageBuiltin::copyImage3dToBuffer8To6Bytes: + builtinName = "CopyImage3dToBuffer8To6Bytes"; + builtin = NEO::EBuiltInOps::copyImage3dToBuffer; + break; + case ImageBuiltin::copyImage3dToBuffer8To6BytesHeapless: + builtinName = "CopyImage3dToBuffer8To6BytesStateless"; + builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless; + break; case ImageBuiltin::copyImage3dToBufferBytes: builtinName = "CopyImage3dToBufferBytes"; builtin = NEO::EBuiltInOps::copyImage3dToBuffer; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index e57faba43f..98ca5b2158 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -695,7 +695,6 @@ ze_result_t CommandListCoreFamily::appendImageCopyFromMemoryExt(z auto image = Image::fromHandle(hDstImage); auto bytesPerPixel = static_cast(image->getImageInfo().surfaceFormat->imageElementSizeInBytes); - Vec3 imgSize = {image->getImageDesc().width, image->getImageDesc().height, image->getImageDesc().depth}; @@ -718,7 +717,18 @@ ze_result_t CommandListCoreFamily::appendImageCopyFromMemoryExt(z } if (srcRowPitch == 0) { - srcRowPitch = pDstRegion->width * bytesPerPixel; + if (image->isMimickedImage()) { + uint32_t srcBytesPerPixel = bytesPerPixel; + if (bytesPerPixel == 8) { + srcBytesPerPixel = 6; + } + if (bytesPerPixel == 4) { + srcBytesPerPixel = 3; + } + srcRowPitch = pDstRegion->width * srcBytesPerPixel; + } else { + srcRowPitch = pDstRegion->width * bytesPerPixel; + } } if (srcSlicePitch == 0) { srcSlicePitch = (image->getImageInfo().imgDesc.imageType == NEO::ImageType::image1DArray ? 1 : pDstRegion->height) * srcRowPitch; @@ -743,6 +753,9 @@ ze_result_t CommandListCoreFamily::appendImageCopyFromMemoryExt(z } if (isCopyOnly(false)) { + if ((bytesPerPixel == 3) || (bytesPerPixel == 6) || image->isMimickedImage()) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } size_t imgRowPitch = image->getImageInfo().rowPitch; size_t imgSlicePitch = image->getImageInfo().slicePitch; auto status = appendCopyImageBlit(allocationStruct.alloc, image->getAllocation(), @@ -763,10 +776,18 @@ ze_result_t CommandListCoreFamily::appendImageCopyFromMemoryExt(z builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); break; case 4u: - builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + if (image->isMimickedImage()) { + builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + } else { + builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + } break; case 8u: - builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + if (image->isMimickedImage()) { + builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + } else { + builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + } break; case 16u: builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); @@ -868,7 +889,6 @@ ze_result_t CommandListCoreFamily::appendImageCopyToMemoryExt(voi auto image = Image::fromHandle(hSrcImage); auto bytesPerPixel = static_cast(image->getImageInfo().surfaceFormat->imageElementSizeInBytes); - Vec3 imgSize = {image->getImageDesc().width, image->getImageDesc().height, image->getImageDesc().depth}; @@ -891,7 +911,18 @@ ze_result_t CommandListCoreFamily::appendImageCopyToMemoryExt(voi } if (destRowPitch == 0) { - destRowPitch = pSrcRegion->width * bytesPerPixel; + if (image->isMimickedImage()) { + uint32_t destBytesPerPixel = bytesPerPixel; + if (bytesPerPixel == 8) { + destBytesPerPixel = 6; + } + if (bytesPerPixel == 4) { + destBytesPerPixel = 3; + } + destRowPitch = pSrcRegion->width * destBytesPerPixel; + } else { + destRowPitch = pSrcRegion->width * bytesPerPixel; + } } if (destSlicePitch == 0) { destSlicePitch = (image->getImageInfo().imgDesc.imageType == NEO::ImageType::image1DArray ? 1 : pSrcRegion->height) * destRowPitch; @@ -916,7 +947,7 @@ ze_result_t CommandListCoreFamily::appendImageCopyToMemoryExt(voi } if (isCopyOnly(false)) { - if ((bytesPerPixel == 3) || (bytesPerPixel == 6)) { + if ((bytesPerPixel == 3) || (bytesPerPixel == 6) || image->isMimickedImage()) { return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; } size_t imgRowPitch = image->getImageInfo().rowPitch; @@ -943,13 +974,21 @@ ze_result_t CommandListCoreFamily::appendImageCopyToMemoryExt(voi builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); break; case 4u: - builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + if (image->isMimickedImage()) { + builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + } else { + builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + } break; case 6u: builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); break; case 8u: - builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + if (image->isMimickedImage()) { + builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + } else { + builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); + } break; case 16u: builtInType = BuiltinTypeHelper::adjustImageBuiltinType(isHeaplessEnabled); diff --git a/level_zero/core/source/image/image.h b/level_zero/core/source/image/image.h index a0dc50d0bf..3b3a1ee9e8 100644 --- a/level_zero/core/source/image/image.h +++ b/level_zero/core/source/image/image.h @@ -52,6 +52,7 @@ struct Image : _ze_image_handle_t { virtual ze_result_t allocateBindlessSlot() = 0; virtual NEO::SurfaceStateInHeapInfo *getBindlessSlot() = 0; virtual ze_result_t getDeviceOffset(uint64_t *deviceOffset) = 0; + virtual bool isMimickedImage() = 0; static ze_result_t getPitchFor2dImage( ze_device_handle_t hDevice, diff --git a/level_zero/core/source/image/image_imp.cpp b/level_zero/core/source/image/image_imp.cpp index 78451eef91..3fd7336c16 100644 --- a/level_zero/core/source/image/image_imp.cpp +++ b/level_zero/core/source/image/image_imp.cpp @@ -128,6 +128,58 @@ NEO::SurfaceStateInHeapInfo *ImageImp::getBindlessSlot() { return bindlessInfo.get(); } +bool getImageDescriptor(const ze_image_desc_t *origImgDesc, ze_image_desc_t *imgDesc) { + bool modified = false; + *imgDesc = *origImgDesc; + if (origImgDesc->pNext) { + const ze_base_desc_t *extendedDesc = reinterpret_cast(origImgDesc->pNext); + if (extendedDesc->stype != ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_WIN32) { + switch (origImgDesc->format.layout) { + default: + break; + case ZE_IMAGE_FORMAT_LAYOUT_16_16_16: + imgDesc->format.layout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16; + imgDesc->format.x = ZE_IMAGE_FORMAT_SWIZZLE_R; + imgDesc->format.y = ZE_IMAGE_FORMAT_SWIZZLE_G; + imgDesc->format.z = ZE_IMAGE_FORMAT_SWIZZLE_B; + imgDesc->format.w = ZE_IMAGE_FORMAT_SWIZZLE_1; + modified = true; + break; + case ZE_IMAGE_FORMAT_LAYOUT_8_8_8: + imgDesc->format.layout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8; + imgDesc->format.x = ZE_IMAGE_FORMAT_SWIZZLE_R; + imgDesc->format.y = ZE_IMAGE_FORMAT_SWIZZLE_G; + imgDesc->format.z = ZE_IMAGE_FORMAT_SWIZZLE_B; + imgDesc->format.w = ZE_IMAGE_FORMAT_SWIZZLE_1; + modified = true; + break; + } + } + } else { + switch (origImgDesc->format.layout) { + default: + break; + case ZE_IMAGE_FORMAT_LAYOUT_16_16_16: + imgDesc->format.layout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16; + imgDesc->format.x = ZE_IMAGE_FORMAT_SWIZZLE_R; + imgDesc->format.y = ZE_IMAGE_FORMAT_SWIZZLE_G; + imgDesc->format.z = ZE_IMAGE_FORMAT_SWIZZLE_B; + imgDesc->format.w = ZE_IMAGE_FORMAT_SWIZZLE_1; + modified = true; + break; + case ZE_IMAGE_FORMAT_LAYOUT_8_8_8: + imgDesc->format.layout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8; + imgDesc->format.x = ZE_IMAGE_FORMAT_SWIZZLE_R; + imgDesc->format.y = ZE_IMAGE_FORMAT_SWIZZLE_G; + imgDesc->format.z = ZE_IMAGE_FORMAT_SWIZZLE_B; + imgDesc->format.w = ZE_IMAGE_FORMAT_SWIZZLE_1; + modified = true; + break; + } + } + return modified; +} + ze_result_t Image::create(uint32_t productFamily, Device *device, const ze_image_desc_t *desc, Image **pImage) { ze_result_t result = ZE_RESULT_SUCCESS; ImageAllocatorFn allocator = nullptr; @@ -138,7 +190,15 @@ ze_result_t Image::create(uint32_t productFamily, Device *device, const ze_image ImageImp *image = nullptr; if (allocator) { image = static_cast((*allocator)()); - result = image->initialize(device, desc); + ze_image_desc_t imgDesc = {}; + bool modified = getImageDescriptor(desc, &imgDesc); + if (modified) { + image->setMimickedImage(true); + result = image->initialize(device, &imgDesc); + } else { + result = image->initialize(device, desc); + } + if (result != ZE_RESULT_SUCCESS) { image->destroy(); image = nullptr; diff --git a/level_zero/core/source/image/image_imp.h b/level_zero/core/source/image/image_imp.h index 4924bf8e1b..2a8c7440be 100644 --- a/level_zero/core/source/image/image_imp.h +++ b/level_zero/core/source/image/image_imp.h @@ -46,6 +46,14 @@ struct ImageImp : public Image, NEO::NonCopyableOrMovableClass { return sourceImageFormatDesc.has_value(); } + bool isMimickedImage() override { + return mimickedImagefor3Ch; + } + + void setMimickedImage(bool value) { + this->mimickedImagefor3Ch = value; + } + ze_result_t allocateBindlessSlot() override; NEO::SurfaceStateInHeapInfo *getBindlessSlot() override; ze_result_t getDeviceOffset(uint64_t *deviceOffset) override; @@ -63,5 +71,6 @@ struct ImageImp : public Image, NEO::NonCopyableOrMovableClass { bool bindlessImage = false; bool imageFromBuffer = false; bool sampledImage = false; + bool mimickedImagefor3Ch = false; }; } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/image/test_image.cpp b/level_zero/core/test/unit_tests/sources/image/test_image.cpp index fb2fb20515..c690b33a3b 100644 --- a/level_zero/core/test/unit_tests/sources/image/test_image.cpp +++ b/level_zero/core/test/unit_tests/sources/image/test_image.cpp @@ -2384,5 +2384,136 @@ HWTEST2_F(ImageCreate, given2DImageFormatWithPixelSizeOf6BytesWhenRowPitchIsQuer EXPECT_EQ(rowPitch, imageHW->imgInfo.rowPitch); } +HWTEST2_F(ImageCreate, givenValidImageDescriptionFor3ChannelWhenImageCreateThenImageIsCreatedCorrectly, MatchAny) { + ze_image_desc_t zeDesc = {}; + zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + zeDesc.arraylevels = 1u; + zeDesc.depth = 1u; + zeDesc.height = 1u; + zeDesc.width = 100u; + zeDesc.miplevels = 1u; + zeDesc.type = ZE_IMAGE_TYPE_2DARRAY; + zeDesc.flags = ZE_IMAGE_FLAG_BIAS_UNCACHED; + + zeDesc.format = {ZE_IMAGE_FORMAT_LAYOUT_8_8_8, + ZE_IMAGE_FORMAT_TYPE_UNORM, + ZE_IMAGE_FORMAT_SWIZZLE_R, + ZE_IMAGE_FORMAT_SWIZZLE_G, + ZE_IMAGE_FORMAT_SWIZZLE_B, + ZE_IMAGE_FORMAT_SWIZZLE_1}; + + Image *imagePtr; + auto result = Image::create(productFamily, device, &zeDesc, &imagePtr); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + std::unique_ptr image(imagePtr); + + ASSERT_NE(image, nullptr); +} + +HWTEST2_F(ImageCreate, givenValidImageDescriptionFor3Channel16BitFloatWhenImageCreateThenImageIsCreatedCorrectly, MatchAny) { + ze_image_desc_t zeDesc = {}; + zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + zeDesc.arraylevels = 1u; + zeDesc.depth = 1u; + zeDesc.height = 1u; + zeDesc.width = 100u; + zeDesc.miplevels = 1u; + zeDesc.type = ZE_IMAGE_TYPE_2DARRAY; + zeDesc.flags = ZE_IMAGE_FLAG_BIAS_UNCACHED; + + zeDesc.format = {ZE_IMAGE_FORMAT_LAYOUT_16_16_16, + ZE_IMAGE_FORMAT_TYPE_FLOAT, + ZE_IMAGE_FORMAT_SWIZZLE_R, + ZE_IMAGE_FORMAT_SWIZZLE_G, + ZE_IMAGE_FORMAT_SWIZZLE_B, + ZE_IMAGE_FORMAT_SWIZZLE_1}; + + Image *imagePtr; + auto result = Image::create(productFamily, device, &zeDesc, &imagePtr); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + std::unique_ptr image(imagePtr); + + ASSERT_NE(image, nullptr); +} + +HWTEST2_F(ImageCreateExternalMemoryTest, givenNTHandleWhenCreatingInteropImageThenSuccessIsReturned, MatchAny) { + ze_external_memory_import_win32_handle_t importNTHandle = {}; + importNTHandle.handle = &imageHandle; + importNTHandle.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_OPAQUE_WIN32; + importNTHandle.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_WIN32; + desc.pNext = &importNTHandle; + + delete driverHandle->svmAllocsManager; + driverHandle->setMemoryManager(execEnv->memoryManager.get()); + driverHandle->svmAllocsManager = new NEO::SVMAllocsManager(execEnv->memoryManager.get(), false); + + Image *imagePtr; + auto result = Image::create(productFamily, device, &desc, &imagePtr); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + std::unique_ptr image(imagePtr); + + ASSERT_NE(image, nullptr); +} + +HWTEST2_F(ImageCreate, givenFDWhenCreatingImageWith3Channel8bitUintThenSuccessIsReturned, MatchAny) { + ze_image_desc_t desc = {}; + + desc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + desc.type = ZE_IMAGE_TYPE_3D; + desc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8; + desc.format.type = ZE_IMAGE_FORMAT_TYPE_UINT; + desc.width = 11; + desc.height = 13; + desc.depth = 17; + + desc.format.x = ZE_IMAGE_FORMAT_SWIZZLE_R; + desc.format.y = ZE_IMAGE_FORMAT_SWIZZLE_G; + desc.format.z = ZE_IMAGE_FORMAT_SWIZZLE_B; + desc.format.w = ZE_IMAGE_FORMAT_SWIZZLE_1; + + ze_external_memory_import_fd_t importFd = {}; + importFd.fd = 1; + importFd.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF; + importFd.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD; + desc.pNext = &importFd; + + Image *imagePtr; + auto result = Image::create(productFamily, device, &desc, &imagePtr); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + std::unique_ptr image(imagePtr); + + ASSERT_NE(image, nullptr); +} + +HWTEST2_F(ImageCreate, givenFDWhenCreatingImageWith3Channel16bitUintThenSuccessIsReturned, MatchAny) { + ze_image_desc_t desc = {}; + + desc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + desc.type = ZE_IMAGE_TYPE_3D; + desc.format.layout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16; + desc.format.type = ZE_IMAGE_FORMAT_TYPE_UINT; + desc.width = 11; + desc.height = 13; + desc.depth = 17; + + desc.format.x = ZE_IMAGE_FORMAT_SWIZZLE_R; + desc.format.y = ZE_IMAGE_FORMAT_SWIZZLE_G; + desc.format.z = ZE_IMAGE_FORMAT_SWIZZLE_B; + desc.format.w = ZE_IMAGE_FORMAT_SWIZZLE_1; + + ze_external_memory_import_fd_t importFd = {}; + importFd.fd = 1; + importFd.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF; + importFd.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD; + desc.pNext = &importFd; + + Image *imagePtr; + auto result = Image::create(productFamily, device, &desc, &imagePtr); + EXPECT_EQ(result, ZE_RESULT_SUCCESS); + std::unique_ptr image(imagePtr); + + ASSERT_NE(image, nullptr); +} + } // namespace ult } // namespace L0 diff --git a/manifests/manifest.yml b/manifests/manifest.yml index ca7e50cbf3..3db712a71d 100644 --- a/manifests/manifest.yml +++ b/manifests/manifest.yml @@ -36,7 +36,7 @@ components: dest_dir: kernels_bin type: git branch: kernels_bin - revision: 3253-3243 + revision: 3253-3252 kmdaf: branch: kmdaf dest_dir: kmdaf diff --git a/opencl/test/unit_test/os_interface/linux/cl_drm_memory_manager_tests.cpp b/opencl/test/unit_test/os_interface/linux/cl_drm_memory_manager_tests.cpp index 783436184e..0d7e8514b9 100644 --- a/opencl/test/unit_test/os_interface/linux/cl_drm_memory_manager_tests.cpp +++ b/opencl/test/unit_test/os_interface/linux/cl_drm_memory_manager_tests.cpp @@ -458,7 +458,7 @@ HWTEST_F(ClDrmMemoryManagerTest, givenDrmMemoryManagerWhenTiledImageIsBeingCreat mock->ioctlExpected.execbuffer2 = 0; // builtins kernels - mock->ioctlExpected.gemUserptr += 5; + mock->ioctlExpected.gemUserptr += 7; // command buffers mock->ioctlExpected.gemUserptr += 2; diff --git a/shared/source/built_ins/kernels/copy_buffer_to_image3d.builtin_kernel b/shared/source/built_ins/kernels/copy_buffer_to_image3d.builtin_kernel index 4bdf725249..4305b5d909 100644 --- a/shared/source/built_ins/kernels/copy_buffer_to_image3d.builtin_kernel +++ b/shared/source/built_ins/kernels/copy_buffer_to_image3d.builtin_kernel @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2021 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -77,6 +77,31 @@ __kernel void CopyBufferToImage3d4Bytes(__global uchar *src, write_imageui(output, dstCoord, c); } +__kernel void CopyBufferToImage3d3To4Bytes(__global uchar *src, + __write_only image3d_t output, + int srcOffset, + int4 dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = (uint4)(0, 0, 0, 1); + + + uint upper2 = 0; + uint upper = *((__global uchar*)(src + LOffset + x * 3 + 2)); + uint lower2 = *((__global uchar*)(src + LOffset + x * 3 + 1)); + uint lower = *((__global uchar*)(src + LOffset + x * 3)); + uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.x = combined; + + write_imageui(output, dstCoord, c); +} + __kernel void CopyBufferToImage3d8Bytes(__global uchar *src, __write_only image3d_t output, int srcOffset, @@ -112,6 +137,37 @@ __kernel void CopyBufferToImage3d8Bytes(__global uchar *src, write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1)); } +__kernel void CopyBufferToImage3d6To8Bytes(__global uchar *src, + __write_only image3d_t output, + int srcOffset, + int4 dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8)); + + + uint upper2 = *((__global uchar*)(src + LOffset + x * 6 + 3)); + uint upper = *((__global uchar*)(src + LOffset + x * 6 + 2)); + uint lower2 = *((__global uchar*)(src + LOffset + x * 6 + 1)); + uint lower = *((__global uchar*)(src + LOffset + x * 6)); + uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.x = combined; + + upper2 = upper = 0; + lower2 = *((__global uchar*)(src + LOffset + x * 6 + 5)); + lower = *((__global uchar*)(src + LOffset + x * 6 + 4)); + combined = ((uint)upper2 << 24) | ((uint)upper << 16) | ((uint)lower2 << 8) | lower; + c.y = combined; + + write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1)); +} + __kernel void CopyBufferToImage3d16Bytes(__global uchar *src, __write_only image3d_t output, int srcOffset, diff --git a/shared/source/built_ins/kernels/copy_buffer_to_image3d_stateless.builtin_kernel b/shared/source/built_ins/kernels/copy_buffer_to_image3d_stateless.builtin_kernel index 5d74570567..e980017b48 100644 --- a/shared/source/built_ins/kernels/copy_buffer_to_image3d_stateless.builtin_kernel +++ b/shared/source/built_ins/kernels/copy_buffer_to_image3d_stateless.builtin_kernel @@ -77,6 +77,30 @@ __kernel void CopyBufferToImage3d4BytesStateless(__global uchar *src, write_imageui(output, dstCoord, c); } +__kernel void CopyBufferToImage3d3To4BytesStateless(__global uchar *src, + __write_only image3d_t output, + ulong srcOffset, + int4 dstOffset, + ulong2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = (uint4)(0, 0, 0, 1); + + uint upper2 = 0; + uint upper = *((__global uchar*)(src + LOffset + x * 3 + 2)); + uint lower2 = *((__global uchar*)(src + LOffset + x * 3 + 1)); + uint lower = *((__global uchar*)(src + LOffset + x * 3)); + uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.x = combined; + + write_imageui(output, dstCoord, c); +} + __kernel void CopyBufferToImage3d8BytesStateless(__global uchar *src, __write_only image3d_t output, ulong srcOffset, @@ -112,6 +136,37 @@ __kernel void CopyBufferToImage3d8BytesStateless(__global uchar *src, write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1)); } +__kernel void CopyBufferToImage3d6To8BytesStateless(__global uchar *src, + __write_only image3d_t output, + ulong srcOffset, + int4 dstOffset, + ulong2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8)); + + + uint upper2 = *((__global uchar*)(src + LOffset + x * 6 + 3)); + uint upper = *((__global uchar*)(src + LOffset + x * 6 + 2)); + uint lower2 = *((__global uchar*)(src + LOffset + x * 6 + 1)); + uint lower = *((__global uchar*)(src + LOffset + x * 6)); + uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.x = combined; + + upper2 = upper = 0; + lower2 = *((__global uchar*)(src + LOffset + x * 6 + 5)); + lower = *((__global uchar*)(src + LOffset + x * 6 + 4)); + combined = ((uint)upper2 << 24) | ((uint)upper << 16) | ((uint)lower2 << 8) | lower; + c.y = combined; + + write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1)); +} + __kernel void CopyBufferToImage3d16BytesStateless(__global uchar *src, __write_only image3d_t output, ulong srcOffset, diff --git a/shared/source/built_ins/kernels/copy_image3d_to_buffer.builtin_kernel b/shared/source/built_ins/kernels/copy_image3d_to_buffer.builtin_kernel index 8115c3abca..f99e067ee5 100644 --- a/shared/source/built_ins/kernels/copy_image3d_to_buffer.builtin_kernel +++ b/shared/source/built_ins/kernels/copy_image3d_to_buffer.builtin_kernel @@ -91,6 +91,25 @@ __kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input, } } +__kernel void CopyImage3dToBuffer4To3Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + *((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff); +} + __kernel void CopyImage3dToBuffer6Bytes(__read_only image3d_t input, __global uchar *dst, int4 srcOffset, @@ -143,6 +162,29 @@ __kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input, } } +__kernel void CopyImage3dToBuffer8To6Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + *((__global uchar*)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff); + + *((__global uchar*)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.y & 0xff); +} + __kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input, __global uchar *dst, int4 srcOffset, diff --git a/shared/source/built_ins/kernels/copy_image3d_to_buffer_stateless.builtin_kernel b/shared/source/built_ins/kernels/copy_image3d_to_buffer_stateless.builtin_kernel index b2d0163814..373ff929c3 100644 --- a/shared/source/built_ins/kernels/copy_image3d_to_buffer_stateless.builtin_kernel +++ b/shared/source/built_ins/kernels/copy_image3d_to_buffer_stateless.builtin_kernel @@ -90,6 +90,26 @@ __kernel void CopyImage3dToBuffer4BytesStateless(__read_only image3d_t input, } } +__kernel void CopyImage3dToBuffer4To3BytesStateless(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + ulong dstOffset, + ulong2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + *((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff); +} + + __kernel void CopyImage3dToBuffer6BytesStateless(__read_only image3d_t input, __global uchar *dst, int4 srcOffset, @@ -143,6 +163,30 @@ __kernel void CopyImage3dToBuffer8BytesStateless(__read_only image3d_t input, } } +__kernel void CopyImage3dToBuffer8To6BytesStateless(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + ulong dstOffset, + ulong2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + *((__global uchar*)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff); + + *((__global uchar*)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.y & 0xff); +} + + __kernel void CopyImage3dToBuffer16BytesStateless(__read_only image3d_t input, __global uchar *dst, int4 srcOffset, diff --git a/shared/test/common/helpers/kernel_binary_helper_hash_value.cpp b/shared/test/common/helpers/kernel_binary_helper_hash_value.cpp index 6888d3823b..8d9412e408 100644 --- a/shared/test/common/helpers/kernel_binary_helper_hash_value.cpp +++ b/shared/test/common/helpers/kernel_binary_helper_hash_value.cpp @@ -8,4 +8,4 @@ #include "shared/test/common/helpers/kernel_binary_helper.h" const std::string KernelBinaryHelper::BUILT_INS("15672580764041246108"); -const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("11207026507167120706_images"); +const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("16235226214855072632_images"); diff --git a/shared/test/common/test_files/11207026507167120706_images.cl b/shared/test/common/test_files/11207026507167120706_images.cl deleted file mode 100644 index 0728b1e0a7..0000000000 --- a/shared/test/common/test_files/11207026507167120706_images.cl +++ /dev/null @@ -1,790 +0,0 @@ -/* - * Copyright (C) 2024 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -__kernel void fullCopy(__global const uint *src, __global uint *dst) { - unsigned int gid = get_global_id(0); - uint4 loaded = vload4(gid, src); - vstore4(loaded, gid, dst); -} - -#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr & 0b11) == 0) - -__kernel void CopyBufferToBufferBytes( - const __global uchar *pSrc, - __global uchar *pDst, - uint srcOffsetInBytes, - uint dstOffsetInBytes, - uint bytesToRead) { - ALIGNED4(pSrc); - ALIGNED4(pDst); - pSrc += (srcOffsetInBytes + get_global_id(0)); - pDst += (dstOffsetInBytes + get_global_id(0)); - pDst[0] = pSrc[0]; -} - -__kernel void CopyBufferToBufferLeftLeftover( - const __global uchar *pSrc, - __global uchar *pDst, - uint srcOffsetInBytes, - uint dstOffsetInBytes) { - ALIGNED4(pSrc); - ALIGNED4(pDst); - unsigned int gid = get_global_id(0); - pDst[gid + dstOffsetInBytes] = pSrc[gid + srcOffsetInBytes]; -} - -__kernel void CopyBufferToBufferMiddle( - const __global uint *pSrc, - __global uint *pDst, - uint srcOffsetInBytes, - uint dstOffsetInBytes) { - ALIGNED4(pSrc); - ALIGNED4(pDst); - unsigned int gid = get_global_id(0); - pDst += dstOffsetInBytes >> 2; - pSrc += srcOffsetInBytes >> 2; - uint4 loaded = vload4(gid, pSrc); - vstore4(loaded, gid, pDst); -} - -__kernel void CopyBufferToBufferMiddleMisaligned( - __global const uint *pSrc, - __global uint *pDst, - uint srcOffsetInBytes, - uint dstOffsetInBytes, - uint misalignmentInBits) { - ALIGNED4(pSrc); - ALIGNED4(pDst); - const size_t gid = get_global_id(0); - pDst += dstOffsetInBytes >> 2; - pSrc += srcOffsetInBytes >> 2; - const uint4 src0 = vload4(gid, pSrc); - const uint4 src1 = vload4(gid + 1, pSrc); - - uint4 result; - result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits)); - result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits)); - result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits)); - result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits)); - vstore4(result, gid, pDst); -} - -__kernel void CopyBufferToBufferRightLeftover( - const __global uchar *pSrc, - __global uchar *pDst, - uint srcOffsetInBytes, - uint dstOffsetInBytes) { - ALIGNED4(pSrc); - ALIGNED4(pDst); - unsigned int gid = get_global_id(0); - pDst[gid + dstOffsetInBytes] = pSrc[gid + srcOffsetInBytes]; -} - -__kernel void copyBufferToBufferBytesSingle(__global uchar *dst, const __global uchar *src) { - ALIGNED4(dst); - ALIGNED4(src); - unsigned int gid = get_global_id(0); - dst[gid] = (uchar)(src[gid]); -} -__kernel void CopyBufferToBufferSideRegion( - __global uchar *pDst, - const __global uchar *pSrc, - unsigned int len, - uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment - uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment -) { - ALIGNED4(pSrc); - ALIGNED4(pDst); - unsigned int gid = get_global_id(0); - __global uchar *pDstWithOffset = (__global uchar *)((__global uchar *)pDst + dstSshOffset); - __global uchar *pSrcWithOffset = (__global uchar *)((__global uchar *)pSrc + srcSshOffset); - if (gid < len) { - pDstWithOffset[gid] = pSrcWithOffset[gid]; - } -} - -__kernel void CopyBufferToBufferMiddleRegion( - __global uint *pDst, - const __global uint *pSrc, - unsigned int elems, - uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment - uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment -) { - ALIGNED4(pSrc); - ALIGNED4(pDst); - unsigned int gid = get_global_id(0); - __global uint *pDstWithOffset = (__global uint *)((__global uchar *)pDst + dstSshOffset); - __global uint *pSrcWithOffset = (__global uint *)((__global uchar *)pSrc + srcSshOffset); - if (gid < elems) { - uint4 loaded = vload4(gid, pSrcWithOffset); - vstore4(loaded, gid, pDstWithOffset); - } -} - -#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr & 0b11) == 0) - -// assumption is local work size = pattern size -__kernel void FillBufferBytes( - __global uchar *pDst, - uint dstOffsetInBytes, - const __global uchar *pPattern) { - ALIGNED4(pDst); - ALIGNED4(pPattern); - uint dstIndex = get_global_id(0) + dstOffsetInBytes; - uint srcIndex = get_local_id(0); - pDst[dstIndex] = pPattern[srcIndex]; -} - -__kernel void FillBufferLeftLeftover( - __global uchar *pDst, - uint dstOffsetInBytes, - const __global uchar *pPattern, - const uint patternSizeInEls) { - ALIGNED4(pDst); - ALIGNED4(pPattern); - uint gid = get_global_id(0); - pDst[gid + dstOffsetInBytes] = pPattern[gid & (patternSizeInEls - 1)]; -} - -__kernel void FillBufferMiddle( - __global uchar *pDst, - uint dstOffsetInBytes, - const __global uint *pPattern, - const uint patternSizeInEls) { - ALIGNED4(pDst); - ALIGNED4(pPattern); - uint gid = get_global_id(0); - ((__global uint *)(pDst + dstOffsetInBytes))[gid] = pPattern[gid & (patternSizeInEls - 1)]; -} - -__kernel void FillBufferRightLeftover( - __global uchar *pDst, - uint dstOffsetInBytes, - const __global uchar *pPattern, - const uint patternSizeInEls) { - ALIGNED4(pDst); - ALIGNED4(pPattern); - uint gid = get_global_id(0); - pDst[gid + dstOffsetInBytes] = pPattern[gid & (patternSizeInEls - 1)]; -} - -__kernel void FillBufferImmediate( - __global uchar *ptr, - ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment - const uint value) { - ALIGNED4(ptr); - uint gid = get_global_id(0); - __global uint4 *dstPtr = (__global uint4 *)(ptr + dstSshOffset); - dstPtr[gid] = value; -} - -__kernel void FillBufferImmediateLeftOver( - __global uchar *ptr, - ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment - const uint value) { - ALIGNED4(ptr); - uint gid = get_global_id(0); - (ptr + dstSshOffset)[gid] = value; -} - -__kernel void FillBufferSSHOffset( - __global uchar *ptr, - uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment - const __global uchar *pPattern, - uint patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment -) { - ALIGNED4(ptr); - ALIGNED4(pPattern); - uint dstIndex = get_global_id(0); - uint srcIndex = get_local_id(0); - __global uchar *pDst = (__global uchar *)ptr + dstSshOffset; - __global uchar *pSrc = (__global uchar *)pPattern + patternSshOffset; - pDst[dstIndex] = pSrc[srcIndex]; -} - -__kernel void CopyBufferRectBytes2d( - __global const char *src, - __global char *dst, - uint4 SrcOrigin, - uint4 DstOrigin, - uint2 SrcPitch, - uint2 DstPitch) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - uint LSrcOffset = x + SrcOrigin.x + ((y + SrcOrigin.y) * SrcPitch.x); - uint LDstOffset = x + DstOrigin.x + ((y + DstOrigin.y) * DstPitch.x); - - *(dst + LDstOffset) = *(src + LSrcOffset); -} - -__kernel void CopyBufferRectBytesMiddle2d( - const __global uint *src, - __global uint *dst, - uint4 SrcOrigin, - uint4 DstOrigin, - uint2 SrcPitch, - uint2 DstPitch) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - uint LSrcOffset = SrcOrigin.x + ((y + SrcOrigin.y) * SrcPitch.x); - uint LDstOffset = DstOrigin.x + ((y + DstOrigin.y) * DstPitch.x); - - src += LSrcOffset >> 2; - dst += LDstOffset >> 2; - - uint4 loaded = vload4(x, src); - vstore4(loaded, x, dst); -} - -__kernel void CopyBufferRectBytes3d( - __global const char *src, - __global char *dst, - uint4 SrcOrigin, - uint4 DstOrigin, - uint2 SrcPitch, - uint2 DstPitch) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - int z = get_global_id(2); - - uint LSrcOffset = x + SrcOrigin.x + ((y + SrcOrigin.y) * SrcPitch.x) + ((z + SrcOrigin.z) * SrcPitch.y); - uint LDstOffset = x + DstOrigin.x + ((y + DstOrigin.y) * DstPitch.x) + ((z + DstOrigin.z) * DstPitch.y); - - *(dst + LDstOffset) = *(src + LSrcOffset); -} - -__kernel void CopyBufferRectBytesMiddle3d( - const __global uint *src, - __global uint *dst, - uint4 SrcOrigin, - uint4 DstOrigin, - uint2 SrcPitch, - uint2 DstPitch) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - int z = get_global_id(2); - - uint LSrcOffset = SrcOrigin.x + ((y + SrcOrigin.y) * SrcPitch.x) + ((z + SrcOrigin.z) * SrcPitch.y); - uint LDstOffset = DstOrigin.x + ((y + DstOrigin.y) * DstPitch.x) + ((z + DstOrigin.z) * DstPitch.y); - - src += LSrcOffset >> 2; - dst += LDstOffset >> 2; - - uint4 loaded = vload4(x, src); - vstore4(loaded, x, dst); -} - -void SetDstData(__global ulong *dst, uint currentOffset, ulong contextStart, ulong globalStart, ulong contextEnd, ulong globalEnd, uint useOnlyGlobalTimestamps) { - dst[currentOffset] = globalStart; - dst[currentOffset + 1] = globalEnd; - if (useOnlyGlobalTimestamps != 0) { - dst[currentOffset + 2] = globalStart; - dst[currentOffset + 3] = globalEnd; - } else { - dst[currentOffset + 2] = contextStart; - dst[currentOffset + 3] = contextEnd; - } -} - -ulong GetTimestampValue(ulong srcPtr, ulong timestampSizeInDw, uint index) { - if (timestampSizeInDw == 1) { - __global uint *src = (__global uint *)srcPtr; - return src[index]; - } else if (timestampSizeInDw == 2) { - __global ulong *src = (__global ulong *)srcPtr; - return src[index]; - } - - return 0; -} - -__kernel void QueryKernelTimestamps(__global ulong *srcEvents, __global ulong *dst, uint useOnlyGlobalTimestamps) { - uint gid = get_global_id(0); - uint currentOffset = gid * 4; - dst[currentOffset] = 0; - dst[currentOffset + 1] = 0; - dst[currentOffset + 2] = 0; - dst[currentOffset + 3] = 0; - - uint eventOffsetData = 3 * gid; - - ulong srcPtr = srcEvents[eventOffsetData]; - ulong packetUsed = srcEvents[eventOffsetData + 1]; - ulong timestampSizeInDw = srcEvents[eventOffsetData + 2]; - - ulong contextStart = GetTimestampValue(srcPtr, timestampSizeInDw, 0); - ulong globalStart = GetTimestampValue(srcPtr, timestampSizeInDw, 1); - ulong contextEnd = GetTimestampValue(srcPtr, timestampSizeInDw, 2); - ulong globalEnd = GetTimestampValue(srcPtr, timestampSizeInDw, 3); - - if (packetUsed > 1) { - for (uint i = 1; i < packetUsed; i++) { - uint timestampsOffsets = 4 * i; - if (contextStart > GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets)) { - contextStart = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets); - } - if (globalStart > GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 1)) { - globalStart = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 1); - } - if (contextEnd < GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 2)) { - contextEnd = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 2); - } - if (globalEnd < GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 3)) { - globalEnd = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 3); - } - } - } - - SetDstData(dst, currentOffset, contextStart, globalStart, contextEnd, globalEnd, useOnlyGlobalTimestamps); -} - -__kernel void QueryKernelTimestampsWithOffsets(__global ulong *srcEvents, __global ulong *dst, __global ulong *offsets, uint useOnlyGlobalTimestamps) { - uint gid = get_global_id(0); - uint currentOffset = offsets[gid] / 8; - dst[currentOffset] = 0; - dst[currentOffset + 1] = 0; - dst[currentOffset + 2] = 0; - dst[currentOffset + 3] = 0; - - uint eventOffsetData = 3 * gid; - - ulong srcPtr = srcEvents[eventOffsetData]; - ulong packetUsed = srcEvents[eventOffsetData + 1]; - ulong timestampSizeInDw = srcEvents[eventOffsetData + 2]; - - ulong contextStart = GetTimestampValue(srcPtr, timestampSizeInDw, 0); - ulong globalStart = GetTimestampValue(srcPtr, timestampSizeInDw, 1); - ulong contextEnd = GetTimestampValue(srcPtr, timestampSizeInDw, 2); - ulong globalEnd = GetTimestampValue(srcPtr, timestampSizeInDw, 3); - - if (packetUsed > 1) { - for (uint i = 1; i < packetUsed; i++) { - uint timestampsOffsets = 4 * i; - if (contextStart > GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets)) { - contextStart = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets); - } - if (globalStart > GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 1)) { - globalStart = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 1); - } - if (contextEnd < GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 2)) { - contextEnd = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 2); - } - if (globalEnd < GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 3)) { - globalEnd = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 3); - } - } - } - - SetDstData(dst, currentOffset, contextStart, globalStart, contextEnd, globalEnd, useOnlyGlobalTimestamps); -} - -__kernel void FillImage1d( - __write_only image1d_t output, - uint4 color, - int4 dstOffset) { - const int x = get_global_id(0); - - const int dstCoord = x + dstOffset.x; - write_imageui(output, dstCoord, color); -} - -__kernel void FillImage2d( - __write_only image2d_t output, - uint4 color, - int4 dstOffset) { - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int2 dstCoord = (int2)(x, y) + (int2)(dstOffset.x, dstOffset.y); - write_imageui(output, dstCoord, color); -} - -#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable - -__kernel void FillImage3d( - __write_only image3d_t output, - uint4 color, - int4 dstOffset) { - const int x = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2); - - const int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; - write_imageui(output, dstCoord, color); -} - -__kernel void CopyImageToImage1d( - __read_only image1d_t input, - __write_only image1d_t output, - int4 srcOffset, - int4 dstOffset) { - const int x = get_global_id(0); - - const int srcCoord = x + srcOffset.x; - const int dstCoord = x + dstOffset.x; - const uint4 c = read_imageui(input, srcCoord); - write_imageui(output, dstCoord, c); -} - -__kernel void CopyImageToImage2d( - __read_only image2d_t input, - __write_only image2d_t output, - int4 srcOffset, - int4 dstOffset) { - const int x = get_global_id(0); - const int y = get_global_id(1); - - const int2 srcCoord = (int2)(x, y) + (int2)(srcOffset.x, srcOffset.y); - const int2 dstCoord = (int2)(x, y) + (int2)(dstOffset.x, dstOffset.y); - const uint4 c = read_imageui(input, srcCoord); - write_imageui(output, dstCoord, c); -} - -#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable - -__kernel void CopyImageToImage3d( - __read_only image3d_t input, - __write_only image3d_t output, - int4 srcOffset, - int4 dstOffset) { - const int x = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2); - - const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; - const int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; - const uint4 c = read_imageui(input, srcCoord); - write_imageui(output, dstCoord, c); -} - -#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable - -__kernel void CopyBufferToImage3dBytes(__global uchar *src, - __write_only image3d_t output, - int srcOffset, - int4 dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; - uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); - - write_imageui(output, dstCoord, (uint4)(*(src + LOffset + x), 0, 0, 1)); -} - -__kernel void CopyBufferToImage3d2Bytes(__global uchar *src, - __write_only image3d_t output, - int srcOffset, - int4 dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; - uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); - - uint4 c = (uint4)(0, 0, 0, 1); - - if ((ulong)(src + srcOffset) & 0x00000001) { - ushort upper = *((__global uchar *)(src + LOffset + x * 2 + 1)); - ushort lower = *((__global uchar *)(src + LOffset + x * 2)); - ushort combined = (upper << 8) | lower; - c.x = (uint)combined; - } else { - c.x = (uint)(*(__global ushort *)(src + LOffset + x * 2)); - } - write_imageui(output, dstCoord, c); -} - -__kernel void CopyBufferToImage3d4Bytes(__global uchar *src, - __write_only image3d_t output, - int srcOffset, - int4 dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; - uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); - - uint4 c = (uint4)(0, 0, 0, 1); - - if ((ulong)(src + srcOffset) & 0x00000003) { - uint upper2 = *((__global uchar *)(src + LOffset + x * 4 + 3)); - uint upper = *((__global uchar *)(src + LOffset + x * 4 + 2)); - uint lower2 = *((__global uchar *)(src + LOffset + x * 4 + 1)); - uint lower = *((__global uchar *)(src + LOffset + x * 4)); - uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; - c.x = combined; - } else { - c.x = (*(__global uint *)(src + LOffset + x * 4)); - } - write_imageui(output, dstCoord, c); -} - -__kernel void CopyBufferToImage3d8Bytes(__global uchar *src, - __write_only image3d_t output, - int srcOffset, - int4 dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; - uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); - - uint2 c = (uint2)(0, 0); //*((__global uint2*)(src + LOffset + x * 8)); - - if ((ulong)(src + srcOffset) & 0x00000007) { - uint upper2 = *((__global uchar *)(src + LOffset + x * 8 + 3)); - uint upper = *((__global uchar *)(src + LOffset + x * 8 + 2)); - uint lower2 = *((__global uchar *)(src + LOffset + x * 8 + 1)); - uint lower = *((__global uchar *)(src + LOffset + x * 8)); - uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; - c.x = combined; - upper2 = *((__global uchar *)(src + LOffset + x * 8 + 7)); - upper = *((__global uchar *)(src + LOffset + x * 8 + 6)); - lower2 = *((__global uchar *)(src + LOffset + x * 8 + 5)); - lower = *((__global uchar *)(src + LOffset + x * 8 + 4)); - combined = ((uint)upper2 << 24) | ((uint)upper << 16) | ((uint)lower2 << 8) | lower; - c.y = combined; - } else { - c = *((__global uint2 *)(src + LOffset + x * 8)); - } - - write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1)); -} - -__kernel void CopyBufferToImage3d16Bytes(__global uchar *src, - __write_only image3d_t output, - int srcOffset, - int4 dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; - uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); - - uint4 c = (uint4)(0, 0, 0, 0); - - if ((ulong)(src + srcOffset) & 0x0000000f) { - uint upper2 = *((__global uchar *)(src + LOffset + x * 16 + 3)); - uint upper = *((__global uchar *)(src + LOffset + x * 16 + 2)); - uint lower2 = *((__global uchar *)(src + LOffset + x * 16 + 1)); - uint lower = *((__global uchar *)(src + LOffset + x * 16)); - uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; - c.x = combined; - upper2 = *((__global uchar *)(src + LOffset + x * 16 + 7)); - upper = *((__global uchar *)(src + LOffset + x * 16 + 6)); - lower2 = *((__global uchar *)(src + LOffset + x * 16 + 5)); - lower = *((__global uchar *)(src + LOffset + x * 16 + 4)); - combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; - c.y = combined; - upper2 = *((__global uchar *)(src + LOffset + x * 16 + 11)); - upper = *((__global uchar *)(src + LOffset + x * 16 + 10)); - lower2 = *((__global uchar *)(src + LOffset + x * 16 + 9)); - lower = *((__global uchar *)(src + LOffset + x * 16 + 8)); - combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; - c.z = combined; - upper2 = *((__global uchar *)(src + LOffset + x * 16 + 15)); - upper = *((__global uchar *)(src + LOffset + x * 16 + 14)); - lower2 = *((__global uchar *)(src + LOffset + x * 16 + 13)); - lower = *((__global uchar *)(src + LOffset + x * 16 + 12)); - combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; - c.w = combined; - } else { - c = *((__global uint4 *)(src + LOffset + x * 16)); - } - - write_imageui(output, dstCoord, c); -} - -__kernel void CopyImage3dToBufferBytes(__read_only image3d_t input, - __global uchar *dst, - int4 srcOffset, - int dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; - uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); - - uint4 c = read_imageui(input, srcCoord); - *(dst + DstOffset + x) = convert_uchar_sat(c.x); -} - -__kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input, - __global uchar *dst, - int4 srcOffset, - int dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; - uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); - - uint4 c = read_imageui(input, srcCoord); - - if ((ulong)(dst + dstOffset) & 0x00000001) { - *((__global uchar *)(dst + DstOffset + x * 2 + 1)) = convert_uchar_sat((c.x >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 2)) = convert_uchar_sat(c.x & 0xff); - } else { - *((__global ushort *)(dst + DstOffset + x * 2)) = convert_ushort_sat(c.x); - } -} - -__kernel void CopyImage3dToBuffer3Bytes(__read_only image3d_t input, - __global uchar *dst, - int4 srcOffset, - int dstOffset, - uint2 Pitch) { - - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; - uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); - - uint4 c = read_imageui(input, srcCoord); - - *((__global uchar *)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat(c.z & 0xff); - *((__global uchar *)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat(c.y & 0xff); - *((__global uchar *)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff); -} - -__kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input, - __global uchar *dst, - int4 srcOffset, - int dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; - uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); - - uint4 c = read_imageui(input, srcCoord); - - if ((ulong)(dst + dstOffset) & 0x00000003) { - *((__global uchar *)(dst + DstOffset + x * 4 + 3)) = convert_uchar_sat((c.x >> 24) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 4 + 2)) = convert_uchar_sat((c.x >> 16) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 4 + 1)) = convert_uchar_sat((c.x >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 4)) = convert_uchar_sat(c.x & 0xff); - } else { - *((__global uint *)(dst + DstOffset + x * 4)) = c.x; - } -} - -__kernel void CopyImage3dToBuffer6Bytes(__read_only image3d_t input, - __global uchar *dst, - int4 srcOffset, - int dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; - uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); - - uint4 c = read_imageui(input, srcCoord); - - *((__global uchar *)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff); - *((__global uchar *)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.y >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat(c.y & 0xff); - *((__global uchar *)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.z >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.z & 0xff); -} - -__kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input, - __global uchar *dst, - int4 srcOffset, - int dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; - uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); - - uint4 c = read_imageui(input, srcCoord); - - if ((ulong)(dst + dstOffset) & 0x00000007) { - *((__global uchar *)(dst + DstOffset + x * 8 + 3)) = convert_uchar_sat((c.x >> 24) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 8 + 2)) = convert_uchar_sat((c.x >> 16) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 8 + 1)) = convert_uchar_sat((c.x >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 8)) = convert_uchar_sat(c.x & 0xff); - *((__global uchar *)(dst + DstOffset + x * 8 + 7)) = convert_uchar_sat((c.y >> 24) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 8 + 6)) = convert_uchar_sat((c.y >> 16) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 8 + 5)) = convert_uchar_sat((c.y >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 8 + 4)) = convert_uchar_sat(c.y & 0xff); - } else { - uint2 d = (uint2)(c.x, c.y); - *((__global uint2 *)(dst + DstOffset + x * 8)) = d; - } -} - -__kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input, - __global uchar *dst, - int4 srcOffset, - int dstOffset, - uint2 Pitch) { - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - - const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; - uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); - - const uint4 c = read_imageui(input, srcCoord); - - if ((ulong)(dst + dstOffset) & 0x0000000f) { - *((__global uchar *)(dst + DstOffset + x * 16 + 3)) = convert_uchar_sat((c.x >> 24) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 2)) = convert_uchar_sat((c.x >> 16) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 1)) = convert_uchar_sat((c.x >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16)) = convert_uchar_sat(c.x & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 7)) = convert_uchar_sat((c.y >> 24) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 6)) = convert_uchar_sat((c.y >> 16) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 5)) = convert_uchar_sat((c.y >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 4)) = convert_uchar_sat(c.y & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 11)) = convert_uchar_sat((c.z >> 24) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 10)) = convert_uchar_sat((c.z >> 16) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 9)) = convert_uchar_sat((c.z >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 8)) = convert_uchar_sat(c.z & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 15)) = convert_uchar_sat((c.w >> 24) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 14)) = convert_uchar_sat((c.w >> 16) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 13)) = convert_uchar_sat((c.w >> 8) & 0xff); - *((__global uchar *)(dst + DstOffset + x * 16 + 12)) = convert_uchar_sat(c.w & 0xff); - } else { - *(__global uint4 *)(dst + DstOffset + x * 16) = c; - } -} \ No newline at end of file diff --git a/shared/test/common/test_files/16235226214855072632_images.cl b/shared/test/common/test_files/16235226214855072632_images.cl new file mode 100644 index 0000000000..5ff048edfa --- /dev/null +++ b/shared/test/common/test_files/16235226214855072632_images.cl @@ -0,0 +1,914 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +__kernel void fullCopy(__global const uint* src, __global uint* dst) { + unsigned int gid = get_global_id(0); + uint4 loaded = vload4(gid, src); + vstore4(loaded, gid, dst); +} + +#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0) + +__kernel void CopyBufferToBufferBytes( + const __global uchar* pSrc, + __global uchar* pDst, + uint srcOffsetInBytes, + uint dstOffsetInBytes, + uint bytesToRead ) +{ + ALIGNED4(pSrc); + ALIGNED4(pDst); + pSrc += ( srcOffsetInBytes + get_global_id(0) ); + pDst += ( dstOffsetInBytes + get_global_id(0) ); + pDst[ 0 ] = pSrc[ 0 ]; +} + +__kernel void CopyBufferToBufferLeftLeftover( + const __global uchar* pSrc, + __global uchar* pDst, + uint srcOffsetInBytes, + uint dstOffsetInBytes) +{ + ALIGNED4(pSrc); + ALIGNED4(pDst); + unsigned int gid = get_global_id(0); + pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ]; +} + +__kernel void CopyBufferToBufferMiddle( + const __global uint* pSrc, + __global uint* pDst, + uint srcOffsetInBytes, + uint dstOffsetInBytes) +{ + ALIGNED4(pSrc); + ALIGNED4(pDst); + unsigned int gid = get_global_id(0); + pDst += dstOffsetInBytes >> 2; + pSrc += srcOffsetInBytes >> 2; + uint4 loaded = vload4(gid, pSrc); + vstore4(loaded, gid, pDst); +} + +__kernel void CopyBufferToBufferMiddleMisaligned( + __global const uint* pSrc, + __global uint* pDst, + uint srcOffsetInBytes, + uint dstOffsetInBytes, + uint misalignmentInBits) +{ + ALIGNED4(pSrc); + ALIGNED4(pDst); + const size_t gid = get_global_id(0); + pDst += dstOffsetInBytes >> 2; + pSrc += srcOffsetInBytes >> 2; + const uint4 src0 = vload4(gid, pSrc); + const uint4 src1 = vload4(gid + 1, pSrc); + + uint4 result; + result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits)); + result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits)); + result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits)); + result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits)); + vstore4(result, gid, pDst); +} + +__kernel void CopyBufferToBufferRightLeftover( + const __global uchar* pSrc, + __global uchar* pDst, + uint srcOffsetInBytes, + uint dstOffsetInBytes) +{ + ALIGNED4(pSrc); + ALIGNED4(pDst); + unsigned int gid = get_global_id(0); + pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ]; +} + +__kernel void copyBufferToBufferBytesSingle(__global uchar *dst, const __global uchar *src) { + ALIGNED4(dst); + ALIGNED4(src); + unsigned int gid = get_global_id(0); + dst[gid] = (uchar)(src[gid]); +} +__kernel void CopyBufferToBufferSideRegion( + __global uchar* pDst, + const __global uchar* pSrc, + unsigned int len, + uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment + ) +{ + ALIGNED4(pSrc); + ALIGNED4(pDst); + unsigned int gid = get_global_id(0); + __global uchar* pDstWithOffset = (__global uchar*)((__global uchar*)pDst + dstSshOffset); + __global uchar* pSrcWithOffset = (__global uchar*)((__global uchar*)pSrc + srcSshOffset); + if (gid < len) { + pDstWithOffset[ gid ] = pSrcWithOffset[ gid ]; + } +} + +__kernel void CopyBufferToBufferMiddleRegion( + __global uint* pDst, + const __global uint* pSrc, + unsigned int elems, + uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment + ) +{ + ALIGNED4(pSrc); + ALIGNED4(pDst); + unsigned int gid = get_global_id(0); + __global uint* pDstWithOffset = (__global uint*)((__global uchar*)pDst + dstSshOffset); + __global uint* pSrcWithOffset = (__global uint*)((__global uchar*)pSrc + srcSshOffset); + if (gid < elems) { + uint4 loaded = vload4(gid, pSrcWithOffset); + vstore4(loaded, gid, pDstWithOffset); + } +} + +#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0) + +// assumption is local work size = pattern size +__kernel void FillBufferBytes( + __global uchar* pDst, + uint dstOffsetInBytes, + const __global uchar* pPattern ) +{ + ALIGNED4(pDst); + ALIGNED4(pPattern); + uint dstIndex = get_global_id(0) + dstOffsetInBytes; + uint srcIndex = get_local_id(0); + pDst[dstIndex] = pPattern[srcIndex]; +} + +__kernel void FillBufferLeftLeftover( + __global uchar* pDst, + uint dstOffsetInBytes, + const __global uchar* pPattern, + const uint patternSizeInEls ) +{ + ALIGNED4(pDst); + ALIGNED4(pPattern); + uint gid = get_global_id(0); + pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ]; +} + +__kernel void FillBufferMiddle( + __global uchar* pDst, + uint dstOffsetInBytes, + const __global uint* pPattern, + const uint patternSizeInEls ) +{ + ALIGNED4(pDst); + ALIGNED4(pPattern); + uint gid = get_global_id(0); + ((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[ gid & (patternSizeInEls - 1) ]; +} + +__kernel void FillBufferRightLeftover( + __global uchar* pDst, + uint dstOffsetInBytes, + const __global uchar* pPattern, + const uint patternSizeInEls ) +{ + ALIGNED4(pDst); + ALIGNED4(pPattern); + uint gid = get_global_id(0); + pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ]; +} + +__kernel void FillBufferImmediate( + __global uchar* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const uint value) +{ + ALIGNED4(ptr); + uint gid = get_global_id(0); + __global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset); + dstPtr[gid] = value; +} + +__kernel void FillBufferImmediateLeftOver( + __global uchar* ptr, + ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const uint value) +{ + ALIGNED4(ptr); + uint gid = get_global_id(0); + (ptr + dstSshOffset)[gid] = value; +} + +__kernel void FillBufferSSHOffset( + __global uchar* ptr, + uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment + const __global uchar* pPattern, + uint patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment +) +{ + ALIGNED4(ptr); + ALIGNED4(pPattern); + uint dstIndex = get_global_id(0); + uint srcIndex = get_local_id(0); + __global uchar* pDst = (__global uchar*)ptr + dstSshOffset; + __global uchar* pSrc = (__global uchar*)pPattern + patternSshOffset; + pDst[dstIndex] = pSrc[srcIndex]; +} + + +__kernel void CopyBufferRectBytes2d( + __global const char* src, + __global char* dst, + uint4 SrcOrigin, + uint4 DstOrigin, + uint2 SrcPitch, + uint2 DstPitch ) + +{ + int x = get_global_id(0); + int y = get_global_id(1); + + uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ); + uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ); + + *( dst + LDstOffset ) = *( src + LSrcOffset ); + +} + +__kernel void CopyBufferRectBytesMiddle2d( + const __global uint* src, + __global uint* dst, + uint4 SrcOrigin, + uint4 DstOrigin, + uint2 SrcPitch, + uint2 DstPitch ) + +{ + int x = get_global_id(0); + int y = get_global_id(1); + + uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ); + uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ); + + src += LSrcOffset >> 2; + dst += LDstOffset >> 2; + + uint4 loaded = vload4(x,src); + vstore4(loaded,x,dst); +} + +__kernel void CopyBufferRectBytes3d( + __global const char* src, + __global char* dst, + uint4 SrcOrigin, + uint4 DstOrigin, + uint2 SrcPitch, + uint2 DstPitch ) + +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + + uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y ); + uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y ); + + *( dst + LDstOffset ) = *( src + LSrcOffset ); + +} + +__kernel void CopyBufferRectBytesMiddle3d( + const __global uint* src, + __global uint* dst, + uint4 SrcOrigin, + uint4 DstOrigin, + uint2 SrcPitch, + uint2 DstPitch ) + +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + + uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y ); + uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y ); + + src += LSrcOffset >> 2; + dst += LDstOffset >> 2; + + uint4 loaded = vload4(x,src); + vstore4(loaded,x,dst); +} + +void SetDstData(__global ulong* dst, uint currentOffset, ulong contextStart, ulong globalStart, ulong contextEnd, ulong globalEnd, uint useOnlyGlobalTimestamps) { + dst[currentOffset] = globalStart; + dst[currentOffset + 1] = globalEnd; + if (useOnlyGlobalTimestamps != 0) { + dst[currentOffset + 2] = globalStart; + dst[currentOffset + 3] = globalEnd; + } else { + dst[currentOffset + 2] = contextStart; + dst[currentOffset + 3] = contextEnd; + } +} + +ulong GetTimestampValue(ulong srcPtr, ulong timestampSizeInDw, uint index) { + if(timestampSizeInDw == 1) { + __global uint *src = (__global uint *) srcPtr; + return src[index]; + } else if(timestampSizeInDw == 2) { + __global ulong *src = (__global ulong *) srcPtr; + return src[index]; + } + + return 0; +} + +__kernel void QueryKernelTimestamps(__global ulong* srcEvents, __global ulong* dst, uint useOnlyGlobalTimestamps) { + uint gid = get_global_id(0); + uint currentOffset = gid * 4; + dst[currentOffset] = 0; + dst[currentOffset + 1] = 0; + dst[currentOffset + 2] = 0; + dst[currentOffset + 3] = 0; + + uint eventOffsetData = 3 * gid; + + ulong srcPtr = srcEvents[eventOffsetData]; + ulong packetUsed = srcEvents[eventOffsetData + 1]; + ulong timestampSizeInDw = srcEvents[eventOffsetData + 2]; + + ulong contextStart = GetTimestampValue(srcPtr, timestampSizeInDw, 0); + ulong globalStart = GetTimestampValue(srcPtr, timestampSizeInDw, 1); + ulong contextEnd = GetTimestampValue(srcPtr, timestampSizeInDw, 2); + ulong globalEnd = GetTimestampValue(srcPtr, timestampSizeInDw, 3); + + if(packetUsed > 1) { + for(uint i = 1; i < packetUsed; i++) { + uint timestampsOffsets = 4 * i; + if(contextStart > GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets)) { + contextStart = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets); + } + if(globalStart > GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 1)) { + globalStart = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 1); + } + if(contextEnd < GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 2)) { + contextEnd = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 2); + } + if(globalEnd < GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 3)) { + globalEnd = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 3); + } + } + } + + SetDstData(dst, currentOffset, contextStart, globalStart, contextEnd, globalEnd, useOnlyGlobalTimestamps); +} + +__kernel void QueryKernelTimestampsWithOffsets(__global ulong* srcEvents, __global ulong* dst, __global ulong *offsets, uint useOnlyGlobalTimestamps) { + uint gid = get_global_id(0); + uint currentOffset = offsets[gid] / 8; + dst[currentOffset] = 0; + dst[currentOffset + 1] = 0; + dst[currentOffset + 2] = 0; + dst[currentOffset + 3] = 0; + + uint eventOffsetData = 3 * gid; + + ulong srcPtr = srcEvents[eventOffsetData]; + ulong packetUsed = srcEvents[eventOffsetData + 1]; + ulong timestampSizeInDw = srcEvents[eventOffsetData + 2]; + + ulong contextStart = GetTimestampValue(srcPtr, timestampSizeInDw, 0); + ulong globalStart = GetTimestampValue(srcPtr, timestampSizeInDw, 1); + ulong contextEnd = GetTimestampValue(srcPtr, timestampSizeInDw, 2); + ulong globalEnd = GetTimestampValue(srcPtr, timestampSizeInDw, 3); + + if(packetUsed > 1) { + for(uint i = 1; i < packetUsed; i++) { + uint timestampsOffsets = 4 * i; + if(contextStart > GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets)) { + contextStart = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets); + } + if(globalStart > GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 1)) { + globalStart = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 1); + } + if(contextEnd < GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 2)) { + contextEnd = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 2); + } + if(globalEnd < GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 3)) { + globalEnd = GetTimestampValue(srcPtr, timestampSizeInDw, timestampsOffsets + 3); + } + } + } + + SetDstData(dst, currentOffset, contextStart, globalStart, contextEnd, globalEnd, useOnlyGlobalTimestamps); +} + +__kernel void FillImage1d( + __write_only image1d_t output, + uint4 color, + int4 dstOffset) { + const int x = get_global_id(0); + + const int dstCoord = x + dstOffset.x; + write_imageui(output, dstCoord, color); +} + +__kernel void FillImage2d( + __write_only image2d_t output, + uint4 color, + int4 dstOffset) { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int2 dstCoord = (int2)(x, y) + (int2)(dstOffset.x, dstOffset.y); + write_imageui(output, dstCoord, color); +} + +#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable + +__kernel void FillImage3d( + __write_only image3d_t output, + uint4 color, + int4 dstOffset) { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + const int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + write_imageui(output, dstCoord, color); +} + +__kernel void CopyImageToImage1d( + __read_only image1d_t input, + __write_only image1d_t output, + int4 srcOffset, + int4 dstOffset) { + const int x = get_global_id(0); + + const int srcCoord = x + srcOffset.x; + const int dstCoord = x + dstOffset.x; + const uint4 c = read_imageui(input, srcCoord); + write_imageui(output, dstCoord, c); +} + +__kernel void CopyImageToImage2d( + __read_only image2d_t input, + __write_only image2d_t output, + int4 srcOffset, + int4 dstOffset) { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int2 srcCoord = (int2)(x, y) + (int2)(srcOffset.x, srcOffset.y); + const int2 dstCoord = (int2)(x, y) + (int2)(dstOffset.x, dstOffset.y); + const uint4 c = read_imageui(input, srcCoord); + write_imageui(output, dstCoord, c); +} + +#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable + +__kernel void CopyImageToImage3d( + __read_only image3d_t input, + __write_only image3d_t output, + int4 srcOffset, + int4 dstOffset) { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + const int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + const uint4 c = read_imageui(input, srcCoord); + write_imageui(output, dstCoord, c); +} + +#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable + +__kernel void CopyBufferToImage3dBytes(__global uchar *src, + __write_only image3d_t output, + int srcOffset, + int4 dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + write_imageui(output, dstCoord, (uint4)(*(src + LOffset + x), 0, 0, 1)); +} + +__kernel void CopyBufferToImage3d2Bytes(__global uchar *src, + __write_only image3d_t output, + int srcOffset, + int4 dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = (uint4)(0, 0, 0, 1); + + if(( ulong )(src + srcOffset) & 0x00000001){ + ushort upper = *((__global uchar*)(src + LOffset + x * 2 + 1)); + ushort lower = *((__global uchar*)(src + LOffset + x * 2)); + ushort combined = (upper << 8) | lower; + c.x = (uint)combined; + } + else{ + c.x = (uint)(*(__global ushort*)(src + LOffset + x * 2)); + } + write_imageui(output, dstCoord, c); +} + +__kernel void CopyBufferToImage3d4Bytes(__global uchar *src, + __write_only image3d_t output, + int srcOffset, + int4 dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = (uint4)(0, 0, 0, 1); + + if(( ulong )(src + srcOffset) & 0x00000003){ + uint upper2 = *((__global uchar*)(src + LOffset + x * 4 + 3)); + uint upper = *((__global uchar*)(src + LOffset + x * 4 + 2)); + uint lower2 = *((__global uchar*)(src + LOffset + x * 4 + 1)); + uint lower = *((__global uchar*)(src + LOffset + x * 4)); + uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.x = combined; + } + else{ + c.x = (*(__global uint*)(src + LOffset + x * 4)); + } + write_imageui(output, dstCoord, c); +} + +__kernel void CopyBufferToImage3d3To4Bytes(__global uchar *src, + __write_only image3d_t output, + int srcOffset, + int4 dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = (uint4)(0, 0, 0, 1); + + + uint upper2 = 0; + uint upper = *((__global uchar*)(src + LOffset + x * 3 + 2)); + uint lower2 = *((__global uchar*)(src + LOffset + x * 3 + 1)); + uint lower = *((__global uchar*)(src + LOffset + x * 3)); + uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.x = combined; + + write_imageui(output, dstCoord, c); +} + +__kernel void CopyBufferToImage3d8Bytes(__global uchar *src, + __write_only image3d_t output, + int srcOffset, + int4 dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8)); + + if(( ulong )(src + srcOffset) & 0x00000007){ + uint upper2 = *((__global uchar*)(src + LOffset + x * 8 + 3)); + uint upper = *((__global uchar*)(src + LOffset + x * 8 + 2)); + uint lower2 = *((__global uchar*)(src + LOffset + x * 8 + 1)); + uint lower = *((__global uchar*)(src + LOffset + x * 8)); + uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.x = combined; + upper2 = *((__global uchar*)(src + LOffset + x * 8 + 7)); + upper = *((__global uchar*)(src + LOffset + x * 8 + 6)); + lower2 = *((__global uchar*)(src + LOffset + x * 8 + 5)); + lower = *((__global uchar*)(src + LOffset + x * 8 + 4)); + combined = ((uint)upper2 << 24) | ((uint)upper << 16) | ((uint)lower2 << 8) | lower; + c.y = combined; + } + else{ + c = *((__global uint2*)(src + LOffset + x * 8)); + } + + write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1)); +} + +__kernel void CopyBufferToImage3d6To8Bytes(__global uchar *src, + __write_only image3d_t output, + int srcOffset, + int4 dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8)); + + + uint upper2 = *((__global uchar*)(src + LOffset + x * 6 + 3)); + uint upper = *((__global uchar*)(src + LOffset + x * 6 + 2)); + uint lower2 = *((__global uchar*)(src + LOffset + x * 6 + 1)); + uint lower = *((__global uchar*)(src + LOffset + x * 6)); + uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.x = combined; + + upper2 = upper = 0; + lower2 = *((__global uchar*)(src + LOffset + x * 6 + 5)); + lower = *((__global uchar*)(src + LOffset + x * 6 + 4)); + combined = ((uint)upper2 << 24) | ((uint)upper << 16) | ((uint)lower2 << 8) | lower; + c.y = combined; + + write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1)); +} + +__kernel void CopyBufferToImage3d16Bytes(__global uchar *src, + __write_only image3d_t output, + int srcOffset, + int4 dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + int4 dstCoord = (int4)(x, y, z, 0) + dstOffset; + uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = (uint4)(0, 0, 0, 0); + + if(( ulong )(src + srcOffset) & 0x0000000f){ + uint upper2 = *((__global uchar*)(src + LOffset + x * 16 + 3)); + uint upper = *((__global uchar*)(src + LOffset + x * 16 + 2)); + uint lower2 = *((__global uchar*)(src + LOffset + x * 16 + 1)); + uint lower = *((__global uchar*)(src + LOffset + x * 16)); + uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.x = combined; + upper2 = *((__global uchar*)(src + LOffset + x * 16 + 7)); + upper = *((__global uchar*)(src + LOffset + x * 16 + 6)); + lower2 = *((__global uchar*)(src + LOffset + x * 16 + 5)); + lower = *((__global uchar*)(src + LOffset + x * 16 + 4)); + combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.y = combined; + upper2 = *((__global uchar*)(src + LOffset + x * 16 + 11)); + upper = *((__global uchar*)(src + LOffset + x * 16 + 10)); + lower2 = *((__global uchar*)(src + LOffset + x * 16 + 9)); + lower = *((__global uchar*)(src + LOffset + x * 16 + 8)); + combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.z = combined; + upper2 = *((__global uchar*)(src + LOffset + x * 16 + 15)); + upper = *((__global uchar*)(src + LOffset + x * 16 + 14)); + lower2 = *((__global uchar*)(src + LOffset + x * 16 + 13)); + lower = *((__global uchar*)(src + LOffset + x * 16 + 12)); + combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower; + c.w = combined; + } + else{ + c = *((__global uint4 *)(src + LOffset + x * 16)); + } + + write_imageui(output, dstCoord, c); +} + +__kernel void CopyImage3dToBufferBytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + *(dst + DstOffset + x) = convert_uchar_sat(c.x); +} + +__kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + if(( ulong )(dst + dstOffset) & 0x00000001){ + *((__global uchar*)(dst + DstOffset + x * 2 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 2)) = convert_uchar_sat(c.x & 0xff); + } + else{ + *((__global ushort*)(dst + DstOffset + x * 2)) = convert_ushort_sat(c.x); + } +} + +__kernel void CopyImage3dToBuffer3Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + *((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat(c.z & 0xff); + *((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat(c.y & 0xff); + *((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff); +} + + +__kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + if(( ulong )(dst + dstOffset) & 0x00000003){ + *((__global uchar*)(dst + DstOffset + x * 4 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 4 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 4 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 4)) = convert_uchar_sat(c.x & 0xff); + } + else{ + *((__global uint*)(dst + DstOffset + x * 4)) = c.x; + } +} + +__kernel void CopyImage3dToBuffer4To3Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + *((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff); +} + +__kernel void CopyImage3dToBuffer6Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + *((__global uchar*)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.y >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat(c.y & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.z >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.z & 0xff); +} + +__kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + if(( ulong )(dst + dstOffset) & 0x00000007){ + *((__global uchar*)(dst + DstOffset + x * 8 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 8 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 8 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 8)) = convert_uchar_sat(c.x & 0xff); + *((__global uchar*)(dst + DstOffset + x * 8 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 8 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 8 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 8 + 4)) = convert_uchar_sat(c.y & 0xff); + } + else{ + uint2 d = (uint2)(c.x,c.y); + *((__global uint2*)(dst + DstOffset + x * 8)) = d; + } +} + +__kernel void CopyImage3dToBuffer8To6Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + uint4 c = read_imageui(input, srcCoord); + + *((__global uchar*)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff); + + *((__global uchar*)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.y & 0xff); +} + +__kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input, + __global uchar *dst, + int4 srcOffset, + int dstOffset, + uint2 Pitch) { + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + + const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset; + uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y); + + const uint4 c = read_imageui(input, srcCoord); + + if(( ulong )(dst + dstOffset) & 0x0000000f){ + *((__global uchar*)(dst + DstOffset + x * 16 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16)) = convert_uchar_sat(c.x & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 4)) = convert_uchar_sat(c.y & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 11)) = convert_uchar_sat((c.z >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 10)) = convert_uchar_sat((c.z >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 9)) = convert_uchar_sat((c.z >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 8)) = convert_uchar_sat(c.z & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 15)) = convert_uchar_sat((c.w >> 24 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 14)) = convert_uchar_sat((c.w >> 16 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 13)) = convert_uchar_sat((c.w >> 8 ) & 0xff); + *((__global uchar*)(dst + DstOffset + x * 16 + 12)) = convert_uchar_sat(c.w & 0xff); + } + else{ + *(__global uint4*)(dst + DstOffset + x * 16) = c; + } +} \ No newline at end of file diff --git a/shared/test/common/test_files/11207026507167120706_images_options.txt b/shared/test/common/test_files/16235226214855072632_images_options.txt similarity index 100% rename from shared/test/common/test_files/11207026507167120706_images_options.txt rename to shared/test/common/test_files/16235226214855072632_images_options.txt