refactor: unify stateless and not stateless builtins kernels

Related-to: NEO-16156

Signed-off-by: Damian Tomczak <damian.tomczak@intel.com>
This commit is contained in:
Damian Tomczak
2025-11-18 12:10:32 +00:00
committed by Compute-Runtime-Automation
parent 8882b1e54a
commit 4eb37124cb
36 changed files with 1250 additions and 2457 deletions

View File

@@ -37,11 +37,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::copyBufferToBuffer;
break;
case Builtin::copyBufferBytesStateless:
kernelName = "copyBufferToBufferBytesSingleStateless";
kernelName = "copyBufferToBufferBytesSingle";
builtin = NEO::EBuiltInOps::copyBufferToBufferStateless;
break;
case Builtin::copyBufferBytesStatelessHeapless:
kernelName = "copyBufferToBufferBytesSingleStateless";
kernelName = "copyBufferToBufferBytesSingle";
builtin = NEO::EBuiltInOps::copyBufferToBufferStatelessHeapless;
break;
case Builtin::copyBufferRectBytes2d:
@@ -49,11 +49,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::copyBufferRect;
break;
case Builtin::copyBufferRectBytes2dStateless:
kernelName = "CopyBufferRectBytes2dStateless";
kernelName = "CopyBufferRectBytes2d";
builtin = NEO::EBuiltInOps::copyBufferRectStateless;
break;
case Builtin::copyBufferRectBytes2dStatelessHeapless:
kernelName = "CopyBufferRectBytes2dStateless";
kernelName = "CopyBufferRectBytes2d";
builtin = NEO::EBuiltInOps::copyBufferRectStatelessHeapless;
break;
case Builtin::copyBufferRectBytes3d:
@@ -61,11 +61,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::copyBufferRect;
break;
case Builtin::copyBufferRectBytes3dStateless:
kernelName = "CopyBufferRectBytes3dStateless";
kernelName = "CopyBufferRectBytes3d";
builtin = NEO::EBuiltInOps::copyBufferRectStateless;
break;
case Builtin::copyBufferRectBytes3dStatelessHeapless:
kernelName = "CopyBufferRectBytes3dStateless";
kernelName = "CopyBufferRectBytes3d";
builtin = NEO::EBuiltInOps::copyBufferRectStatelessHeapless;
break;
case Builtin::copyBufferToBufferMiddle:
@@ -73,11 +73,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::copyBufferToBuffer;
break;
case Builtin::copyBufferToBufferMiddleStateless:
kernelName = "CopyBufferToBufferMiddleRegionStateless";
kernelName = "CopyBufferToBufferMiddleRegion";
builtin = NEO::EBuiltInOps::copyBufferToBufferStateless;
break;
case Builtin::copyBufferToBufferMiddleStatelessHeapless:
kernelName = "CopyBufferToBufferMiddleRegionStateless";
kernelName = "CopyBufferToBufferMiddleRegion";
builtin = NEO::EBuiltInOps::copyBufferToBufferStatelessHeapless;
break;
case Builtin::copyBufferToBufferSide:
@@ -85,11 +85,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::copyBufferToBuffer;
break;
case Builtin::copyBufferToBufferSideStateless:
kernelName = "CopyBufferToBufferSideRegionStateless";
kernelName = "CopyBufferToBufferSideRegion";
builtin = NEO::EBuiltInOps::copyBufferToBufferStateless;
break;
case Builtin::copyBufferToBufferSideStatelessHeapless:
kernelName = "CopyBufferToBufferSideRegionStateless";
kernelName = "CopyBufferToBufferSideRegion";
builtin = NEO::EBuiltInOps::copyBufferToBufferStatelessHeapless;
break;
case Builtin::fillBufferImmediate:
@@ -97,11 +97,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::fillBuffer;
break;
case Builtin::fillBufferImmediateStateless:
kernelName = "FillBufferImmediateStateless";
kernelName = "FillBufferImmediate";
builtin = NEO::EBuiltInOps::fillBufferStateless;
break;
case Builtin::fillBufferImmediateStatelessHeapless:
kernelName = "FillBufferImmediateStateless";
kernelName = "FillBufferImmediate";
builtin = NEO::EBuiltInOps::fillBufferStatelessHeapless;
break;
case Builtin::fillBufferImmediateLeftOver:
@@ -109,11 +109,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::fillBuffer;
break;
case Builtin::fillBufferImmediateLeftOverStateless:
kernelName = "FillBufferImmediateLeftOverStateless";
kernelName = "FillBufferImmediateLeftOver";
builtin = NEO::EBuiltInOps::fillBufferStateless;
break;
case Builtin::fillBufferImmediateLeftOverStatelessHeapless:
kernelName = "FillBufferImmediateLeftOverStateless";
kernelName = "FillBufferImmediateLeftOver";
builtin = NEO::EBuiltInOps::fillBufferStatelessHeapless;
break;
case Builtin::fillBufferSSHOffset:
@@ -121,11 +121,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::fillBuffer;
break;
case Builtin::fillBufferSSHOffsetStateless:
kernelName = "FillBufferSSHOffsetStateless";
kernelName = "FillBufferSSHOffset";
builtin = NEO::EBuiltInOps::fillBufferStateless;
break;
case Builtin::fillBufferSSHOffsetStatelessHeapless:
kernelName = "FillBufferSSHOffsetStateless";
kernelName = "FillBufferSSHOffset";
builtin = NEO::EBuiltInOps::fillBufferStatelessHeapless;
break;
case Builtin::fillBufferMiddle:
@@ -133,11 +133,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::fillBuffer;
break;
case Builtin::fillBufferMiddleStateless:
kernelName = "FillBufferMiddleStateless";
kernelName = "FillBufferMiddle";
builtin = NEO::EBuiltInOps::fillBufferStateless;
break;
case Builtin::fillBufferMiddleStatelessHeapless:
kernelName = "FillBufferMiddleStateless";
kernelName = "FillBufferMiddle";
builtin = NEO::EBuiltInOps::fillBufferStatelessHeapless;
break;
case Builtin::fillBufferRightLeftover:
@@ -145,11 +145,11 @@ void BuiltinFunctionsLibImpl::initBuiltinKernel(Builtin func) {
builtin = NEO::EBuiltInOps::fillBuffer;
break;
case Builtin::fillBufferRightLeftoverStateless:
kernelName = "FillBufferRightLeftoverStateless";
kernelName = "FillBufferRightLeftover";
builtin = NEO::EBuiltInOps::fillBufferStateless;
break;
case Builtin::fillBufferRightLeftoverStatelessHeapless:
kernelName = "FillBufferRightLeftoverStateless";
kernelName = "FillBufferRightLeftover";
builtin = NEO::EBuiltInOps::fillBufferStatelessHeapless;
break;
case Builtin::queryKernelTimestamps:
@@ -182,19 +182,19 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
break;
case ImageBuiltin::copyBufferToImage3d16BytesStateless:
builtinName = "CopyBufferToImage3d16BytesStateless";
builtinName = "CopyBufferToImage3d16Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3d16BytesAlignedStateless:
builtinName = "CopyBufferToImage3d16BytesAlignedStateless";
builtinName = "CopyBufferToImage3d16BytesAligned";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3d16BytesHeapless:
builtinName = "CopyBufferToImage3d16BytesStateless";
builtinName = "CopyBufferToImage3d16Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyBufferToImage3d16BytesAlignedHeapless:
builtinName = "CopyBufferToImage3d16BytesAlignedStateless";
builtinName = "CopyBufferToImage3d16BytesAligned";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyBufferToImage3d2Bytes:
@@ -202,11 +202,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
break;
case ImageBuiltin::copyBufferToImage3d2BytesStateless:
builtinName = "CopyBufferToImage3d2BytesStateless";
builtinName = "CopyBufferToImage3d2Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3d2BytesHeapless:
builtinName = "CopyBufferToImage3d2BytesStateless";
builtinName = "CopyBufferToImage3d2Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyBufferToImage3d4Bytes:
@@ -214,11 +214,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
break;
case ImageBuiltin::copyBufferToImage3d4BytesStateless:
builtinName = "CopyBufferToImage3d4BytesStateless";
builtinName = "CopyBufferToImage3d4Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3d4BytesHeapless:
builtinName = "CopyBufferToImage3d4BytesStateless";
builtinName = "CopyBufferToImage3d4Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyBufferToImage3d3To4Bytes:
@@ -226,11 +226,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
break;
case ImageBuiltin::copyBufferToImage3d3To4BytesStateless:
builtinName = "CopyBufferToImage3d3To4BytesStateless";
builtinName = "CopyBufferToImage3d3To4Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3d3To4BytesHeapless:
builtinName = "CopyBufferToImage3d3To4BytesStateless";
builtinName = "CopyBufferToImage3d3To4Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyBufferToImage3d8Bytes:
@@ -238,11 +238,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
break;
case ImageBuiltin::copyBufferToImage3d8BytesStateless:
builtinName = "CopyBufferToImage3d8BytesStateless";
builtinName = "CopyBufferToImage3d8Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3d8BytesHeapless:
builtinName = "CopyBufferToImage3d8BytesStateless";
builtinName = "CopyBufferToImage3d8Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyBufferToImage3d6To8Bytes:
@@ -250,11 +250,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
break;
case ImageBuiltin::copyBufferToImage3d6To8BytesStateless:
builtinName = "CopyBufferToImage3d6To8BytesStateless";
builtinName = "CopyBufferToImage3d6To8Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3d6To8BytesHeapless:
builtinName = "CopyBufferToImage3d6To8BytesStateless";
builtinName = "CopyBufferToImage3d6To8Bytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyBufferToImage3dBytes:
@@ -262,11 +262,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyBufferToImage3d;
break;
case ImageBuiltin::copyBufferToImage3dBytesStateless:
builtinName = "CopyBufferToImage3dBytesStateless";
builtinName = "CopyBufferToImage3dBytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dStateless;
break;
case ImageBuiltin::copyBufferToImage3dBytesHeapless:
builtinName = "CopyBufferToImage3dBytesStateless";
builtinName = "CopyBufferToImage3dBytes";
builtin = NEO::EBuiltInOps::copyBufferToImage3dHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer16Bytes:
@@ -278,19 +278,19 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer16BytesStateless:
builtinName = "CopyImage3dToBuffer16BytesStateless";
builtinName = "CopyImage3dToBuffer16Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer16BytesAlignedStateless:
builtinName = "CopyImage3dToBuffer16BytesAlignedStateless";
builtinName = "CopyImage3dToBuffer16BytesAligned";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer16BytesHeapless:
builtinName = "CopyImage3dToBuffer16BytesStateless";
builtinName = "CopyImage3dToBuffer16Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer16BytesAlignedHeapless:
builtinName = "CopyImage3dToBuffer16BytesAlignedStateless";
builtinName = "CopyImage3dToBuffer16BytesAligned";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer2Bytes:
@@ -298,11 +298,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer2BytesStateless:
builtinName = "CopyImage3dToBuffer2BytesStateless";
builtinName = "CopyImage3dToBuffer2Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer2BytesHeapless:
builtinName = "CopyImage3dToBuffer2BytesStateless";
builtinName = "CopyImage3dToBuffer2Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer3Bytes:
@@ -310,11 +310,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer3BytesStateless:
builtinName = "CopyImage3dToBuffer3BytesStateless";
builtinName = "CopyImage3dToBuffer3Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer3BytesHeapless:
builtinName = "CopyImage3dToBuffer3BytesStateless";
builtinName = "CopyImage3dToBuffer3Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer4Bytes:
@@ -322,11 +322,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer4BytesStateless:
builtinName = "CopyImage3dToBuffer4BytesStateless";
builtinName = "CopyImage3dToBuffer4Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer4BytesHeapless:
builtinName = "CopyImage3dToBuffer4BytesStateless";
builtinName = "CopyImage3dToBuffer4Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer4To3Bytes:
@@ -334,11 +334,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer4To3BytesStateless:
builtinName = "CopyImage3dToBuffer4To3BytesStateless";
builtinName = "CopyImage3dToBuffer4To3Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer4To3BytesHeapless:
builtinName = "CopyImage3dToBuffer4To3BytesStateless";
builtinName = "CopyImage3dToBuffer4To3Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer6Bytes:
@@ -346,11 +346,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer6BytesStateless:
builtinName = "CopyImage3dToBuffer6BytesStateless";
builtinName = "CopyImage3dToBuffer6Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer6BytesHeapless:
builtinName = "CopyImage3dToBuffer6BytesStateless";
builtinName = "CopyImage3dToBuffer6Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer8Bytes:
@@ -358,11 +358,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer8BytesStateless:
builtinName = "CopyImage3dToBuffer8BytesStateless";
builtinName = "CopyImage3dToBuffer8Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer8BytesHeapless:
builtinName = "CopyImage3dToBuffer8BytesStateless";
builtinName = "CopyImage3dToBuffer8Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBuffer8To6Bytes:
@@ -370,11 +370,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBuffer8To6BytesStateless:
builtinName = "CopyImage3dToBuffer8To6BytesStateless";
builtinName = "CopyImage3dToBuffer8To6Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBuffer8To6BytesHeapless:
builtinName = "CopyImage3dToBuffer8To6BytesStateless";
builtinName = "CopyImage3dToBuffer8To6Bytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImage3dToBufferBytes:
@@ -382,11 +382,11 @@ void BuiltinFunctionsLibImpl::initBuiltinImageKernel(ImageBuiltin func) {
builtin = NEO::EBuiltInOps::copyImage3dToBuffer;
break;
case ImageBuiltin::copyImage3dToBufferBytesStateless:
builtinName = "CopyImage3dToBufferBytesStateless";
builtinName = "CopyImage3dToBufferBytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferStateless;
break;
case ImageBuiltin::copyImage3dToBufferBytesHeapless:
builtinName = "CopyImage3dToBufferBytesStateless";
builtinName = "CopyImage3dToBufferBytes";
builtin = NEO::EBuiltInOps::copyImage3dToBufferHeapless;
break;
case ImageBuiltin::copyImageRegion:
@@ -502,7 +502,7 @@ std::unique_ptr<BuiltinFunctionsLibImpl::BuiltinData> BuiltinFunctionsLibImpl::l
ze_kernel_desc_t kernelDesc = {};
kernelDesc.pKernelName = builtInName;
res = this->modules[builtin]->createKernel(&kernelDesc, &kernelHandle);
DEBUG_BREAK_IF(res != ZE_RESULT_SUCCESS);
UNRECOVERABLE_IF(res != ZE_RESULT_SUCCESS);
kernel.reset(Kernel::fromHandle(kernelHandle));
return std::unique_ptr<BuiltinData>(new BuiltinData{modules[builtin].get(), std::move(kernel)});

View File

@@ -167,39 +167,39 @@ HWTEST_F(TestBuiltinFunctionsLibImpl, givenHeaplessBuiltinsWhenInitBuiltinKernel
lib.initBuiltinKernel(L0::Builtin::copyBufferBytesStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToBufferStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("copyBufferToBufferBytesSingleStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("copyBufferToBufferBytesSingle", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::copyBufferToBufferMiddleStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToBufferStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToBufferMiddleRegionStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToBufferMiddleRegion", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::copyBufferToBufferSideStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToBufferStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToBufferSideRegionStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToBufferSideRegion", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::fillBufferImmediateStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::fillBufferStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("FillBufferImmediateStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("FillBufferImmediate", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::fillBufferImmediateLeftOverStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::fillBufferStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("FillBufferImmediateLeftOverStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("FillBufferImmediateLeftOver", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::fillBufferSSHOffsetStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::fillBufferStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("FillBufferSSHOffsetStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("FillBufferSSHOffset", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::fillBufferSSHOffsetStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::fillBufferStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("FillBufferSSHOffsetStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("FillBufferSSHOffset", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::fillBufferMiddleStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::fillBufferStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("FillBufferMiddleStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("FillBufferMiddle", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::fillBufferRightLeftoverStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::fillBufferStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("FillBufferRightLeftoverStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("FillBufferRightLeftover", lib.kernelNamePassed.c_str());
}
HWTEST_F(TestBuiltinFunctionsLibImpl, givenHeaplessImageBuiltinsWhenInitBuiltinKernelThenCorrectArgumentsArePassed) {
@@ -208,43 +208,43 @@ HWTEST_F(TestBuiltinFunctionsLibImpl, givenHeaplessImageBuiltinsWhenInitBuiltinK
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d16BytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d16BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d16Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d2BytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d2BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d2Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d4BytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d4BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d4Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d8BytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d8BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d8Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer16BytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer16BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer16Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer2BytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer2BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer2Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer4BytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer4BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer4Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBufferBytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBufferBytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBufferBytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer3BytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer3BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer3Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer6BytesHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer6BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer6Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImageRegionHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyImageToImage3dHeapless, lib.builtinPassed);
@@ -257,11 +257,11 @@ HWTEST_F(TestBuiltinFunctionsLibImpl, givenHeaplessBufferRectBuiltinsWhenInitBui
lib.initBuiltinKernel(L0::Builtin::copyBufferRectBytes2dStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferRectStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferRectBytes2dStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferRectBytes2d", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::copyBufferRectBytes3dStatelessHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferRectStatelessHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferRectBytes3dStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferRectBytes3d", lib.kernelNamePassed.c_str());
}
HWTEST_F(TestBuiltinFunctionsLibImpl, givenStatelessBufferRectBuiltinsWhenInitBuiltinKernelThenCorrectArgumentsArePassed) {
@@ -270,11 +270,11 @@ HWTEST_F(TestBuiltinFunctionsLibImpl, givenStatelessBufferRectBuiltinsWhenInitBu
lib.initBuiltinKernel(L0::Builtin::copyBufferRectBytes2dStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferRectStateless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferRectBytes2dStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferRectBytes2d", lib.kernelNamePassed.c_str());
lib.initBuiltinKernel(L0::Builtin::copyBufferRectBytes3dStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferRectStateless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferRectBytes3dStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferRectBytes3d", lib.kernelNamePassed.c_str());
}
HWTEST_F(TestBuiltinFunctionsLibImpl, givenStatelessImageBuiltinsWhenInitBuiltinKernelThenCorrectArgumentsArePassed) {
@@ -282,43 +282,43 @@ HWTEST_F(TestBuiltinFunctionsLibImpl, givenStatelessImageBuiltinsWhenInitBuiltin
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d16BytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dStateless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d16BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d16Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d2BytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dStateless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d2BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d2Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d4BytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dStateless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d4BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d4Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d8BytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dStateless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d8BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d8Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer16BytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferStateless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer16BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer16Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer2BytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferStateless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer2BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer2Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer4BytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferStateless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer4BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer4Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBufferBytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferStateless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBufferBytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBufferBytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer3BytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferStateless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer3BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer3Bytes", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer6BytesStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferStateless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer6BytesStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer6Bytes", lib.kernelNamePassed.c_str());
}
HWTEST_F(TestBuiltinFunctionsLibImpl, givenAlignedImageBuiltinsWhenInitBuiltinKernelThenCorrectArgumentsArePassed) {
@@ -330,11 +330,11 @@ HWTEST_F(TestBuiltinFunctionsLibImpl, givenAlignedImageBuiltinsWhenInitBuiltinKe
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d16BytesAlignedStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dStateless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d16BytesAlignedStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d16BytesAligned", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyBufferToImage3d16BytesAlignedHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyBufferToImage3dHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyBufferToImage3d16BytesAlignedStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyBufferToImage3d16BytesAligned", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer16BytesAligned);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBuffer, lib.builtinPassed);
@@ -342,11 +342,11 @@ HWTEST_F(TestBuiltinFunctionsLibImpl, givenAlignedImageBuiltinsWhenInitBuiltinKe
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer16BytesAlignedStateless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferStateless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer16BytesAlignedStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer16BytesAligned", lib.kernelNamePassed.c_str());
lib.initBuiltinImageKernel(L0::ImageBuiltin::copyImage3dToBuffer16BytesAlignedHeapless);
EXPECT_EQ(NEO::EBuiltInOps::copyImage3dToBufferHeapless, lib.builtinPassed);
EXPECT_STREQ("CopyImage3dToBuffer16BytesAlignedStateless", lib.kernelNamePassed.c_str());
EXPECT_STREQ("CopyImage3dToBuffer16BytesAligned", lib.kernelNamePassed.c_str());
}
HWTEST_F(TestBuiltinFunctionsLibImpl, givenCompilerInterfaceWhenCreateDeviceAndImageSupportedThenBuiltinsImageFunctionsAreLoaded) {

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2020-2022 Intel Corporation
# Copyright (C) 2020-2025 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -9,6 +9,7 @@ target_sources(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/test_kernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_kernel_2.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_sampler_patch_value.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_kernel_addressing.cpp
)
add_subdirectories()

View File

@@ -0,0 +1,211 @@
/*
* Copyright (C) 2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/program/kernel_info.h"
#include "shared/test/common/test_macros/test_checks_shared.h"
#include "level_zero/core/source/builtin/builtin_functions_lib.h"
#include "level_zero/core/source/builtin/builtin_functions_lib_impl.h"
#include "level_zero/core/source/device/device.h"
#include "level_zero/core/source/kernel/kernel.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
namespace L0::ult {
struct KernelAddressingTest : public DeviceFixture, public ::testing::Test {
void SetUp() override {
DeviceFixture::setUp();
builtinLib = std::make_unique<L0::BuiltinFunctionsLibImpl>(device, neoDevice->getBuiltIns());
ASSERT_NE(nullptr, builtinLib.get());
auto &compilerProductHelper = device->getCompilerProductHelper();
isHeapless = compilerProductHelper.isHeaplessModeEnabled(neoDevice->getHardwareInfo());
isStateless = compilerProductHelper.isForceToStatelessRequired();
}
void TearDown() override {
builtinLib.reset();
DeviceFixture::tearDown();
}
bool isHeapless = false;
bool isStateless = false;
std::unique_ptr<L0::BuiltinFunctionsLibImpl> builtinLib;
};
TEST_F(KernelAddressingTest,
givenBuiltinCopyBufferToBufferKernelsWhenFetchedFromBuiltinLibThenCorrectArgumentSizesAreUsed) {
{
// Builtin copyBufferBytes uses copyBufferToBufferBytesSingle without extra arguments
} {
const auto builtinType = BuiltinTypeHelper::adjustBuiltinType<L0::Builtin::copyBufferToBufferMiddle>(isStateless, isHeapless);
auto kernel = builtinLib->getFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
{
const auto builtinType = BuiltinTypeHelper::adjustBuiltinType<L0::Builtin::copyBufferToBufferSide>(isStateless, isHeapless);
auto kernel = builtinLib->getFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
}
TEST_F(KernelAddressingTest,
givenBuiltinCopyBufferRectKernelsWhenFetchedFromBuiltinLibThenCorrectArgumentSizesAreUsed) {
{
const auto builtinType = BuiltinTypeHelper::adjustBuiltinType<L0::Builtin::copyBufferRectBytes2d>(isStateless, isHeapless);
auto kernel = builtinLib->getFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(5).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
{
const auto builtinType = BuiltinTypeHelper::adjustBuiltinType<L0::Builtin::copyBufferRectBytes3d>(isStateless, isHeapless);
auto kernel = builtinLib->getFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size, isStateless ? 4 * sizeof(uint64_t) : 4 * sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? 4 * sizeof(uint64_t) : 4 * sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(5).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
}
}
TEST_F(KernelAddressingTest, givenBuiltinFillBufferKernelsWhenFetchedFromProgramThenCorrectArgumentSizesAreUsed) {
{
const auto builtinType = BuiltinTypeHelper::adjustBuiltinType<L0::Builtin::fillBufferImmediate>(isStateless, isHeapless);
auto kernel = builtinLib->getFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
{
const auto builtinType = BuiltinTypeHelper::adjustBuiltinType<L0::Builtin::fillBufferImmediateLeftOver>(isStateless, isHeapless);
auto kernel = builtinLib->getFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
{
const auto builtinType = BuiltinTypeHelper::adjustBuiltinType<L0::Builtin::fillBufferSSHOffset>(isStateless, isHeapless);
auto kernel = builtinLib->getFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
{
const auto builtinType = BuiltinTypeHelper::adjustBuiltinType<L0::Builtin::fillBufferMiddle>(isStateless, isHeapless);
auto kernel = builtinLib->getFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
{
const auto builtinType = BuiltinTypeHelper::adjustBuiltinType<L0::Builtin::fillBufferRightLeftover>(isStateless, isHeapless);
auto kernel = builtinLib->getFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
}
TEST_F(KernelAddressingTest,
givenBuiltinCopyBufferToImage3dKernelsWhenFetchedFromBuiltinLibThenCorrectArgumentSizesAreUsed) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto testBuiltinType = [&](auto builtinType) {
auto kernel = builtinLib->getImageFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(2).template as<ArgDescValue>().elements[0].size,
isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(4).template as<ArgDescValue>().elements[0].size,
isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
};
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyBufferToImage3d16Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyBufferToImage3d2Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyBufferToImage3d4Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyBufferToImage3d3To4Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyBufferToImage3d8Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyBufferToImage3d6To8Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyBufferToImage3dBytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyBufferToImage3d16BytesAligned>(isStateless, isHeapless));
}
TEST_F(KernelAddressingTest,
givenBuiltinCopyImage3dToBufferKernelsWhenFetchedFromBuiltinLibThenCorrectArgumentSizesAreUsed) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto testBuiltinType = [&](auto builtinType) {
auto kernel = builtinLib->getImageFunction(builtinType);
ASSERT_NE(nullptr, kernel);
auto kernelInfo = kernel->getImmutableData()->getKernelInfo();
ASSERT_NE(nullptr, kernelInfo);
EXPECT_EQ(kernelInfo->getArgDescriptorAt(3).template as<ArgDescValue>().elements[0].size,
isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernelInfo->getArgDescriptorAt(4).template as<ArgDescValue>().elements[0].size,
isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
};
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBuffer16Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBuffer2Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBuffer3Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBuffer4Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBuffer4To3Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBuffer6Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBuffer8Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBuffer8To6Bytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBufferBytes>(isStateless, isHeapless));
testBuiltinType(BuiltinTypeHelper::adjustImageBuiltinType<L0::ImageBuiltin::copyImage3dToBuffer16BytesAligned>(isStateless, isHeapless));
}
} // namespace L0::ult

View File

@@ -41,7 +41,7 @@ components:
dest_dir: kernels_bin
type: git
branch: kernels_bin
revision: 3770-5795
revision: 3770-5810
level_zero:
asset_name: level_zero
dest_dir: level_zero

View File

@@ -44,7 +44,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyBufferToBuffer> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp(kernelsLib, device, true) {}
: BuiltInOp(kernelsLib, device, EBuiltInOps::copyBufferToBuffer, "") {}
template <typename OffsetType>
bool buildDispatchInfosTyped(MultiDispatchInfo &multiDispatchInfo) const {
DispatchInfoBuilder<SplitDispatch::Dim::d1D, SplitDispatch::SplitMode::kernelSplit> kernelSplit1DBuilder(clDevice);
@@ -139,16 +139,14 @@ class BuiltInOp<EBuiltInOps::copyBufferToBuffer> : public BuiltinDispatchInfoBui
MultiDeviceKernel *kernMiddle = nullptr;
MultiDeviceKernel *kernMiddleMisaligned = nullptr;
MultiDeviceKernel *kernRightLeftover = nullptr;
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels)
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type operation, ConstStringRef options)
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
if (populateKernels) {
populate(EBuiltInOps::copyBufferToBuffer,
"",
"CopyBufferToBufferLeftLeftover", kernLeftLeftover,
"CopyBufferToBufferMiddle", kernMiddle,
"CopyBufferToBufferMiddleMisaligned", kernMiddleMisaligned,
"CopyBufferToBufferRightLeftover", kernRightLeftover);
}
populate(operation,
options,
"CopyBufferToBufferLeftLeftover", kernLeftLeftover,
"CopyBufferToBufferMiddle", kernMiddle,
"CopyBufferToBufferMiddleMisaligned", kernMiddleMisaligned,
"CopyBufferToBufferRightLeftover", kernRightLeftover);
}
};
@@ -156,13 +154,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyBufferToBufferStateless> : public BuiltInOp<EBuiltInOps::copyBufferToBuffer> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::copyBufferToBuffer>(kernelsLib, device, false) {
populate(EBuiltInOps::copyBufferToBufferStateless,
CompilerOptions::greaterThan4gbBuffersRequired,
"CopyBufferToBufferLeftLeftoverStateless", kernLeftLeftover,
"CopyBufferToBufferMiddleStateless", kernMiddle,
"CopyBufferToBufferMiddleMisalignedStateless", kernMiddleMisaligned,
"CopyBufferToBufferRightLeftoverStateless", kernRightLeftover);
: BuiltInOp<EBuiltInOps::copyBufferToBuffer>(kernelsLib, device, EBuiltInOps::copyBufferToBufferStateless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
@@ -174,13 +166,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyBufferToBufferStatelessHeapless> : public BuiltInOp<EBuiltInOps::copyBufferToBuffer> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::copyBufferToBuffer>(kernelsLib, device, false) {
populate(EBuiltInOps::copyBufferToBufferStatelessHeapless,
CompilerOptions::greaterThan4gbBuffersRequired,
"CopyBufferToBufferLeftLeftoverStateless", kernLeftLeftover,
"CopyBufferToBufferMiddleStateless", kernMiddle,
"CopyBufferToBufferMiddleMisalignedStateless", kernMiddleMisaligned,
"CopyBufferToBufferRightLeftoverStateless", kernRightLeftover);
: BuiltInOp<EBuiltInOps::copyBufferToBuffer>(kernelsLib, device, EBuiltInOps::copyBufferToBufferStatelessHeapless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
@@ -192,7 +178,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyBufferRect> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp(kernelsLib, device, true) {}
: BuiltInOp(kernelsLib, device, EBuiltInOps::copyBufferRect, "") {}
template <typename OffsetType>
bool buildDispatchInfosTyped(MultiDispatchInfo &multiDispatchInfo) const {
@@ -397,24 +383,22 @@ class BuiltInOp<EBuiltInOps::copyBufferRect> : public BuiltinDispatchInfoBuilder
MultiDeviceKernel *kernelLeft[3]{};
MultiDeviceKernel *kernelMiddle[3]{};
MultiDeviceKernel *kernelRight[3]{};
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels)
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type operation, ConstStringRef options)
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
if (populateKernels) {
populate(EBuiltInOps::copyBufferRect,
"",
"CopyBufferRectBytes2d", kernelBytes[0],
"CopyBufferRectBytes2d", kernelBytes[1],
"CopyBufferRectBytes3d", kernelBytes[2],
"CopyBufferRectBytes2d", kernelLeft[0],
"CopyBufferRectBytes2d", kernelLeft[1],
"CopyBufferRectBytes3d", kernelLeft[2],
"CopyBufferRectBytesMiddle2d", kernelMiddle[0],
"CopyBufferRectBytesMiddle2d", kernelMiddle[1],
"CopyBufferRectBytesMiddle3d", kernelMiddle[2],
"CopyBufferRectBytes2d", kernelRight[0],
"CopyBufferRectBytes2d", kernelRight[1],
"CopyBufferRectBytes3d", kernelRight[2]);
}
populate(operation,
options,
"CopyBufferRectBytes2d", kernelBytes[0],
"CopyBufferRectBytes2d", kernelBytes[1],
"CopyBufferRectBytes3d", kernelBytes[2],
"CopyBufferRectBytes2d", kernelLeft[0],
"CopyBufferRectBytes2d", kernelLeft[1],
"CopyBufferRectBytes3d", kernelLeft[2],
"CopyBufferRectBytesMiddle2d", kernelMiddle[0],
"CopyBufferRectBytesMiddle2d", kernelMiddle[1],
"CopyBufferRectBytesMiddle3d", kernelMiddle[2],
"CopyBufferRectBytes2d", kernelRight[0],
"CopyBufferRectBytes2d", kernelRight[1],
"CopyBufferRectBytes3d", kernelRight[2]);
}
};
@@ -422,21 +406,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyBufferRectStateless> : public BuiltInOp<EBuiltInOps::copyBufferRect> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::copyBufferRect>(kernelsLib, device, false) {
populate(EBuiltInOps::copyBufferRectStateless,
CompilerOptions::greaterThan4gbBuffersRequired,
"CopyBufferRectBytes2dStateless", kernelBytes[0],
"CopyBufferRectBytes2dStateless", kernelBytes[1],
"CopyBufferRectBytes3dStateless", kernelBytes[2],
"CopyBufferRectBytes2dStateless", kernelLeft[0],
"CopyBufferRectBytes2dStateless", kernelLeft[1],
"CopyBufferRectBytes3dStateless", kernelLeft[2],
"CopyBufferRectBytesMiddle2dStateless", kernelMiddle[0],
"CopyBufferRectBytesMiddle2dStateless", kernelMiddle[1],
"CopyBufferRectBytesMiddle3dStateless", kernelMiddle[2],
"CopyBufferRectBytes2dStateless", kernelRight[0],
"CopyBufferRectBytes2dStateless", kernelRight[1],
"CopyBufferRectBytes3dStateless", kernelRight[2]);
: BuiltInOp<EBuiltInOps::copyBufferRect>(kernelsLib, device, EBuiltInOps::copyBufferRectStateless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
return buildDispatchInfosTyped<uint64_t>(multiDispatchInfo);
@@ -447,21 +417,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyBufferRectStatelessHeapless> : public BuiltInOp<EBuiltInOps::copyBufferRect> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::copyBufferRect>(kernelsLib, device, false) {
populate(EBuiltInOps::copyBufferRectStatelessHeapless,
CompilerOptions::greaterThan4gbBuffersRequired,
"CopyBufferRectBytes2dStateless", kernelBytes[0],
"CopyBufferRectBytes2dStateless", kernelBytes[1],
"CopyBufferRectBytes3dStateless", kernelBytes[2],
"CopyBufferRectBytes2dStateless", kernelLeft[0],
"CopyBufferRectBytes2dStateless", kernelLeft[1],
"CopyBufferRectBytes3dStateless", kernelLeft[2],
"CopyBufferRectBytesMiddle2dStateless", kernelMiddle[0],
"CopyBufferRectBytesMiddle2dStateless", kernelMiddle[1],
"CopyBufferRectBytesMiddle3dStateless", kernelMiddle[2],
"CopyBufferRectBytes2dStateless", kernelRight[0],
"CopyBufferRectBytes2dStateless", kernelRight[1],
"CopyBufferRectBytes3dStateless", kernelRight[2]);
: BuiltInOp<EBuiltInOps::copyBufferRect>(kernelsLib, device, EBuiltInOps::copyBufferRectStatelessHeapless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
return buildDispatchInfosTyped<uint64_t>(multiDispatchInfo);
@@ -472,7 +428,7 @@ template <>
class BuiltInOp<EBuiltInOps::fillBuffer> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp(kernelsLib, device, true) {}
: BuiltInOp(kernelsLib, device, EBuiltInOps::fillBuffer, "") {}
template <typename OffsetType>
bool buildDispatchInfosTyped(MultiDispatchInfo &multiDispatchInfo) const {
@@ -549,27 +505,21 @@ class BuiltInOp<EBuiltInOps::fillBuffer> : public BuiltinDispatchInfoBuilder {
MultiDeviceKernel *kernMiddle = nullptr;
MultiDeviceKernel *kernRightLeftover = nullptr;
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels)
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type operation, ConstStringRef options)
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
if (populateKernels) {
populate(EBuiltInOps::fillBuffer,
"",
"FillBufferLeftLeftover", kernLeftLeftover,
"FillBufferMiddle", kernMiddle,
"FillBufferRightLeftover", kernRightLeftover);
}
populate(operation,
options,
"FillBufferLeftLeftover", kernLeftLeftover,
"FillBufferMiddle", kernMiddle,
"FillBufferRightLeftover", kernRightLeftover);
}
};
template <>
class BuiltInOp<EBuiltInOps::fillBufferStateless> : public BuiltInOp<EBuiltInOps::fillBuffer> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device) : BuiltInOp<EBuiltInOps::fillBuffer>(kernelsLib, device, false) {
populate(EBuiltInOps::fillBufferStateless,
CompilerOptions::greaterThan4gbBuffersRequired,
"FillBufferLeftLeftoverStateless", kernLeftLeftover,
"FillBufferMiddleStateless", kernMiddle,
"FillBufferRightLeftoverStateless", kernRightLeftover);
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::fillBuffer>(kernelsLib, device, EBuiltInOps::fillBufferStateless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfos) const override {
return buildDispatchInfosTyped<uint64_t>(multiDispatchInfos);
@@ -579,13 +529,10 @@ class BuiltInOp<EBuiltInOps::fillBufferStateless> : public BuiltInOp<EBuiltInOps
template <>
class BuiltInOp<EBuiltInOps::fillBufferStatelessHeapless> : public BuiltInOp<EBuiltInOps::fillBuffer> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device) : BuiltInOp<EBuiltInOps::fillBuffer>(kernelsLib, device, false) {
populate(EBuiltInOps::fillBufferStatelessHeapless,
CompilerOptions::greaterThan4gbBuffersRequired,
"FillBufferLeftLeftoverStateless", kernLeftLeftover,
"FillBufferMiddleStateless", kernMiddle,
"FillBufferRightLeftoverStateless", kernRightLeftover);
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::fillBuffer>(kernelsLib, device, EBuiltInOps::fillBufferStatelessHeapless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfos) const override {
return buildDispatchInfosTyped<uint64_t>(multiDispatchInfos);
}
@@ -595,7 +542,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyBufferToImage3d> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp(kernelsLib, device, true) {}
: BuiltInOp(kernelsLib, device, EBuiltInOps::copyBufferToImage3d, "") {}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
return buildDispatchInfosTyped<uint32_t>(multiDispatchInfo);
@@ -604,18 +551,17 @@ class BuiltInOp<EBuiltInOps::copyBufferToImage3d> : public BuiltinDispatchInfoBu
protected:
MultiDeviceKernel *kernelBytes[5] = {nullptr};
MultiDeviceKernel *kernel16BytesAligned = nullptr;
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels)
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type operation, ConstStringRef options)
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
if (populateKernels) {
populate(EBuiltInOps::copyBufferToImage3d,
"",
"CopyBufferToImage3dBytes", kernelBytes[0],
"CopyBufferToImage3d2Bytes", kernelBytes[1],
"CopyBufferToImage3d4Bytes", kernelBytes[2],
"CopyBufferToImage3d8Bytes", kernelBytes[3],
"CopyBufferToImage3d16Bytes", kernelBytes[4],
"CopyBufferToImage3d16BytesAligned", kernel16BytesAligned);
}
populate(operation,
options,
"CopyBufferToImage3dBytes", kernelBytes[0],
"CopyBufferToImage3d2Bytes", kernelBytes[1],
"CopyBufferToImage3d4Bytes", kernelBytes[2],
"CopyBufferToImage3d8Bytes", kernelBytes[3],
"CopyBufferToImage3d16Bytes", kernelBytes[4],
"CopyBufferToImage3d16BytesAligned", kernel16BytesAligned);
}
template <typename OffsetType>
@@ -704,15 +650,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyBufferToImage3dStateless> : public BuiltInOp<EBuiltInOps::copyBufferToImage3d> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::copyBufferToImage3d>(kernelsLib, device, false) {
populate(EBuiltInOps::copyBufferToImage3dStateless,
CompilerOptions::greaterThan4gbBuffersRequired,
"CopyBufferToImage3dBytesStateless", kernelBytes[0],
"CopyBufferToImage3d2BytesStateless", kernelBytes[1],
"CopyBufferToImage3d4BytesStateless", kernelBytes[2],
"CopyBufferToImage3d8BytesStateless", kernelBytes[3],
"CopyBufferToImage3d16BytesStateless", kernelBytes[4],
"CopyBufferToImage3d16BytesAlignedStateless", kernel16BytesAligned);
: BuiltInOp<EBuiltInOps::copyBufferToImage3d>(kernelsLib, device, EBuiltInOps::copyBufferToImage3dStateless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
@@ -724,15 +662,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyBufferToImage3dHeapless> : public BuiltInOp<EBuiltInOps::copyBufferToImage3d> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::copyBufferToImage3d>(kernelsLib, device, false) {
populate(EBuiltInOps::copyBufferToImage3dHeapless,
CompilerOptions::greaterThan4gbBuffersRequired,
"CopyBufferToImage3dBytesStateless", kernelBytes[0],
"CopyBufferToImage3d2BytesStateless", kernelBytes[1],
"CopyBufferToImage3d4BytesStateless", kernelBytes[2],
"CopyBufferToImage3d8BytesStateless", kernelBytes[3],
"CopyBufferToImage3d16BytesStateless", kernelBytes[4],
"CopyBufferToImage3d16BytesAlignedStateless", kernel16BytesAligned);
: BuiltInOp<EBuiltInOps::copyBufferToImage3d>(kernelsLib, device, EBuiltInOps::copyBufferToImage3dHeapless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
@@ -744,7 +674,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyImage3dToBuffer> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp(kernelsLib, device, true) {}
: BuiltInOp(kernelsLib, device, EBuiltInOps::copyImage3dToBuffer, "") {}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
return buildDispatchInfosTyped<uint32_t>(multiDispatchInfo);
@@ -754,18 +684,16 @@ class BuiltInOp<EBuiltInOps::copyImage3dToBuffer> : public BuiltinDispatchInfoBu
MultiDeviceKernel *kernelBytes[5] = {nullptr};
MultiDeviceKernel *kernel16BytesAligned = nullptr;
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, bool populateKernels)
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type operation, ConstStringRef options)
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
if (populateKernels) {
populate(EBuiltInOps::copyImage3dToBuffer,
"",
"CopyImage3dToBufferBytes", kernelBytes[0],
"CopyImage3dToBuffer2Bytes", kernelBytes[1],
"CopyImage3dToBuffer4Bytes", kernelBytes[2],
"CopyImage3dToBuffer8Bytes", kernelBytes[3],
"CopyImage3dToBuffer16Bytes", kernelBytes[4],
"CopyImage3dToBuffer16BytesAligned", kernel16BytesAligned);
}
populate(operation,
options,
"CopyImage3dToBufferBytes", kernelBytes[0],
"CopyImage3dToBuffer2Bytes", kernelBytes[1],
"CopyImage3dToBuffer4Bytes", kernelBytes[2],
"CopyImage3dToBuffer8Bytes", kernelBytes[3],
"CopyImage3dToBuffer16Bytes", kernelBytes[4],
"CopyImage3dToBuffer16BytesAligned", kernel16BytesAligned);
}
template <typename OffsetType>
@@ -860,15 +788,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyImage3dToBufferStateless> : public BuiltInOp<EBuiltInOps::copyImage3dToBuffer> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::copyImage3dToBuffer>(kernelsLib, device, false) {
populate(EBuiltInOps::copyImage3dToBufferStateless,
CompilerOptions::greaterThan4gbBuffersRequired,
"CopyImage3dToBufferBytesStateless", kernelBytes[0],
"CopyImage3dToBuffer2BytesStateless", kernelBytes[1],
"CopyImage3dToBuffer4BytesStateless", kernelBytes[2],
"CopyImage3dToBuffer8BytesStateless", kernelBytes[3],
"CopyImage3dToBuffer16BytesStateless", kernelBytes[4],
"CopyImage3dToBuffer16BytesAlignedStateless", kernel16BytesAligned);
: BuiltInOp<EBuiltInOps::copyImage3dToBuffer>(kernelsLib, device, EBuiltInOps::copyImage3dToBufferStateless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {
@@ -880,15 +800,7 @@ template <>
class BuiltInOp<EBuiltInOps::copyImage3dToBufferHeapless> : public BuiltInOp<EBuiltInOps::copyImage3dToBuffer> {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: BuiltInOp<EBuiltInOps::copyImage3dToBuffer>(kernelsLib, device, false) {
populate(EBuiltInOps::copyImage3dToBufferHeapless,
CompilerOptions::greaterThan4gbBuffersRequired,
"CopyImage3dToBufferBytesStateless", kernelBytes[0],
"CopyImage3dToBuffer2BytesStateless", kernelBytes[1],
"CopyImage3dToBuffer4BytesStateless", kernelBytes[2],
"CopyImage3dToBuffer8BytesStateless", kernelBytes[3],
"CopyImage3dToBuffer16BytesStateless", kernelBytes[4],
"CopyImage3dToBuffer16BytesAlignedStateless", kernel16BytesAligned);
: BuiltInOp<EBuiltInOps::copyImage3dToBuffer>(kernelsLib, device, EBuiltInOps::copyImage3dToBufferHeapless, CompilerOptions::wideStatelessOptions.c_str()) {
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo) const override {

View File

@@ -1084,6 +1084,7 @@ TEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderFillLocalBufferStatelessIsU
}
HWTEST_F(BuiltInTests, givenBigOffsetAndSizeWhenBuilderCopyBufferToImageStatelessIsUsedThenParamsAreCorrect) {
REQUIRE_64BIT_OR_SKIP();
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
uint64_t bigSize = 10ull * MemoryConstants::gigaByte;
@@ -2137,802 +2138,3 @@ HWTEST2_F(BuiltInTests, whenBuilderFillLocalBufferStatelessHeaplessIsUsedThenPar
EXPECT_FALSE(dispatchInfo.getKernel()->getDestinationAllocationInSystemMemory());
}
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesAlignedSrcPtrWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *srcPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(srcPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesAlignedSrcMemObjWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
MockBuffer srcBuffer;
srcBuffer.size = 1024;
srcBuffer.mockGfxAllocation.gpuAddress = 0x1000;
ASSERT_EQ(0u, srcBuffer.mockGfxAllocation.gpuAddress & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = &srcBuffer;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithAddressMisalignedSrcPtrWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
uint8_t buffer[bufferSize + 16];
void *srcPtr = buffer + 1;
ASSERT_NE(0u, reinterpret_cast<uintptr_t>(srcPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesAlignedDstPtrWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *dstPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(dstPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstPtr = dstPtr;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesAlignedDstMemObjWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
MockBuffer dstBuffer;
dstBuffer.size = 1024;
dstBuffer.mockGfxAllocation.gpuAddress = 0x2000;
ASSERT_EQ(0u, dstBuffer.mockGfxAllocation.gpuAddress & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstMemObj = &dstBuffer;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesWithAddressMisalignedDstPtrWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
uint8_t buffer[bufferSize + 16];
void *dstPtr = buffer + 1;
ASSERT_NE(0u, reinterpret_cast<uintptr_t>(dstPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstPtr = dstPtr;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBufferNon16BytesFormatWhenBuilderIsUsedThenAlignmentCheckIsSkipped) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_R, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *dstPtr = alignedBuffer;
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstPtr = dstPtr;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("4Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") == std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImageNon16BytesFormatWhenBuilderIsUsedThenAlignmentCheckIsSkipped) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_R, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *srcPtr = alignedBuffer;
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("4Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") == std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithImageAsSrcMemObjWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto dstImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, dstImage);
auto srcImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, srcImage);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcImage;
dc.dstMemObj = dstImage;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete dstImage;
delete srcImage;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithAlignedBufferWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
cl_int retVal = CL_SUCCESS;
auto srcBuffer = Buffer::create(pContext, 0, bufferSize, nullptr, retVal);
ASSERT_NE(nullptr, srcBuffer);
ASSERT_EQ(CL_SUCCESS, retVal);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcBuffer;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete image;
delete srcBuffer;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithOffsetMisalignedBufferWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
cl_int retVal = CL_SUCCESS;
auto srcBuffer = Buffer::create(pContext, 0, bufferSize, nullptr, retVal);
ASSERT_NE(nullptr, srcBuffer);
ASSERT_EQ(CL_SUCCESS, retVal);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcBuffer;
dc.dstMemObj = image;
dc.srcOffset = {1, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
delete srcBuffer;
}
HWTEST_F(BuiltInTests, givenCopyImageToBuffer16BytesWithImageAsDstMemObjWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto srcImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, srcImage);
auto dstImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, dstImage);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcImage;
dc.dstMemObj = dstImage;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete srcImage;
delete dstImage;
}
HWTEST_F(BuiltInTests, givenCopyImageToBuffer16BytesWithAlignedBufferWhenBuilderIsUsedThenAlignedKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto srcImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, srcImage);
constexpr size_t bufferSize = 1024;
cl_int retVal = CL_SUCCESS;
auto dstBuffer = Buffer::create(pContext, 0, bufferSize, nullptr, retVal);
ASSERT_NE(nullptr, dstBuffer);
ASSERT_EQ(CL_SUCCESS, retVal);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcImage;
dc.dstMemObj = dstBuffer;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") != std::string::npos);
delete srcImage;
delete dstBuffer;
}
HWTEST_F(BuiltInTests, givenCopyImageToBuffer16BytesWithOffsetMisalignedBufferWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto srcImage = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, srcImage);
constexpr size_t bufferSize = 1024;
cl_int retVal = CL_SUCCESS;
auto dstBuffer = Buffer::create(pContext, 0, bufferSize, nullptr, retVal);
ASSERT_NE(nullptr, dstBuffer);
ASSERT_EQ(CL_SUCCESS, retVal);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = srcImage;
dc.dstMemObj = dstBuffer;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {1, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete srcImage;
delete dstBuffer;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithRowPitchMisalignedWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *srcPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(srcPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 68; // 16-byte misaligned row pitch
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesWithSlicePitchMisalignedWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *dstPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(dstPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstPtr = dstPtr;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 132; // 16-byte misaligned slice pitch
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithOffsetMisalignedWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
constexpr size_t bufferSize = 1024;
alignas(16) uint8_t alignedBuffer[bufferSize];
void *srcPtr = alignedBuffer;
ASSERT_EQ(0u, reinterpret_cast<uintptr_t>(srcPtr) & 0x0000000F);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = srcPtr;
dc.dstMemObj = image;
dc.srcOffset = {8, 0, 0}; // 16-byte misaligned offset
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyImage3dToBuffer16BytesWithNullAddressWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
MockBuffer dstBuffer;
dstBuffer.size = 1024;
dstBuffer.mockGfxAllocation.gpuAddress = 0;
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcMemObj = image;
dc.dstMemObj = &dstBuffer;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.dstRowPitch = 0;
dc.dstSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}
HWTEST_F(BuiltInTests, givenCopyBufferToImage16BytesWithNullPtrAndNullMemObjWhenBuilderIsUsedThenRegularKernelIsSelected) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
auto &compilerProductHelper = pClDevice->getCompilerProductHelper();
bool isStateless = compilerProductHelper.isForceToStatelessRequired();
bool heaplessAllowed = compilerProductHelper.isHeaplessModeEnabled(pClDevice->getHardwareInfo());
auto builtInType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, heaplessAllowed);
cl_image_format imageFormat = {CL_RGBA, CL_FLOAT};
cl_image_desc imageDesc = {CL_MEM_OBJECT_IMAGE3D, 4, 4, 4, 1, 0, 0, 0, 0, {nullptr}};
auto image = ImageHelperUlt<Image3dDefaults>::create(pContext, &imageDesc, &imageFormat);
ASSERT_NE(nullptr, image);
auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, *pClDevice);
BuiltinOpParams dc;
dc.srcPtr = nullptr;
dc.srcMemObj = nullptr;
dc.dstMemObj = image;
dc.srcOffset = {0, 0, 0};
dc.dstOffset = {0, 0, 0};
dc.size = {1, 1, 1};
dc.srcRowPitch = 0;
dc.srcSlicePitch = 0;
MultiDispatchInfo multiDispatchInfo(dc);
ASSERT_TRUE(builder.buildDispatchInfos(multiDispatchInfo));
EXPECT_EQ(1u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("16Bytes") != std::string::npos);
EXPECT_TRUE(kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.find("Aligned") == std::string::npos);
delete image;
}

View File

@@ -167,9 +167,6 @@ HWTEST_F(EnqueueFillBufferCmdTests, GivenRightLeftoverWhenFillingBufferThenFillB
builder.buildDispatchInfos(mdi);
EXPECT_EQ(1u, mdi.size());
auto kernel = mdi.begin()->getKernel();
EXPECT_STREQ(EBuiltInOps::isHeapless(builtInType) ? "FillBufferRightLeftoverStateless" : "FillBufferRightLeftover", kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.c_str());
context.getMemoryManager()->freeGraphicsMemory(patternAllocation);
}
@@ -195,9 +192,6 @@ HWTEST_F(EnqueueFillBufferCmdTests, GivenMiddleWhenFillingBufferThenFillBufferMi
builder.buildDispatchInfos(mdi);
EXPECT_EQ(1u, mdi.size());
auto kernel = mdi.begin()->getKernel();
EXPECT_STREQ(EBuiltInOps::isHeapless(builtInType) ? "FillBufferMiddleStateless" : "FillBufferMiddle", kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.c_str());
context.getMemoryManager()->freeGraphicsMemory(patternAllocation);
}
@@ -223,9 +217,6 @@ HWTEST_F(EnqueueFillBufferCmdTests, GivenLeftLeftoverWhenFillingBufferThenFillBu
builder.buildDispatchInfos(mdi);
EXPECT_EQ(1u, mdi.size());
auto kernel = mdi.begin()->getKernel();
EXPECT_STREQ(EBuiltInOps::isHeapless(builtInType) ? "FillBufferLeftLeftoverStateless" : "FillBufferLeftLeftover", kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.c_str());
context.getMemoryManager()->freeGraphicsMemory(patternAllocation);
}

View File

@@ -159,8 +159,9 @@ HWTEST_F(EnqueueSvmMemCopyTest, givenEnqueueSVMMemcpyWhenUsingCopyBufferToBuffer
size_t middleElSize = 4 * sizeof(uint32_t);
EXPECT_EQ(Vec3<size_t>(256 / middleElSize, 1, 1), di->getGWS());
auto kernel = mdi->begin()->getKernel();
EXPECT_EQ(EBuiltInOps::isStateless(builtIn) ? "CopyBufferToBufferMiddleStateless" : "CopyBufferToBufferMiddle", kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName);
auto kernel = di->getKernel();
EXPECT_EQ(kernel->getKernelInfo().getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size, EBuiltInOps::isStateless(builtIn) ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(kernel->getKernelInfo().getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, EBuiltInOps::isStateless(builtIn) ? sizeof(uint64_t) : sizeof(uint32_t));
}
HWTEST_F(EnqueueSvmMemCopyTest, givenEnqueueSVMMemcpyWhenUsingCopyBufferToBufferBuilderAndSrcHostPtrThenItConfiguredWithBuiltinOpsAndProducesDispatchInfo) {

View File

@@ -227,9 +227,6 @@ HWTEST_P(EnqueueSvmMemFillTest, givenEnqueueSVMMemFillWhenUsingFillBufferBuilder
auto di = mdi->begin();
size_t middleElSize = sizeof(uint32_t);
EXPECT_EQ(Vec3<size_t>(256 / middleElSize, 1, 1), di->getGWS());
auto kernel = di->getKernel();
EXPECT_STREQ(EBuiltInOps::isStateless(builtIn) ? "FillBufferMiddleStateless" : "FillBufferMiddle", kernel->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName.c_str());
}
INSTANTIATE_TEST_SUITE_P(size_t,

View File

@@ -23,6 +23,7 @@ set(IGDRCL_SRCS_tests_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernel_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/debug_kernel_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/substitute_kernel_heap_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kernel_addressing_tests.cpp
)
if(TESTS_PVC_AND_LATER)

View File

@@ -0,0 +1,192 @@
/*
* Copyright (C) 2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/built_ins/built_in_ops_base.h"
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/kernel/kernel_arg_descriptor.h"
#include "shared/source/program/kernel_info.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/hw_test.h"
#include "shared/test/common/test_macros/test_checks_shared.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/cl_device/cl_device_vector.h"
#include "opencl/source/program/program.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/mocks/mock_cl_device.h"
using namespace NEO;
struct KernelAddressingTest : public ClDeviceFixture, public ::testing::Test {
void SetUp() override {
ClDeviceFixture::setUp();
auto &compilerProductHelper = pDevice->getCompilerProductHelper();
isHeapless = compilerProductHelper.isHeaplessModeEnabled(pDevice->getHardwareInfo());
isStateless = compilerProductHelper.isForceToStatelessRequired();
}
void TearDown() override {
prog.reset();
ClDeviceFixture::tearDown();
}
bool isHeapless = false;
bool isStateless = false;
std::unique_ptr<Program> prog;
};
TEST_F(KernelAddressingTest, givenBuiltinCopyBufferToBufferKernelsWhenFetchedFromProgramThenCorrectArgumentSizesAreUsed) {
const auto builtinType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToBuffer>(isStateless, isHeapless);
auto src = pDevice->getBuiltIns()->getBuiltinsLib().getBuiltinCode(builtinType, BuiltinCode::ECodeType::any, *pDevice);
ClDeviceVector deviceVector;
deviceVector.push_back(pClDevice);
prog.reset(BuiltinDispatchInfoBuilder::createProgramFromCode(src, deviceVector).release());
prog->build(deviceVector, "");
const std::vector<const char *> bufferToBufferKernelNames = {
"CopyBufferToBufferBytes",
"CopyBufferToBufferMiddle",
"CopyBufferToBufferMiddleMisaligned",
"CopyBufferToBufferRightLeftover",
"CopyBufferToBufferSideRegion",
"CopyBufferToBufferMiddleRegion"};
for (const auto &kernelName : bufferToBufferKernelNames) {
auto pKernelInfo = prog->getKernelInfo(kernelName, 0);
ASSERT_NE(pKernelInfo, nullptr);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size,
isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size,
isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
}
TEST_F(KernelAddressingTest, givenBuiltinCopyBufferRectKernelsWhenFetchedFromProgramThenCorrectArgumentSizesAreUsed) {
const auto builtinType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferRect>(isStateless, isHeapless);
auto src = pDevice->getBuiltIns()->getBuiltinsLib().getBuiltinCode(builtinType, BuiltinCode::ECodeType::any, *pDevice);
ClDeviceVector deviceVector;
deviceVector.push_back(pClDevice);
prog.reset(BuiltinDispatchInfoBuilder::createProgramFromCode(src, deviceVector).release());
prog->build(deviceVector, "");
auto pKernelInfo = prog->getKernelInfo("CopyBufferRectBytes2d", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(5).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
pKernelInfo = prog->getKernelInfo("CopyBufferRectBytesMiddle2d", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(5).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
pKernelInfo = prog->getKernelInfo("CopyBufferRectBytes3d", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size, isStateless ? 4 * sizeof(uint64_t) : 4 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? 4 * sizeof(uint64_t) : 4 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(5).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
pKernelInfo = prog->getKernelInfo("CopyBufferRectBytesMiddle3d", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size, isStateless ? 4 * sizeof(uint64_t) : 4 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? 4 * sizeof(uint64_t) : 4 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(5).as<ArgDescValue>().elements[0].size, isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
}
TEST_F(KernelAddressingTest, givenBuiltinFillBufferKernelsWhenFetchedFromProgramThenCorrectArgumentSizesAreUsed) {
const auto builtinType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::fillBuffer>(isStateless, isHeapless);
auto src = pDevice->getBuiltIns()->getBuiltinsLib().getBuiltinCode(builtinType, BuiltinCode::ECodeType::any, *pDevice);
ClDeviceVector deviceVector;
deviceVector.push_back(pClDevice);
prog.reset(BuiltinDispatchInfoBuilder::createProgramFromCode(src, deviceVector).release());
prog->build(deviceVector, "");
auto pKernelInfo = prog->getKernelInfo("FillBufferBytes", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
pKernelInfo = prog->getKernelInfo("FillBufferLeftLeftover", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
pKernelInfo = prog->getKernelInfo("FillBufferMiddle", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
pKernelInfo = prog->getKernelInfo("FillBufferRightLeftover", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
pKernelInfo = prog->getKernelInfo("FillBufferImmediate", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
pKernelInfo = prog->getKernelInfo("FillBufferImmediateLeftOver", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
pKernelInfo = prog->getKernelInfo("FillBufferSSHOffset", 0);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(1).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size, isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
}
TEST_F(KernelAddressingTest, givenBuiltinCopyBufferToImage3dKernelsWhenFetchedFromProgramThenCorrectArgumentSizesAreUsed) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
const auto builtinType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyBufferToImage3d>(isStateless, isHeapless);
auto src = pDevice->getBuiltIns()->getBuiltinsLib().getBuiltinCode(builtinType, BuiltinCode::ECodeType::any, *pDevice);
ClDeviceVector deviceVector;
deviceVector.push_back(pClDevice);
prog.reset(BuiltinDispatchInfoBuilder::createProgramFromCode(src, deviceVector).release());
prog->build(deviceVector, "");
const char *bufferToImageKernelNames[] = {
"CopyBufferToImage3dBytes",
"CopyBufferToImage3d2Bytes",
"CopyBufferToImage3d4Bytes",
"CopyBufferToImage3d3To4Bytes",
"CopyBufferToImage3d8Bytes",
"CopyBufferToImage3d6To8Bytes",
"CopyBufferToImage3d16Bytes",
"CopyBufferToImage3d16BytesAligned"};
for (const auto &kernelName : bufferToImageKernelNames) {
auto pKernelInfo = prog->getKernelInfo(kernelName, 0);
ASSERT_NE(pKernelInfo, nullptr);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(2).as<ArgDescValue>().elements[0].size,
isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size,
isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
}
}
TEST_F(KernelAddressingTest, givenBuiltinCopyImage3dToBufferKernelsWhenFetchedFromProgramThenCorrectArgumentSizesAreUsed) {
REQUIRE_IMAGES_OR_SKIP(defaultHwInfo);
const auto builtinType = EBuiltInOps::adjustBuiltinType<EBuiltInOps::copyImage3dToBuffer>(isStateless, isHeapless);
auto src = pDevice->getBuiltIns()->getBuiltinsLib().getBuiltinCode(builtinType, BuiltinCode::ECodeType::any, *pDevice);
ClDeviceVector deviceVector;
deviceVector.push_back(pClDevice);
prog.reset(BuiltinDispatchInfoBuilder::createProgramFromCode(src, deviceVector).release());
auto ret = prog->build(deviceVector, "");
ASSERT_EQ(ret, CL_SUCCESS);
const char *kernelNames[] = {
"CopyImage3dToBufferBytes",
"CopyImage3dToBuffer2Bytes",
"CopyImage3dToBuffer4Bytes",
"CopyImage3dToBuffer4To3Bytes",
"CopyImage3dToBuffer8Bytes",
"CopyImage3dToBuffer8To6Bytes",
"CopyImage3dToBuffer16Bytes",
"CopyImage3dToBuffer16BytesAligned"};
for (const auto &kernelName : kernelNames) {
auto pKernelInfo = prog->getKernelInfo(kernelName, 0);
ASSERT_NE(pKernelInfo, nullptr);
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(3).as<ArgDescValue>().elements[0].size,
isStateless ? sizeof(uint64_t) : sizeof(uint32_t));
EXPECT_EQ(pKernelInfo->getArgDescriptorAt(4).as<ArgDescValue>().elements[0].size,
isStateless ? 2 * sizeof(uint64_t) : 2 * sizeof(uint32_t));
}
}

View File

@@ -24,19 +24,14 @@ set_property(GLOBAL PROPERTY NEO_CORE_SRCS_BUILT_INS ${NEO_CORE_SRCS_BUILT_INS})
set(NEO_CORE_SRCS_BUILT_IN_KERNELS
${CMAKE_CURRENT_SOURCE_DIR}/kernels/aux_translation.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_rect.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_rect_stateless.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_buffer.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_buffer_stateless.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_image3d.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_image3d_stateless.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_kernel_timestamps.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_image3d_to_buffer.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_image3d_to_buffer_stateless.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_image_to_image1d.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_image_to_image2d.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_image_to_image3d.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/fill_buffer.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/fill_buffer_stateless.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/fill_image1d.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/fill_image2d.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/fill_image3d.builtin_kernel

View File

@@ -30,30 +30,25 @@ const char *getBuiltinAsString(EBuiltInOps::Type builtin) {
case EBuiltInOps::auxTranslation:
return "aux_translation.builtin_kernel";
case EBuiltInOps::copyBufferToBuffer:
return "copy_buffer_to_buffer.builtin_kernel";
case EBuiltInOps::copyBufferToBufferStateless:
case EBuiltInOps::copyBufferToBufferStatelessHeapless:
return "copy_buffer_to_buffer_stateless.builtin_kernel";
return "copy_buffer_to_buffer.builtin_kernel";
case EBuiltInOps::copyBufferRect:
return "copy_buffer_rect.builtin_kernel";
case EBuiltInOps::copyBufferRectStateless:
case EBuiltInOps::copyBufferRectStatelessHeapless:
return "copy_buffer_rect_stateless.builtin_kernel";
return "copy_buffer_rect.builtin_kernel";
case EBuiltInOps::fillBuffer:
return "fill_buffer.builtin_kernel";
case EBuiltInOps::fillBufferStateless:
case EBuiltInOps::fillBufferStatelessHeapless:
return "fill_buffer_stateless.builtin_kernel";
return "fill_buffer.builtin_kernel";
case EBuiltInOps::copyBufferToImage3d:
return "copy_buffer_to_image3d.builtin_kernel";
case EBuiltInOps::copyBufferToImage3dStateless:
case EBuiltInOps::copyBufferToImage3dHeapless:
return "copy_buffer_to_image3d_stateless.builtin_kernel";
return "copy_buffer_to_image3d.builtin_kernel";
case EBuiltInOps::copyImage3dToBuffer:
return "copy_image3d_to_buffer.builtin_kernel";
case EBuiltInOps::copyImage3dToBufferStateless:
case EBuiltInOps::copyImage3dToBufferHeapless:
return "copy_image3d_to_buffer_stateless.builtin_kernel";
return "copy_image3d_to_buffer.builtin_kernel";
case EBuiltInOps::copyImageToImage1d:
case EBuiltInOps::copyImageToImage1dHeapless:
return "copy_image_to_image1d.builtin_kernel";

View File

@@ -35,14 +35,14 @@ set(GENERATED_BUILTINS_IMAGES
)
set(GENERATED_BUILTINS_IMAGES_STATELESS
"copy_buffer_to_image3d_stateless"
"copy_image3d_to_buffer_stateless"
"copy_buffer_to_image3d"
"copy_image3d_to_buffer"
)
set(GENERATED_BUILTINS_STATELESS
"copy_buffer_to_buffer_stateless"
"copy_buffer_rect_stateless"
"fill_buffer_stateless"
"copy_buffer_to_buffer"
"copy_buffer_rect"
"fill_buffer"
)
foreach(MODE ${ADDRESSING_MODES})

View File

@@ -74,7 +74,11 @@ function(compile_builtin core_type platform_it builtin bits builtin_options mode
set(heapless_mode "enable")
endif()
if(${mode} STREQUAL "stateless" OR ${mode} STREQUAL "heapless")
list(APPEND __ocloc__options__ "-DWIDE_STATELESS=1")
endif()
list(APPEND __ocloc__options__ "-cl-kernel-arg-info")
set(INTERNAL_OPTIONS "${${mode}_INTERNAL_OPTIONS}")
add_custom_command(
OUTPUT ${OUTPUT_FILE_SPV}

View File

@@ -6,88 +6,84 @@
*/
R"===(
#include "kernel_types.h"
__kernel void CopyBufferRectBytes2d(
__global const char* src,
__global char* dst,
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle2d(
const __global uint* src,
__global uint* dst,
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
}
__kernel void CopyBufferRectBytes3d(
__global const char* src,
__global char* dst,
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
__global const char* src,
__global char* dst,
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle3d(
const __global uint* src,
__global uint* dst,
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
}
)==="

View File

@@ -1,93 +0,0 @@
/*
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
__kernel void CopyBufferRectBytes2dStateless(
__global const char* src,
__global char* dst,
ulong2 SrcOrigin,
ulong2 DstOrigin,
ulong SrcPitch,
ulong DstPitch )
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
size_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
size_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle2dStateless(
const __global uint* src,
__global uint* dst,
ulong2 SrcOrigin,
ulong2 DstOrigin,
ulong SrcPitch,
ulong DstPitch )
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
size_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
size_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
}
__kernel void CopyBufferRectBytes3dStateless(
__global const char* src,
__global char* dst,
ulong4 SrcOrigin,
ulong4 DstOrigin,
ulong2 SrcPitch,
ulong2 DstPitch )
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
size_t z = get_global_id(2);
size_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
size_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle3dStateless(
const __global uint* src,
__global uint* dst,
ulong4 SrcOrigin,
ulong4 DstOrigin,
ulong2 SrcPitch,
ulong2 DstPitch )
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
size_t z = get_global_id(2);
size_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
size_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
}
)==="

View File

@@ -1,19 +1,21 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
#include "kernel_types.h"
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
__kernel void CopyBufferToBufferBytes(
const __global uchar* pSrc,
__global uchar* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint bytesToRead )
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
offset_t bytesToRead )
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
@@ -25,24 +27,24 @@ __kernel void CopyBufferToBufferBytes(
__kernel void CopyBufferToBufferLeftLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes)
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void CopyBufferToBufferMiddle(
const __global uint* pSrc,
__global uint* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes)
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
uint4 loaded = vload4(gid, pSrc);
@@ -52,17 +54,17 @@ __kernel void CopyBufferToBufferMiddle(
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
uint misalignmentInBits)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
const size_t gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
const uint4 src1 = vload4((gid + 1), pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
@@ -75,32 +77,33 @@ __kernel void CopyBufferToBufferMiddleMisaligned(
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes)
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void copyBufferToBufferBytesSingle(__global uchar *dst, const __global uchar *src) {
ALIGNED4(dst);
ALIGNED4(src);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
dst[gid] = (uchar)(src[gid]);
}
__kernel void CopyBufferToBufferSideRegion(
__global uchar* pDst,
const __global uchar* pSrc,
unsigned int len,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
idx_t len,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
__global uchar* pDstWithOffset = (__global uchar*)((__global uchar*)pDst + dstSshOffset);
__global uchar* pSrcWithOffset = (__global uchar*)((__global uchar*)pSrc + srcSshOffset);
if (gid < len) {
@@ -111,14 +114,14 @@ __kernel void CopyBufferToBufferSideRegion(
__kernel void CopyBufferToBufferMiddleRegion(
__global uint* pDst,
const __global uint* pSrc,
unsigned int elems,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
idx_t elems,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
__global uint* pDstWithOffset = (__global uint*)((__global uchar*)pDst + dstSshOffset);
__global uint* pSrcWithOffset = (__global uint*)((__global uchar*)pSrc + srcSshOffset);
if (gid < elems) {
@@ -126,4 +129,5 @@ __kernel void CopyBufferToBufferMiddleRegion(
vstore4(loaded, gid, pDstWithOffset);
}
}
)==="

View File

@@ -1,113 +0,0 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
__kernel void CopyBufferToBufferBytesStateless(
const __global uchar* pSrc,
__global uchar* pDst,
ulong srcOffsetInBytes,
ulong dstOffsetInBytes,
ulong bytesToRead )
{
pSrc += ( srcOffsetInBytes + get_global_id(0) );
pDst += ( dstOffsetInBytes + get_global_id(0) );
pDst[ 0 ] = pSrc[ 0 ];
}
__kernel void CopyBufferToBufferLeftLeftoverStateless(
const __global uchar* pSrc,
__global uchar* pDst,
ulong srcOffsetInBytes,
ulong dstOffsetInBytes)
{
size_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void CopyBufferToBufferMiddleStateless(
const __global uint* pSrc,
__global uint* pDst,
ulong srcOffsetInBytes,
ulong dstOffsetInBytes)
{
size_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
uint4 loaded = vload4(gid, pSrc);
vstore4(loaded, gid, pDst);
}
__kernel void CopyBufferToBufferMiddleMisalignedStateless(
__global const uint* pSrc,
__global uint* pDst,
ulong srcOffsetInBytes,
ulong dstOffsetInBytes,
uint misalignmentInBits)
{
const size_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
result.y = (src0.y >> misalignmentInBits) | (src0.z << (32 - misalignmentInBits));
result.z = (src0.z >> misalignmentInBits) | (src0.w << (32 - misalignmentInBits));
result.w = (src0.w >> misalignmentInBits) | (src1.x << (32 - misalignmentInBits));
vstore4(result, gid, pDst);
}
__kernel void CopyBufferToBufferRightLeftoverStateless(
const __global uchar* pSrc,
__global uchar* pDst,
ulong srcOffsetInBytes,
ulong dstOffsetInBytes)
{
size_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void copyBufferToBufferBytesSingleStateless(__global uchar *dst, const __global uchar *src) {
size_t gid = get_global_id(0);
dst[gid] = (uchar)(src[gid]);
}
__kernel void CopyBufferToBufferSideRegionStateless(
__global uchar* pDst,
const __global uchar* pSrc,
ulong len,
ulong dstSshOffset,
ulong srcSshOffset
)
{
size_t gid = get_global_id(0);
__global uchar* pDstWithOffset = (__global uchar*)((__global uchar*)pDst + dstSshOffset);
__global uchar* pSrcWithOffset = (__global uchar*)((__global uchar*)pSrc + srcSshOffset);
if (gid < len) {
pDstWithOffset[ gid ] = pSrcWithOffset[ gid ];
}
}
__kernel void CopyBufferToBufferMiddleRegionStateless(
__global uint* pDst,
const __global uint* pSrc,
ulong elems,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
ulong srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
size_t gid = get_global_id(0);
__global uint* pDstWithOffset = (__global uint*)((__global uchar*)pDst + dstSshOffset);
__global uint* pSrcWithOffset = (__global uint*)((__global uchar*)pSrc + srcSshOffset);
if (gid < elems) {
uint4 loaded = vload4(gid, pSrcWithOffset);
vstore4(loaded, gid, pDstWithOffset);
}
}
)==="

View File

@@ -8,36 +8,38 @@
R"===(
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
#include "kernel_types.h"
__kernel void CopyBufferToImage3dBytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
write_imageui(output, dstCoord, (uint4)(*(src + LOffset + x), 0, 0, 1));
}
__kernel void CopyBufferToImage3d2Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
if(( ulong )(src + srcOffset) & 0x00000001){
if(((ulong)(src + srcOffset)) & 0x00000001){
ushort upper = *((__global uchar*)(src + LOffset + x * 2 + 1));
ushort lower = *((__global uchar*)(src + LOffset + x * 2));
ushort combined = (upper << 8) | lower;
@@ -51,19 +53,19 @@ __kernel void CopyBufferToImage3d2Bytes(__global uchar *src,
__kernel void CopyBufferToImage3d4Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
if(( ulong )(src + srcOffset) & 0x00000003){
if(((ulong)(src + srcOffset)) & 0x00000003){
uint upper2 = *((__global uchar*)(src + LOffset + x * 4 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 4 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 4 + 1));
@@ -78,45 +80,44 @@ __kernel void CopyBufferToImage3d4Bytes(__global uchar *src,
}
__kernel void CopyBufferToImage3d3To4Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__write_only image3d_t output,
offset_t srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
uint upper2 = 0;
uint upper = *((__global uchar*)(src + LOffset + x * 3 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 3 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 3));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d8Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8));
if(( ulong )(src + srcOffset) & 0x00000007){
if(((ulong)(src + srcOffset)) & 0x00000007){
uint upper2 = *((__global uchar*)(src + LOffset + x * 8 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 8 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 8 + 1));
@@ -138,27 +139,26 @@ __kernel void CopyBufferToImage3d8Bytes(__global uchar *src,
}
__kernel void CopyBufferToImage3d6To8Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__write_only image3d_t output,
offset_t srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8));
uint upper2 = *((__global uchar*)(src + LOffset + x * 6 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 6 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 6 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 6));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = upper = 0;
lower2 = *((__global uchar*)(src + LOffset + x * 6 + 5));
lower = *((__global uchar*)(src + LOffset + x * 6 + 4));
@@ -170,15 +170,15 @@ __kernel void CopyBufferToImage3d6To8Bytes(__global uchar *src,
__kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 0);
@@ -211,16 +211,16 @@ __kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
}
__kernel void CopyBufferToImage3d16BytesAligned(__global uint4 *src,
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__write_only image3d_t output,
offset_t srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = src[(LOffset >> 4) + x];

View File

@@ -1,228 +0,0 @@
/*
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
__kernel void CopyBufferToImage3dBytesStateless(__global uchar *src,
__write_only image3d_t output,
ulong srcOffset,
int4 dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
write_imageui(output, dstCoord, (uint4)(*(src + LOffset + x), 0, 0, 1));
}
__kernel void CopyBufferToImage3d2BytesStateless(__global uchar *src,
__write_only image3d_t output,
ulong srcOffset,
int4 dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
if(( ulong )(src + srcOffset) & 0x00000001){
ushort upper = *((__global uchar*)(src + LOffset + x * 2 + 1));
ushort lower = *((__global uchar*)(src + LOffset + x * 2));
ushort combined = (upper << 8) | lower;
c.x = (uint)combined;
}
else{
c.x = (uint)(*(__global ushort*)(src + LOffset + x * 2));
}
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d4BytesStateless(__global uchar *src,
__write_only image3d_t output,
ulong srcOffset,
int4 dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
if(( ulong )(src + srcOffset) & 0x00000003){
uint upper2 = *((__global uchar*)(src + LOffset + x * 4 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 4 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 4 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 4));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
}
else{
c.x = (*(__global uint*)(src + LOffset + x * 4));
}
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d3To4BytesStateless(__global uchar *src,
__write_only image3d_t output,
ulong srcOffset,
int4 dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
uint upper2 = 0;
uint upper = *((__global uchar*)(src + LOffset + x * 3 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 3 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 3));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d8BytesStateless(__global uchar *src,
__write_only image3d_t output,
ulong srcOffset,
int4 dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8));
if(( ulong )(src + srcOffset) & 0x00000007){
uint upper2 = *((__global uchar*)(src + LOffset + x * 8 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 8 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 8 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 8));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 8 + 7));
upper = *((__global uchar*)(src + LOffset + x * 8 + 6));
lower2 = *((__global uchar*)(src + LOffset + x * 8 + 5));
lower = *((__global uchar*)(src + LOffset + x * 8 + 4));
combined = ((uint)upper2 << 24) | ((uint)upper << 16) | ((uint)lower2 << 8) | lower;
c.y = combined;
}
else{
c = *((__global uint2*)(src + LOffset + x * 8));
}
write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1));
}
__kernel void CopyBufferToImage3d6To8BytesStateless(__global uchar *src,
__write_only image3d_t output,
ulong srcOffset,
int4 dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8));
uint upper2 = *((__global uchar*)(src + LOffset + x * 6 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 6 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 6 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 6));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = upper = 0;
lower2 = *((__global uchar*)(src + LOffset + x * 6 + 5));
lower = *((__global uchar*)(src + LOffset + x * 6 + 4));
combined = ((uint)upper2 << 24) | ((uint)upper << 16) | ((uint)lower2 << 8) | lower;
c.y = combined;
write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1));
}
__kernel void CopyBufferToImage3d16BytesStateless(__global uchar *src,
__write_only image3d_t output,
ulong srcOffset,
int4 dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 0);
uint upper2 = *((__global uchar*)(src + LOffset + x * 16 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 16 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 16 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 16));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 7));
upper = *((__global uchar*)(src + LOffset + x * 16 + 6));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 5));
lower = *((__global uchar*)(src + LOffset + x * 16 + 4));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.y = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 11));
upper = *((__global uchar*)(src + LOffset + x * 16 + 10));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 9));
lower = *((__global uchar*)(src + LOffset + x * 16 + 8));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.z = combined;
upper2 = *((__global uchar*)(src + LOffset + x * 16 + 15));
upper = *((__global uchar*)(src + LOffset + x * 16 + 14));
lower2 = *((__global uchar*)(src + LOffset + x * 16 + 13));
lower = *((__global uchar*)(src + LOffset + x * 16 + 12));
combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.w = combined;
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d16BytesAlignedStateless(__global uint4 *src,
__write_only image3d_t output,
ulong srcOffset,
int4 dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
ulong LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = src[(LOffset >> 4) + x];
write_imageui(output, dstCoord, c);
}
)==="

View File

@@ -6,17 +6,19 @@
*/
R"===(
#include "kernel_types.h"
__kernel void CopyImage3dToBufferBytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*(dst + DstOffset + x) = convert_uchar_sat(c.x);
@@ -25,18 +27,18 @@ __kernel void CopyImage3dToBufferBytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x00000001){
if(((ulong)(dst + dstOffset)) & 0x00000001){
*((__global uchar*)(dst + DstOffset + x * 2 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 2)) = convert_uchar_sat(c.x & 0xff);
}
@@ -48,39 +50,37 @@ __kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer3Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
}
__kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x00000003){
if(((ulong)(dst + dstOffset)) & 0x00000003){
*((__global uchar*)(dst + DstOffset + x * 4 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 4 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 4 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
@@ -92,19 +92,19 @@ __kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input,
}
__kernel void CopyImage3dToBuffer4To3Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
@@ -113,40 +113,40 @@ __kernel void CopyImage3dToBuffer4To3Bytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer6Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
uint4 c = read_imageui(input, srcCoord);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.z & 0xff);
}
__kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x00000007){
if(((ulong)(dst + dstOffset)) & 0x00000007){
*((__global uchar*)(dst + DstOffset + x * 8 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
@@ -163,16 +163,16 @@ __kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input,
}
__kernel void CopyImage3dToBuffer8To6Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
@@ -188,14 +188,14 @@ __kernel void CopyImage3dToBuffer8To6Bytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
const uint4 c = read_imageui(input, srcCoord);
@@ -218,16 +218,16 @@ __kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
}
__kernel void CopyImage3dToBuffer16BytesAligned(__read_only image3d_t input,
__global uint4 *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__global uint4 *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
const uint4 c = read_imageui(input, srcCoord);

View File

@@ -1,238 +0,0 @@
/*
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
__kernel void CopyImage3dToBufferBytesStateless(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*(dst + DstOffset + x) = convert_uchar_sat(c.x);
}
__kernel void CopyImage3dToBuffer2BytesStateless(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x00000001){
*((__global uchar*)(dst + DstOffset + x * 2 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 2)) = convert_uchar_sat(c.x & 0xff);
}
else{
*((__global ushort*)(dst + DstOffset + x * 2)) = convert_ushort_sat(c.x);
}
}
__kernel void CopyImage3dToBuffer3BytesStateless(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
}
__kernel void CopyImage3dToBuffer4BytesStateless(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x00000003){
*((__global uchar*)(dst + DstOffset + x * 4 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 4 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 4 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 4)) = convert_uchar_sat(c.x & 0xff);
}
else{
*((__global uint*)(dst + DstOffset + x * 4)) = c.x;
}
}
__kernel void CopyImage3dToBuffer4To3BytesStateless(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
}
__kernel void CopyImage3dToBuffer6BytesStateless(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.z & 0xff);
}
__kernel void CopyImage3dToBuffer8BytesStateless(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x00000007){
*((__global uchar*)(dst + DstOffset + x * 8 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 4)) = convert_uchar_sat(c.y & 0xff);
}
else{
uint2 d = (uint2)(c.x,c.y);
*((__global uint2*)(dst + DstOffset + x * 8)) = d;
}
}
__kernel void CopyImage3dToBuffer8To6BytesStateless(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.y & 0xff);
}
__kernel void CopyImage3dToBuffer16BytesStateless(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
const uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 16 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 7)) = convert_uchar_sat((c.y >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 6)) = convert_uchar_sat((c.y >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 5)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 4)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 11)) = convert_uchar_sat((c.z >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 10)) = convert_uchar_sat((c.z >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 9)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 8)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 15)) = convert_uchar_sat((c.w >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 14)) = convert_uchar_sat((c.w >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 13)) = convert_uchar_sat((c.w >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 16 + 12)) = convert_uchar_sat(c.w & 0xff);
}
__kernel void CopyImage3dToBuffer16BytesAlignedStateless(__read_only image3d_t input,
__global uint4 *dst,
int4 srcOffset,
ulong dstOffset,
ulong2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
ulong DstOffset = (dstOffset + (y * Pitch.x) + (z * Pitch.y));
const uint4 c = read_imageui(input, srcCoord);
dst[(DstOffset >> 4) + x] = c;
}
)==="

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -8,87 +8,92 @@
R"===(
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
#include "kernel_types.h"
// assumption is local work size = pattern size
__kernel void FillBufferBytes(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uchar* pPattern )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint dstIndex = get_global_id(0) + dstOffsetInBytes;
uint srcIndex = get_local_id(0);
pDst[dstIndex] = pPattern[srcIndex];
idx_t gid = get_global_id(0);
idx_t lid = get_local_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[lid];
}
__kernel void FillBufferLeftLeftover(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uchar* pPattern,
const uint patternSizeInEls )
const offset_t patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
}
__kernel void FillBufferMiddle(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uint* pPattern,
const uint patternSizeInEls )
const offset_t patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[ gid & (patternSizeInEls - 1) ];
idx_t gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[gid & (patternSizeInEls - 1)];
}
__kernel void FillBufferRightLeftover(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uchar* pPattern,
const uint patternSizeInEls )
const offset_t patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
}
__kernel void FillBufferImmediate(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
uint gid = get_global_id(0);
idx_t gid = get_global_id(0);
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
uint gid = get_global_id(0);
idx_t gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferSSHOffset(
__global uchar* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const __global uchar* pPattern,
uint patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
offset_t patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
)
{
ALIGNED4(ptr);
ALIGNED4(pPattern);
uint dstIndex = get_global_id(0);
uint srcIndex = get_local_id(0);
idx_t dstIndex = get_global_id(0);
idx_t srcIndex = get_local_id(0);
__global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
__global uchar* pSrc = (__global uchar*)pPattern + patternSshOffset;
pDst[dstIndex] = pSrc[srcIndex];

View File

@@ -1,82 +0,0 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
// assumption is local work size = pattern size
__kernel void FillBufferBytesStateless(
__global uchar* pDst,
ulong dstOffsetInBytes,
const __global uchar* pPattern )
{
size_t dstIndex = get_global_id(0) + dstOffsetInBytes;
size_t srcIndex = get_local_id(0);
pDst[dstIndex] = pPattern[srcIndex];
}
__kernel void FillBufferLeftLeftoverStateless(
__global uchar* pDst,
ulong dstOffsetInBytes,
const __global uchar* pPattern,
const ulong patternSizeInEls )
{
size_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
}
__kernel void FillBufferMiddleStateless(
__global uchar* pDst,
ulong dstOffsetInBytes,
const __global uint* pPattern,
const ulong patternSizeInEls )
{
size_t gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[ gid & (patternSizeInEls - 1) ];
}
__kernel void FillBufferRightLeftoverStateless(
__global uchar* pDst,
ulong dstOffsetInBytes,
const __global uchar* pPattern,
const ulong patternSizeInEls )
{
size_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
}
__kernel void FillBufferImmediateStateless(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
size_t gid = get_global_id(0);
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateLeftOverStateless(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
size_t gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferSSHOffsetStateless(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const __global uchar* pPattern,
ulong patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
)
{
size_t dstIndex = get_global_id(0);
size_t srcIndex = get_local_id(0);
__global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
__global uchar* pSrc = (__global uchar*)pPattern + patternSshOffset;
pDst[dstIndex] = pSrc[srcIndex];
}
)==="

View File

@@ -0,0 +1,21 @@
/*
* Copyright (C) 2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#ifndef KERNEL_TYPES_H
#define KERNEL_TYPES_H 1
#ifdef WIDE_STATELESS
typedef ulong idx_t;
typedef ulong2 coord2_t;
typedef ulong4 coord4_t;
typedef ulong offset_t;
#else
typedef uint idx_t;
typedef uint2 coord2_t;
typedef uint4 coord4_t;
typedef uint offset_t;
#endif
#endif

View File

@@ -26,7 +26,7 @@ static RegisterEmbeddedResource registerCopyBufferToBufferSrc(
copyBufferToBufferSrcSize);
static constexpr const char copyBufferToBufferStatelessSrc[] =
#include "shared/source/built_ins/kernels/copy_buffer_to_buffer_stateless.builtin_kernel"
#include "shared/source/built_ins/kernels/copy_buffer_to_buffer.builtin_kernel"
;
static constexpr size_t copyBufferToBufferStatelessSrcSize = sizeof(copyBufferToBufferStatelessSrc);
@@ -39,7 +39,7 @@ static RegisterEmbeddedResource registerCopyBufferToBufferStatelessSrc(
copyBufferToBufferStatelessSrcSize);
static constexpr const char copyBufferToBufferStatelessHeaplessSrc[] =
#include "shared/source/built_ins/kernels/copy_buffer_to_buffer_stateless.builtin_kernel"
#include "shared/source/built_ins/kernels/copy_buffer_to_buffer.builtin_kernel"
;
static constexpr size_t copyBufferToBufferStatelessHeaplessSrcSize = sizeof(copyBufferToBufferStatelessHeaplessSrc);
@@ -65,7 +65,7 @@ static RegisterEmbeddedResource registerCopyBufferRectSrc(
copyBufferRectSrcSize);
static constexpr const char copyBufferRectStatelessSrc[] =
#include "shared/source/built_ins/kernels/copy_buffer_rect_stateless.builtin_kernel"
#include "shared/source/built_ins/kernels/copy_buffer_rect.builtin_kernel"
;
static constexpr size_t copyBufferRectStatelessSrcSize = sizeof(copyBufferRectStatelessSrc);
@@ -78,7 +78,7 @@ static RegisterEmbeddedResource registerCopyBufferRectStatelessSrc(
copyBufferRectStatelessSrcSize);
static constexpr const char copyBufferRectStatelessHeaplessSrc[] =
#include "shared/source/built_ins/kernels/copy_buffer_rect_stateless.builtin_kernel"
#include "shared/source/built_ins/kernels/copy_buffer_rect.builtin_kernel"
;
static constexpr size_t copyBufferRectStatelessHeaplessSrcSize = sizeof(copyBufferRectStatelessHeaplessSrc);
@@ -104,7 +104,7 @@ static RegisterEmbeddedResource registerFillBufferSrc(
fillBufferSrcSize);
static constexpr const char fillBufferStatelessSrc[] =
#include "shared/source/built_ins/kernels/fill_buffer_stateless.builtin_kernel"
#include "shared/source/built_ins/kernels/fill_buffer.builtin_kernel"
;
static constexpr size_t fillBufferStatelessSrcSize = sizeof(fillBufferStatelessSrc);
@@ -117,7 +117,7 @@ static RegisterEmbeddedResource registerFillBufferStatelessSrc(
fillBufferStatelessSrcSize);
static constexpr const char fillBufferStatelessHeaplessSrc[] =
#include "shared/source/built_ins/kernels/fill_buffer_stateless.builtin_kernel"
#include "shared/source/built_ins/kernels/fill_buffer.builtin_kernel"
;
static constexpr size_t fillBufferStatelessHeaplessSrcSize = sizeof(fillBufferStatelessHeaplessSrc);
@@ -143,7 +143,7 @@ static RegisterEmbeddedResource registerCopyBufferToImage3dSrc(
copyBufferToImage3dSrcSize);
static constexpr const char copyBufferToImage3dStatelessSrc[] =
#include "shared/source/built_ins/kernels/copy_buffer_to_image3d_stateless.builtin_kernel"
#include "shared/source/built_ins/kernels/copy_buffer_to_image3d.builtin_kernel"
;
static constexpr size_t copyBufferToImage3dStatelessSrcSize = sizeof(copyBufferToImage3dStatelessSrc);
@@ -169,7 +169,7 @@ static RegisterEmbeddedResource registerCopyImage3dToBufferSrc(
copyImage3dToBufferSrcSize);
static constexpr const char copyImage3dToBufferStatelessSrc[] =
#include "shared/source/built_ins/kernels/copy_image3d_to_buffer_stateless.builtin_kernel"
#include "shared/source/built_ins/kernels/copy_image3d_to_buffer.builtin_kernel"
;
static constexpr size_t copyImage3dToBufferStatelessSrcSize = sizeof(copyImage3dToBufferStatelessSrc);

View File

@@ -45,7 +45,10 @@ inline constexpr ConstStringRef useCMCompiler = "-cmc";
inline constexpr ConstStringRef enableFP64GenEmu = "-cl-fp64-gen-emu";
inline constexpr ConstStringRef enableDivergentBarriers = "-cl-intel-enable-divergent-barrier-handling";
inline constexpr ConstStringRef optDisableSendWarWa = "-ze-opt-disable-sendwarwa";
inline constexpr ConstStringRef statelessAddr = "-DWIDE_STATELESS=1";
inline const std::string wideStatelessOptions =
std::string(greaterThan4gbBuffersRequired) + " " + std::string(statelessAddr);
inline const std::string statelessOptions = greaterThan4gbBuffersRequired.str();
inline constexpr size_t nullterminateSize = 1U;
inline constexpr size_t spaceSeparatorSize = 1U;

View File

@@ -7,5 +7,5 @@
#include "shared/test/common/helpers/kernel_binary_helper.h"
const std::string KernelBinaryHelper::BUILT_INS("6133084427540774618");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("10619261412647190096_images");
const std::string KernelBinaryHelper::BUILT_INS("9138976784737507795");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("15961210520135641457_images");

View File

@@ -11,14 +11,16 @@ __kernel void fullCopy(__global const uint* src, __global uint* dst) {
vstore4(loaded, gid, dst);
}
#include "kernel_types.h"
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
__kernel void CopyBufferToBufferBytes(
const __global uchar* pSrc,
__global uchar* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint bytesToRead )
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
offset_t bytesToRead )
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
@@ -30,24 +32,24 @@ __kernel void CopyBufferToBufferBytes(
__kernel void CopyBufferToBufferLeftLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes)
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void CopyBufferToBufferMiddle(
const __global uint* pSrc,
__global uint* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes)
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
uint4 loaded = vload4(gid, pSrc);
@@ -57,17 +59,17 @@ __kernel void CopyBufferToBufferMiddle(
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
uint misalignmentInBits)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
const size_t gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
const uint4 src1 = vload4((gid + 1), pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
@@ -80,32 +82,33 @@ __kernel void CopyBufferToBufferMiddleMisaligned(
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes)
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void copyBufferToBufferBytesSingle(__global uchar *dst, const __global uchar *src) {
ALIGNED4(dst);
ALIGNED4(src);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
dst[gid] = (uchar)(src[gid]);
}
__kernel void CopyBufferToBufferSideRegion(
__global uchar* pDst,
const __global uchar* pSrc,
unsigned int len,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
idx_t len,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
__global uchar* pDstWithOffset = (__global uchar*)((__global uchar*)pDst + dstSshOffset);
__global uchar* pSrcWithOffset = (__global uchar*)((__global uchar*)pSrc + srcSshOffset);
if (gid < len) {
@@ -116,14 +119,14 @@ __kernel void CopyBufferToBufferSideRegion(
__kernel void CopyBufferToBufferMiddleRegion(
__global uint* pDst,
const __global uint* pSrc,
unsigned int elems,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
idx_t elems,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
__global uint* pDstWithOffset = (__global uint*)((__global uchar*)pDst + dstSshOffset);
__global uint* pSrcWithOffset = (__global uint*)((__global uchar*)pSrc + srcSshOffset);
if (gid < elems) {
@@ -132,179 +135,181 @@ __kernel void CopyBufferToBufferMiddleRegion(
}
}
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
#include "kernel_types.h"
// assumption is local work size = pattern size
__kernel void FillBufferBytes(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uchar* pPattern )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint dstIndex = get_global_id(0) + dstOffsetInBytes;
uint srcIndex = get_local_id(0);
pDst[dstIndex] = pPattern[srcIndex];
idx_t gid = get_global_id(0);
idx_t lid = get_local_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[lid];
}
__kernel void FillBufferLeftLeftover(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uchar* pPattern,
const uint patternSizeInEls )
const offset_t patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
}
__kernel void FillBufferMiddle(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uint* pPattern,
const uint patternSizeInEls )
const offset_t patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[ gid & (patternSizeInEls - 1) ];
idx_t gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[gid & (patternSizeInEls - 1)];
}
__kernel void FillBufferRightLeftover(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uchar* pPattern,
const uint patternSizeInEls )
const offset_t patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
}
__kernel void FillBufferImmediate(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
uint gid = get_global_id(0);
idx_t gid = get_global_id(0);
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
uint gid = get_global_id(0);
idx_t gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferSSHOffset(
__global uchar* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const __global uchar* pPattern,
uint patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
offset_t patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
)
{
ALIGNED4(ptr);
ALIGNED4(pPattern);
uint dstIndex = get_global_id(0);
uint srcIndex = get_local_id(0);
idx_t dstIndex = get_global_id(0);
idx_t srcIndex = get_local_id(0);
__global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
__global uchar* pSrc = (__global uchar*)pPattern + patternSshOffset;
pDst[dstIndex] = pSrc[srcIndex];
}
#include "kernel_types.h"
__kernel void CopyBufferRectBytes2d(
__global const char* src,
__global char* dst,
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle2d(
const __global uint* src,
__global uint* dst,
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
}
__kernel void CopyBufferRectBytes3d(
__global const char* src,
__global char* dst,
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
__global const char* src,
__global char* dst,
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle3d(
const __global uint* src,
__global uint* dst,
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
}
void SetDstData(__global ulong* dst, uint currentOffset, ulong contextStart, ulong globalStart, ulong contextEnd, ulong globalEnd, uint useOnlyGlobalTimestamps) {
dst[currentOffset] = globalStart;
dst[currentOffset + 1] = globalEnd;
@@ -535,36 +540,38 @@ __kernel void CopyImage1dBufferToImage1dBuffer(
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
#include "kernel_types.h"
__kernel void CopyBufferToImage3dBytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
write_imageui(output, dstCoord, (uint4)(*(src + LOffset + x), 0, 0, 1));
}
__kernel void CopyBufferToImage3d2Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
if(( ulong )(src + srcOffset) & 0x00000001){
if(((ulong)(src + srcOffset)) & 0x00000001){
ushort upper = *((__global uchar*)(src + LOffset + x * 2 + 1));
ushort lower = *((__global uchar*)(src + LOffset + x * 2));
ushort combined = (upper << 8) | lower;
@@ -578,19 +585,19 @@ __kernel void CopyBufferToImage3d2Bytes(__global uchar *src,
__kernel void CopyBufferToImage3d4Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
if(( ulong )(src + srcOffset) & 0x00000003){
if(((ulong)(src + srcOffset)) & 0x00000003){
uint upper2 = *((__global uchar*)(src + LOffset + x * 4 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 4 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 4 + 1));
@@ -605,45 +612,44 @@ __kernel void CopyBufferToImage3d4Bytes(__global uchar *src,
}
__kernel void CopyBufferToImage3d3To4Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__write_only image3d_t output,
offset_t srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
uint upper2 = 0;
uint upper = *((__global uchar*)(src + LOffset + x * 3 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 3 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 3));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d8Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8));
if(( ulong )(src + srcOffset) & 0x00000007){
if(((ulong)(src + srcOffset)) & 0x00000007){
uint upper2 = *((__global uchar*)(src + LOffset + x * 8 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 8 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 8 + 1));
@@ -665,27 +671,26 @@ __kernel void CopyBufferToImage3d8Bytes(__global uchar *src,
}
__kernel void CopyBufferToImage3d6To8Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__write_only image3d_t output,
offset_t srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8));
uint upper2 = *((__global uchar*)(src + LOffset + x * 6 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 6 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 6 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 6));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = upper = 0;
lower2 = *((__global uchar*)(src + LOffset + x * 6 + 5));
lower = *((__global uchar*)(src + LOffset + x * 6 + 4));
@@ -697,15 +702,15 @@ __kernel void CopyBufferToImage3d6To8Bytes(__global uchar *src,
__kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
__write_only image3d_t output,
int srcOffset,
offset_t srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 0);
@@ -738,33 +743,35 @@ __kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
}
__kernel void CopyBufferToImage3d16BytesAligned(__global uint4 *src,
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__write_only image3d_t output,
offset_t srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = src[(LOffset >> 4) + x];
write_imageui(output, dstCoord, c);
}
#include "kernel_types.h"
__kernel void CopyImage3dToBufferBytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*(dst + DstOffset + x) = convert_uchar_sat(c.x);
@@ -773,18 +780,18 @@ __kernel void CopyImage3dToBufferBytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x00000001){
if(((ulong)(dst + dstOffset)) & 0x00000001){
*((__global uchar*)(dst + DstOffset + x * 2 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 2)) = convert_uchar_sat(c.x & 0xff);
}
@@ -796,39 +803,37 @@ __kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer3Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
}
__kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x00000003){
if(((ulong)(dst + dstOffset)) & 0x00000003){
*((__global uchar*)(dst + DstOffset + x * 4 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 4 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 4 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
@@ -840,19 +845,19 @@ __kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input,
}
__kernel void CopyImage3dToBuffer4To3Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
@@ -861,40 +866,40 @@ __kernel void CopyImage3dToBuffer4To3Bytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer6Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
uint4 c = read_imageui(input, srcCoord);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.z & 0xff);
}
__kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(( ulong )(dst + dstOffset) & 0x00000007){
if(((ulong)(dst + dstOffset)) & 0x00000007){
*((__global uchar*)(dst + DstOffset + x * 8 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
@@ -911,16 +916,16 @@ __kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input,
}
__kernel void CopyImage3dToBuffer8To6Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
@@ -936,14 +941,14 @@ __kernel void CopyImage3dToBuffer8To6Bytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
const uint4 c = read_imageui(input, srcCoord);
@@ -966,16 +971,16 @@ __kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
}
__kernel void CopyImage3dToBuffer16BytesAligned(__read_only image3d_t input,
__global uint4 *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
__global uint4 *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
const uint4 c = read_imageui(input, srcCoord);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2024-2025 Intel Corporation
* Copyright (C) 2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -11,14 +11,16 @@ __kernel void fullCopy(__global const uint* src, __global uint* dst) {
vstore4(loaded, gid, dst);
}
#include "kernel_types.h"
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
__kernel void CopyBufferToBufferBytes(
const __global uchar* pSrc,
__global uchar* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint bytesToRead )
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
offset_t bytesToRead )
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
@@ -30,24 +32,24 @@ __kernel void CopyBufferToBufferBytes(
__kernel void CopyBufferToBufferLeftLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes)
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void CopyBufferToBufferMiddle(
const __global uint* pSrc,
__global uint* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes)
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
uint4 loaded = vload4(gid, pSrc);
@@ -57,17 +59,17 @@ __kernel void CopyBufferToBufferMiddle(
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
uint misalignmentInBits)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
const size_t gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
const uint4 src1 = vload4((gid + 1), pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
@@ -80,32 +82,33 @@ __kernel void CopyBufferToBufferMiddleMisaligned(
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
uint srcOffsetInBytes,
uint dstOffsetInBytes)
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void copyBufferToBufferBytesSingle(__global uchar *dst, const __global uchar *src) {
ALIGNED4(dst);
ALIGNED4(src);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
dst[gid] = (uchar)(src[gid]);
}
__kernel void CopyBufferToBufferSideRegion(
__global uchar* pDst,
const __global uchar* pSrc,
unsigned int len,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
idx_t len,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
__global uchar* pDstWithOffset = (__global uchar*)((__global uchar*)pDst + dstSshOffset);
__global uchar* pSrcWithOffset = (__global uchar*)((__global uchar*)pSrc + srcSshOffset);
if (gid < len) {
@@ -116,14 +119,14 @@ __kernel void CopyBufferToBufferSideRegion(
__kernel void CopyBufferToBufferMiddleRegion(
__global uint* pDst,
const __global uint* pSrc,
unsigned int elems,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
idx_t elems,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
unsigned int gid = get_global_id(0);
idx_t gid = get_global_id(0);
__global uint* pDstWithOffset = (__global uint*)((__global uchar*)pDst + dstSshOffset);
__global uint* pSrcWithOffset = (__global uint*)((__global uchar*)pSrc + srcSshOffset);
if (gid < elems) {
@@ -132,179 +135,181 @@ __kernel void CopyBufferToBufferMiddleRegion(
}
}
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
#include "kernel_types.h"
// assumption is local work size = pattern size
__kernel void FillBufferBytes(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uchar* pPattern )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint dstIndex = get_global_id(0) + dstOffsetInBytes;
uint srcIndex = get_local_id(0);
pDst[dstIndex] = pPattern[srcIndex];
idx_t gid = get_global_id(0);
idx_t lid = get_local_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[lid];
}
__kernel void FillBufferLeftLeftover(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uchar* pPattern,
const uint patternSizeInEls )
const offset_t patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
}
__kernel void FillBufferMiddle(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uint* pPattern,
const uint patternSizeInEls )
const offset_t patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[ gid & (patternSizeInEls - 1) ];
idx_t gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[gid & (patternSizeInEls - 1)];
}
__kernel void FillBufferRightLeftover(
__global uchar* pDst,
uint dstOffsetInBytes,
offset_t dstOffsetInBytes,
const __global uchar* pPattern,
const uint patternSizeInEls )
const offset_t patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
}
__kernel void FillBufferImmediate(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
uint gid = get_global_id(0);
idx_t gid = get_global_id(0);
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
uint gid = get_global_id(0);
idx_t gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferSSHOffset(
__global uchar* ptr,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const __global uchar* pPattern,
uint patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
offset_t patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
)
{
ALIGNED4(ptr);
ALIGNED4(pPattern);
uint dstIndex = get_global_id(0);
uint srcIndex = get_local_id(0);
idx_t dstIndex = get_global_id(0);
idx_t srcIndex = get_local_id(0);
__global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
__global uchar* pSrc = (__global uchar*)pPattern + patternSshOffset;
pDst[dstIndex] = pSrc[srcIndex];
}
#include "kernel_types.h"
__kernel void CopyBufferRectBytes2d(
__global const char* src,
__global char* dst,
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle2d(
const __global uint* src,
__global uint* dst,
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
}
__kernel void CopyBufferRectBytes3d(
__global const char* src,
__global char* dst,
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
__global const char* src,
__global char* dst,
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle3d(
const __global uint* src,
__global uint* dst,
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
}
void SetDstData(__global ulong* dst, uint currentOffset, ulong contextStart, ulong globalStart, ulong contextEnd, ulong globalEnd, uint useOnlyGlobalTimestamps) {
dst[currentOffset] = globalStart;
dst[currentOffset + 1] = globalEnd;

View File

@@ -5,7 +5,7 @@
#
set(SHARED_TEST_PROJECTS_SUB_FOLDER "prepare test files")
function(compile_kernels_gen device revision_id platform_name use_stateless_suffix use_heapless)
function(compile_kernels_gen device revision_id platform_name use_stateless use_heapless)
set(outputdir "${TargetDir}/${platform_name}/${revision_id}/test_files/${NEO_ARCH}/")
set(compiled_kernels)
@@ -19,8 +19,8 @@ function(compile_kernels_gen device revision_id platform_name use_stateless_suff
set(outputname_base "${basename}_${platform_name}")
if(${use_heapless})
set(outputname_base "${outputname_base}-heapless_")
elseif(${use_stateless_suffix})
set(outputname_base "${outputname_base}-cl-intel-greater-than-4GB-buffer-required_")
elseif(${use_stateless})
set(outputname_base "${outputname_base}-cl-intel-greater-than-4GB-buffer-required_-DWIDE_STATELESS=1_")
endif()
set(outputpath_base "${outputdir}${outputname_base}")
@@ -38,9 +38,14 @@ function(compile_kernels_gen device revision_id platform_name use_stateless_suff
set(internal_options "${HEAPLESS_INTERNAL_OPTIONS}")
endif()
if(${use_stateless} OR ${use_heapless})
list(APPEND __ocloc__options__ "-DWIDE_STATELESS=1")
endif()
list(APPEND __ocloc__options__ "-I${NEO_SOURCE_DIR}/shared/source/built_ins/kernels")
add_custom_command(
OUTPUT ${output_files}
COMMAND ${ocloc_cmd_prefix} -file ${absolute_filepath} -device ${device} -heapless_mode ${heapless_mode} -internal_options ${internal_options} -${NEO_BITS} -out_dir ${outputdir} -output_no_suffix -output ${outputname_base} -revision_id ${revision_id}
COMMAND ${ocloc_cmd_prefix} -file ${absolute_filepath} -device ${device} -heapless_mode ${heapless_mode} -internal_options ${internal_options} -${NEO_BITS} -out_dir ${outputdir} -output_no_suffix -output ${outputname_base} -revision_id ${revision_id} -options "$<JOIN:${__ocloc__options__}, >"
WORKING_DIRECTORY ${workdir}
DEPENDS ${filepath} ocloc copy_compiler_files
)
@@ -79,6 +84,7 @@ add_dependencies(prepare_test_kernels_for_shared copy_compiler_files)
macro(macro_for_each_platform)
set(KERNELS_TO_COMPILE ${TEST_KERNELS})
set(IMAGE_SUPPORT FALSE)
CORE_CONTAINS_PLATFORM("SUPPORTED_IMAGES" ${CORE_TYPE} ${PLATFORM_IT} IMAGE_SUPPORT)
if(NOT ${IMAGE_SUPPORT})

View File

@@ -84,7 +84,7 @@ HWTEST2_F(BuiltInSharedTest, GivenStatelessBuiltinWhenGettingResourceNameThenAdd
auto resourceNames = getBuiltinResourceNames(EBuiltInOps::copyBufferToBufferStateless, BuiltinCode::ECodeType::binary, *pDevice);
std::string expectedResourceNameGeneric = "stateless_copy_buffer_to_buffer_stateless.builtin_kernel.bin";
std::string expectedResourceNameGeneric = "stateless_copy_buffer_to_buffer.builtin_kernel.bin";
std::string expectedResourceNameForRelease = deviceIpString + "_" + expectedResourceNameGeneric;
EXPECT_EQ(1u, resourceNames.size());
@@ -103,7 +103,7 @@ HWTEST2_F(BuiltInSharedTest, GivenPlatformWithoutStatefulAddresingSupportWhenGet
{
auto resourceNames = getBuiltinResourceNames(EBuiltInOps::copyBufferToBufferStateless, BuiltinCode::ECodeType::binary, *pDevice);
std::string expectedResourceName = deviceIpString + "_stateless_copy_buffer_to_buffer_stateless.builtin_kernel.bin";
std::string expectedResourceName = deviceIpString + "_stateless_copy_buffer_to_buffer.builtin_kernel.bin";
EXPECT_EQ(1u, resourceNames.size());
EXPECT_EQ(resourceNames[0], expectedResourceName);
}
@@ -130,20 +130,20 @@ TEST_F(BuiltInSharedTest, GivenValidBuiltinTypeAndExtensionWhenCreatingBuiltinRe
const std::pair<EBuiltInOps::Type, const char *> testCases[] = {
{EBuiltInOps::auxTranslation, "aux_translation.builtin_kernel"},
{EBuiltInOps::copyBufferToBuffer, "copy_buffer_to_buffer.builtin_kernel"},
{EBuiltInOps::copyBufferToBufferStateless, "copy_buffer_to_buffer_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferToBufferStatelessHeapless, "copy_buffer_to_buffer_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferToBufferStateless, "copy_buffer_to_buffer.builtin_kernel"},
{EBuiltInOps::copyBufferToBufferStatelessHeapless, "copy_buffer_to_buffer.builtin_kernel"},
{EBuiltInOps::copyBufferRect, "copy_buffer_rect.builtin_kernel"},
{EBuiltInOps::copyBufferRectStateless, "copy_buffer_rect_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferRectStatelessHeapless, "copy_buffer_rect_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferRectStateless, "copy_buffer_rect.builtin_kernel"},
{EBuiltInOps::copyBufferRectStatelessHeapless, "copy_buffer_rect.builtin_kernel"},
{EBuiltInOps::fillBuffer, "fill_buffer.builtin_kernel"},
{EBuiltInOps::fillBufferStateless, "fill_buffer_stateless.builtin_kernel"},
{EBuiltInOps::fillBufferStatelessHeapless, "fill_buffer_stateless.builtin_kernel"},
{EBuiltInOps::fillBufferStateless, "fill_buffer.builtin_kernel"},
{EBuiltInOps::fillBufferStatelessHeapless, "fill_buffer.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3d, "copy_buffer_to_image3d.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3dStateless, "copy_buffer_to_image3d_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3dHeapless, "copy_buffer_to_image3d_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3dStateless, "copy_buffer_to_image3d.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3dHeapless, "copy_buffer_to_image3d.builtin_kernel"},
{EBuiltInOps::copyImage3dToBuffer, "copy_image3d_to_buffer.builtin_kernel"},
{EBuiltInOps::copyImage3dToBufferStateless, "copy_image3d_to_buffer_stateless.builtin_kernel"},
{EBuiltInOps::copyImage3dToBufferHeapless, "copy_image3d_to_buffer_stateless.builtin_kernel"},
{EBuiltInOps::copyImage3dToBufferStateless, "copy_image3d_to_buffer.builtin_kernel"},
{EBuiltInOps::copyImage3dToBufferHeapless, "copy_image3d_to_buffer.builtin_kernel"},
{EBuiltInOps::copyImageToImage1d, "copy_image_to_image1d.builtin_kernel"},
{EBuiltInOps::copyImageToImage1dHeapless, "copy_image_to_image1d.builtin_kernel"},
{EBuiltInOps::copyImageToImage2d, "copy_image_to_image2d.builtin_kernel"},
@@ -195,9 +195,9 @@ TEST_F(BuiltInSharedTest, GivenHeaplessModeEnabledWhenGetBuiltinResourceNamesIsC
};
TestParam params[] = {
{"copy_buffer_to_buffer_stateless", EBuiltInOps::copyBufferToBufferStatelessHeapless},
{"copy_buffer_rect_stateless", EBuiltInOps::copyBufferRectStatelessHeapless},
{"fill_buffer_stateless", EBuiltInOps::fillBufferStatelessHeapless}};
{"copy_buffer_to_buffer", EBuiltInOps::copyBufferToBufferStatelessHeapless},
{"copy_buffer_rect", EBuiltInOps::copyBufferRectStatelessHeapless},
{"fill_buffer", EBuiltInOps::fillBufferStatelessHeapless}};
for (auto &[builtInTypeAsString, builtInType] : params) {