Revert "refactor: unify stateless and not stateless builtins kernels"

This reverts commit 4eb37124cb.

Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
This commit is contained in:
Compute-Runtime-Validation
2025-11-18 15:46:41 +01:00
committed by Compute-Runtime-Automation
parent c3e98e346a
commit 575f48f738
36 changed files with 2458 additions and 1251 deletions

View File

@@ -7,5 +7,5 @@
#include "shared/test/common/helpers/kernel_binary_helper.h"
const std::string KernelBinaryHelper::BUILT_INS("9138976784737507795");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("15961210520135641457_images");
const std::string KernelBinaryHelper::BUILT_INS("6133084427540774618");
const std::string KernelBinaryHelper::BUILT_INS_WITH_IMAGES("10619261412647190096_images");

View File

@@ -11,16 +11,14 @@ __kernel void fullCopy(__global const uint* src, __global uint* dst) {
vstore4(loaded, gid, dst);
}
#include "kernel_types.h"
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
__kernel void CopyBufferToBufferBytes(
const __global uchar* pSrc,
__global uchar* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
offset_t bytesToRead )
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint bytesToRead )
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
@@ -32,24 +30,24 @@ __kernel void CopyBufferToBufferBytes(
__kernel void CopyBufferToBufferLeftLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
uint srcOffsetInBytes,
uint dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void CopyBufferToBufferMiddle(
const __global uint* pSrc,
__global uint* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
uint srcOffsetInBytes,
uint dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
uint4 loaded = vload4(gid, pSrc);
@@ -59,17 +57,17 @@ __kernel void CopyBufferToBufferMiddle(
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint misalignmentInBits)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
const size_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4((gid + 1), pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
@@ -82,33 +80,32 @@ __kernel void CopyBufferToBufferMiddleMisaligned(
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
uint srcOffsetInBytes,
uint dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void copyBufferToBufferBytesSingle(__global uchar *dst, const __global uchar *src) {
ALIGNED4(dst);
ALIGNED4(src);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
dst[gid] = (uchar)(src[gid]);
}
__kernel void CopyBufferToBufferSideRegion(
__global uchar* pDst,
const __global uchar* pSrc,
idx_t len,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
unsigned int len,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
__global uchar* pDstWithOffset = (__global uchar*)((__global uchar*)pDst + dstSshOffset);
__global uchar* pSrcWithOffset = (__global uchar*)((__global uchar*)pSrc + srcSshOffset);
if (gid < len) {
@@ -119,14 +116,14 @@ __kernel void CopyBufferToBufferSideRegion(
__kernel void CopyBufferToBufferMiddleRegion(
__global uint* pDst,
const __global uint* pSrc,
idx_t elems,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
unsigned int elems,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
__global uint* pDstWithOffset = (__global uint*)((__global uchar*)pDst + dstSshOffset);
__global uint* pSrcWithOffset = (__global uint*)((__global uchar*)pSrc + srcSshOffset);
if (gid < elems) {
@@ -135,181 +132,179 @@ __kernel void CopyBufferToBufferMiddleRegion(
}
}
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
#include "kernel_types.h"
// assumption is local work size = pattern size
__kernel void FillBufferBytes(
__global uchar* pDst,
offset_t dstOffsetInBytes,
uint dstOffsetInBytes,
const __global uchar* pPattern )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
idx_t gid = get_global_id(0);
idx_t lid = get_local_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[lid];
uint dstIndex = get_global_id(0) + dstOffsetInBytes;
uint srcIndex = get_local_id(0);
pDst[dstIndex] = pPattern[srcIndex];
}
__kernel void FillBufferLeftLeftover(
__global uchar* pDst,
offset_t dstOffsetInBytes,
uint dstOffsetInBytes,
const __global uchar* pPattern,
const offset_t patternSizeInEls )
const uint patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
}
__kernel void FillBufferMiddle(
__global uchar* pDst,
offset_t dstOffsetInBytes,
uint dstOffsetInBytes,
const __global uint* pPattern,
const offset_t patternSizeInEls )
const uint patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
idx_t gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[gid & (patternSizeInEls - 1)];
uint gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[ gid & (patternSizeInEls - 1) ];
}
__kernel void FillBufferRightLeftover(
__global uchar* pDst,
offset_t dstOffsetInBytes,
uint dstOffsetInBytes,
const __global uchar* pPattern,
const offset_t patternSizeInEls )
const uint patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
}
__kernel void FillBufferImmediate(
__global uchar* ptr,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
idx_t gid = get_global_id(0);
uint gid = get_global_id(0);
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
idx_t gid = get_global_id(0);
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferSSHOffset(
__global uchar* ptr,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const __global uchar* pPattern,
offset_t patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
uint patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
)
{
ALIGNED4(ptr);
ALIGNED4(pPattern);
idx_t dstIndex = get_global_id(0);
idx_t srcIndex = get_local_id(0);
uint dstIndex = get_global_id(0);
uint srcIndex = get_local_id(0);
__global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
__global uchar* pSrc = (__global uchar*)pPattern + patternSshOffset;
pDst[dstIndex] = pSrc[srcIndex];
}
#include "kernel_types.h"
__kernel void CopyBufferRectBytes2d(
__global const char* src,
__global char* dst,
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
{
int x = get_global_id(0);
int y = get_global_id(1);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle2d(
const __global uint* src,
__global uint* dst,
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
{
int x = get_global_id(0);
int y = get_global_id(1);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
}
__kernel void CopyBufferRectBytes3d(
__global const char* src,
__global char* dst,
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
__global const char* src,
__global char* dst,
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle3d(
const __global uint* src,
__global uint* dst,
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
}
void SetDstData(__global ulong* dst, uint currentOffset, ulong contextStart, ulong globalStart, ulong contextEnd, ulong globalEnd, uint useOnlyGlobalTimestamps) {
dst[currentOffset] = globalStart;
dst[currentOffset + 1] = globalEnd;
@@ -540,38 +535,36 @@ __kernel void CopyImage1dBufferToImage1dBuffer(
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
#include "kernel_types.h"
__kernel void CopyBufferToImage3dBytes(__global uchar *src,
__write_only image3d_t output,
offset_t srcOffset,
int srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
write_imageui(output, dstCoord, (uint4)(*(src + LOffset + x), 0, 0, 1));
}
__kernel void CopyBufferToImage3d2Bytes(__global uchar *src,
__write_only image3d_t output,
offset_t srcOffset,
int srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
if(((ulong)(src + srcOffset)) & 0x00000001){
if(( ulong )(src + srcOffset) & 0x00000001){
ushort upper = *((__global uchar*)(src + LOffset + x * 2 + 1));
ushort lower = *((__global uchar*)(src + LOffset + x * 2));
ushort combined = (upper << 8) | lower;
@@ -585,19 +578,19 @@ __kernel void CopyBufferToImage3d2Bytes(__global uchar *src,
__kernel void CopyBufferToImage3d4Bytes(__global uchar *src,
__write_only image3d_t output,
offset_t srcOffset,
int srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
if(((ulong)(src + srcOffset)) & 0x00000003){
if(( ulong )(src + srcOffset) & 0x00000003){
uint upper2 = *((__global uchar*)(src + LOffset + x * 4 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 4 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 4 + 1));
@@ -612,44 +605,45 @@ __kernel void CopyBufferToImage3d4Bytes(__global uchar *src,
}
__kernel void CopyBufferToImage3d3To4Bytes(__global uchar *src,
__write_only image3d_t output,
offset_t srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 1);
uint upper2 = 0;
uint upper = *((__global uchar*)(src + LOffset + x * 3 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 3 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 3));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
write_imageui(output, dstCoord, c);
}
__kernel void CopyBufferToImage3d8Bytes(__global uchar *src,
__write_only image3d_t output,
offset_t srcOffset,
int srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8));
if(((ulong)(src + srcOffset)) & 0x00000007){
if(( ulong )(src + srcOffset) & 0x00000007){
uint upper2 = *((__global uchar*)(src + LOffset + x * 8 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 8 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 8 + 1));
@@ -671,26 +665,27 @@ __kernel void CopyBufferToImage3d8Bytes(__global uchar *src,
}
__kernel void CopyBufferToImage3d6To8Bytes(__global uchar *src,
__write_only image3d_t output,
offset_t srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint2 c = (uint2)(0, 0);//*((__global uint2*)(src + LOffset + x * 8));
uint upper2 = *((__global uchar*)(src + LOffset + x * 6 + 3));
uint upper = *((__global uchar*)(src + LOffset + x * 6 + 2));
uint lower2 = *((__global uchar*)(src + LOffset + x * 6 + 1));
uint lower = *((__global uchar*)(src + LOffset + x * 6));
uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
c.x = combined;
upper2 = upper = 0;
lower2 = *((__global uchar*)(src + LOffset + x * 6 + 5));
lower = *((__global uchar*)(src + LOffset + x * 6 + 4));
@@ -702,15 +697,15 @@ __kernel void CopyBufferToImage3d6To8Bytes(__global uchar *src,
__kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
__write_only image3d_t output,
offset_t srcOffset,
int srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = (uint4)(0, 0, 0, 0);
@@ -743,35 +738,33 @@ __kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
}
__kernel void CopyBufferToImage3d16BytesAligned(__global uint4 *src,
__write_only image3d_t output,
offset_t srcOffset,
int4 dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
__write_only image3d_t output,
int srcOffset,
int4 dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
offset_t LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = src[(LOffset >> 4) + x];
write_imageui(output, dstCoord, c);
}
#include "kernel_types.h"
__kernel void CopyImage3dToBufferBytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*(dst + DstOffset + x) = convert_uchar_sat(c.x);
@@ -780,18 +773,18 @@ __kernel void CopyImage3dToBufferBytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(((ulong)(dst + dstOffset)) & 0x00000001){
if(( ulong )(dst + dstOffset) & 0x00000001){
*((__global uchar*)(dst + DstOffset + x * 2 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 2)) = convert_uchar_sat(c.x & 0xff);
}
@@ -803,37 +796,39 @@ __kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer3Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
}
__kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(((ulong)(dst + dstOffset)) & 0x00000003){
if(( ulong )(dst + dstOffset) & 0x00000003){
*((__global uchar*)(dst + DstOffset + x * 4 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 4 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 4 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
@@ -845,19 +840,19 @@ __kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input,
}
__kernel void CopyImage3dToBuffer4To3Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 3 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 3)) = convert_uchar_sat(c.x & 0xff);
@@ -866,40 +861,40 @@ __kernel void CopyImage3dToBuffer4To3Bytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer6Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
uint4 c = read_imageui(input, srcCoord);
*((__global uchar*)(dst + DstOffset + x * 6 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6)) = convert_uchar_sat(c.x & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 3)) = convert_uchar_sat((c.y >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 2)) = convert_uchar_sat(c.y & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 5)) = convert_uchar_sat((c.z >> 8 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.z & 0xff);
*((__global uchar*)(dst + DstOffset + x * 6 + 4)) = convert_uchar_sat(c.z & 0xff);
}
__kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
if(((ulong)(dst + dstOffset)) & 0x00000007){
if(( ulong )(dst + dstOffset) & 0x00000007){
*((__global uchar*)(dst + DstOffset + x * 8 + 3)) = convert_uchar_sat((c.x >> 24 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 2)) = convert_uchar_sat((c.x >> 16 ) & 0xff);
*((__global uchar*)(dst + DstOffset + x * 8 + 1)) = convert_uchar_sat((c.x >> 8 ) & 0xff);
@@ -916,16 +911,16 @@ __kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input,
}
__kernel void CopyImage3dToBuffer8To6Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
__global uchar *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint4 c = read_imageui(input, srcCoord);
@@ -941,14 +936,14 @@ __kernel void CopyImage3dToBuffer8To6Bytes(__read_only image3d_t input,
__kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
__global uchar *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
const uint4 c = read_imageui(input, srcCoord);
@@ -971,16 +966,16 @@ __kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
}
__kernel void CopyImage3dToBuffer16BytesAligned(__read_only image3d_t input,
__global uint4 *dst,
int4 srcOffset,
offset_t dstOffset,
coord2_t Pitch) {
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
__global uint4 *dst,
int4 srcOffset,
int dstOffset,
uint2 Pitch) {
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint z = get_global_id(2);
const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
offset_t DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
uint DstOffset = dstOffset + (y * Pitch.x) + (z * Pitch.y);
const uint4 c = read_imageui(input, srcCoord);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2025 Intel Corporation
* Copyright (C) 2024-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -11,16 +11,14 @@ __kernel void fullCopy(__global const uint* src, __global uint* dst) {
vstore4(loaded, gid, dst);
}
#include "kernel_types.h"
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
__kernel void CopyBufferToBufferBytes(
const __global uchar* pSrc,
__global uchar* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
offset_t bytesToRead )
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint bytesToRead )
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
@@ -32,24 +30,24 @@ __kernel void CopyBufferToBufferBytes(
__kernel void CopyBufferToBufferLeftLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
uint srcOffsetInBytes,
uint dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void CopyBufferToBufferMiddle(
const __global uint* pSrc,
__global uint* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
uint srcOffsetInBytes,
uint dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
uint4 loaded = vload4(gid, pSrc);
@@ -59,17 +57,17 @@ __kernel void CopyBufferToBufferMiddle(
__kernel void CopyBufferToBufferMiddleMisaligned(
__global const uint* pSrc,
__global uint* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes,
uint srcOffsetInBytes,
uint dstOffsetInBytes,
uint misalignmentInBits)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
const size_t gid = get_global_id(0);
pDst += dstOffsetInBytes >> 2;
pSrc += srcOffsetInBytes >> 2;
const uint4 src0 = vload4(gid, pSrc);
const uint4 src1 = vload4((gid + 1), pSrc);
const uint4 src1 = vload4(gid + 1, pSrc);
uint4 result;
result.x = (src0.x >> misalignmentInBits) | (src0.y << (32 - misalignmentInBits));
@@ -82,33 +80,32 @@ __kernel void CopyBufferToBufferMiddleMisaligned(
__kernel void CopyBufferToBufferRightLeftover(
const __global uchar* pSrc,
__global uchar* pDst,
offset_t srcOffsetInBytes,
offset_t dstOffsetInBytes)
uint srcOffsetInBytes,
uint dstOffsetInBytes)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ];
}
__kernel void copyBufferToBufferBytesSingle(__global uchar *dst, const __global uchar *src) {
ALIGNED4(dst);
ALIGNED4(src);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
dst[gid] = (uchar)(src[gid]);
}
__kernel void CopyBufferToBufferSideRegion(
__global uchar* pDst,
const __global uchar* pSrc,
idx_t len,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
unsigned int len,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
__global uchar* pDstWithOffset = (__global uchar*)((__global uchar*)pDst + dstSshOffset);
__global uchar* pSrcWithOffset = (__global uchar*)((__global uchar*)pSrc + srcSshOffset);
if (gid < len) {
@@ -119,14 +116,14 @@ __kernel void CopyBufferToBufferSideRegion(
__kernel void CopyBufferToBufferMiddleRegion(
__global uint* pDst,
const __global uint* pSrc,
idx_t elems,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
offset_t srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
unsigned int elems,
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint srcSshOffset // Offset needed in case ptr has been adjusted for SSH alignment
)
{
ALIGNED4(pSrc);
ALIGNED4(pDst);
idx_t gid = get_global_id(0);
unsigned int gid = get_global_id(0);
__global uint* pDstWithOffset = (__global uint*)((__global uchar*)pDst + dstSshOffset);
__global uint* pSrcWithOffset = (__global uint*)((__global uchar*)pSrc + srcSshOffset);
if (gid < elems) {
@@ -135,181 +132,179 @@ __kernel void CopyBufferToBufferMiddleRegion(
}
}
#define ALIGNED4(ptr) __builtin_assume(((size_t)ptr&0b11) == 0)
#include "kernel_types.h"
// assumption is local work size = pattern size
__kernel void FillBufferBytes(
__global uchar* pDst,
offset_t dstOffsetInBytes,
uint dstOffsetInBytes,
const __global uchar* pPattern )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
idx_t gid = get_global_id(0);
idx_t lid = get_local_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[lid];
uint dstIndex = get_global_id(0) + dstOffsetInBytes;
uint srcIndex = get_local_id(0);
pDst[dstIndex] = pPattern[srcIndex];
}
__kernel void FillBufferLeftLeftover(
__global uchar* pDst,
offset_t dstOffsetInBytes,
uint dstOffsetInBytes,
const __global uchar* pPattern,
const offset_t patternSizeInEls )
const uint patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
}
__kernel void FillBufferMiddle(
__global uchar* pDst,
offset_t dstOffsetInBytes,
uint dstOffsetInBytes,
const __global uint* pPattern,
const offset_t patternSizeInEls )
const uint patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
idx_t gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[gid & (patternSizeInEls - 1)];
uint gid = get_global_id(0);
((__global uint*)(pDst + dstOffsetInBytes))[gid] = pPattern[ gid & (patternSizeInEls - 1) ];
}
__kernel void FillBufferRightLeftover(
__global uchar* pDst,
offset_t dstOffsetInBytes,
uint dstOffsetInBytes,
const __global uchar* pPattern,
const offset_t patternSizeInEls )
const uint patternSizeInEls )
{
ALIGNED4(pDst);
ALIGNED4(pPattern);
idx_t gid = get_global_id(0);
idx_t dstIndex = dstOffsetInBytes + gid;
pDst[dstIndex] = pPattern[gid & (patternSizeInEls - 1)];
uint gid = get_global_id(0);
pDst[ gid + dstOffsetInBytes ] = pPattern[ gid & (patternSizeInEls - 1) ];
}
__kernel void FillBufferImmediate(
__global uchar* ptr,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
idx_t gid = get_global_id(0);
uint gid = get_global_id(0);
__global uint4* dstPtr = (__global uint4*)(ptr + dstSshOffset);
dstPtr[gid] = value;
}
__kernel void FillBufferImmediateLeftOver(
__global uchar* ptr,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
ulong dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const uint value)
{
ALIGNED4(ptr);
idx_t gid = get_global_id(0);
uint gid = get_global_id(0);
(ptr + dstSshOffset)[gid] = value;
}
__kernel void FillBufferSSHOffset(
__global uchar* ptr,
offset_t dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
uint dstSshOffset, // Offset needed in case ptr has been adjusted for SSH alignment
const __global uchar* pPattern,
offset_t patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
uint patternSshOffset // Offset needed in case pPattern has been adjusted for SSH alignment
)
{
ALIGNED4(ptr);
ALIGNED4(pPattern);
idx_t dstIndex = get_global_id(0);
idx_t srcIndex = get_local_id(0);
uint dstIndex = get_global_id(0);
uint srcIndex = get_local_id(0);
__global uchar* pDst = (__global uchar*)ptr + dstSshOffset;
__global uchar* pSrc = (__global uchar*)pPattern + patternSshOffset;
pDst[dstIndex] = pSrc[srcIndex];
}
#include "kernel_types.h"
__kernel void CopyBufferRectBytes2d(
__global const char* src,
__global char* dst,
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
{
int x = get_global_id(0);
int y = get_global_id(1);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle2d(
const __global uint* src,
__global uint* dst,
coord2_t SrcOrigin,
coord2_t DstOrigin,
idx_t SrcPitch,
idx_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
uint2 SrcOrigin,
uint2 DstOrigin,
uint SrcPitch,
uint DstPitch )
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
{
int x = get_global_id(0);
int y = get_global_id(1);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
}
__kernel void CopyBufferRectBytes3d(
__global const char* src,
__global char* dst,
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
idx_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
__global const char* src,
__global char* dst,
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
*( dst + LDstOffset ) = *( src + LSrcOffset );
}
__kernel void CopyBufferRectBytesMiddle3d(
const __global uint* src,
__global uint* dst,
coord4_t SrcOrigin,
coord4_t DstOrigin,
coord2_t SrcPitch,
coord2_t DstPitch )
{
idx_t x = get_global_id(0);
idx_t y = get_global_id(1);
idx_t z = get_global_id(2);
uint4 SrcOrigin,
uint4 DstOrigin,
uint2 SrcPitch,
uint2 DstPitch )
idx_t LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
idx_t LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
{
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
uint LSrcOffset = SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y );
uint LDstOffset = DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y );
src += LSrcOffset >> 2;
dst += LDstOffset >> 2;
uint4 loaded = vload4(x, src);
vstore4(loaded, x, dst);
uint4 loaded = vload4(x,src);
vstore4(loaded,x,dst);
}
void SetDstData(__global ulong* dst, uint currentOffset, ulong contextStart, ulong globalStart, ulong contextEnd, ulong globalEnd, uint useOnlyGlobalTimestamps) {
dst[currentOffset] = globalStart;
dst[currentOffset + 1] = globalEnd;

View File

@@ -5,7 +5,7 @@
#
set(SHARED_TEST_PROJECTS_SUB_FOLDER "prepare test files")
function(compile_kernels_gen device revision_id platform_name use_stateless use_heapless)
function(compile_kernels_gen device revision_id platform_name use_stateless_suffix use_heapless)
set(outputdir "${TargetDir}/${platform_name}/${revision_id}/test_files/${NEO_ARCH}/")
set(compiled_kernels)
@@ -19,8 +19,8 @@ function(compile_kernels_gen device revision_id platform_name use_stateless use_
set(outputname_base "${basename}_${platform_name}")
if(${use_heapless})
set(outputname_base "${outputname_base}-heapless_")
elseif(${use_stateless})
set(outputname_base "${outputname_base}-cl-intel-greater-than-4GB-buffer-required_-DWIDE_STATELESS=1_")
elseif(${use_stateless_suffix})
set(outputname_base "${outputname_base}-cl-intel-greater-than-4GB-buffer-required_")
endif()
set(outputpath_base "${outputdir}${outputname_base}")
@@ -38,14 +38,9 @@ function(compile_kernels_gen device revision_id platform_name use_stateless use_
set(internal_options "${HEAPLESS_INTERNAL_OPTIONS}")
endif()
if(${use_stateless} OR ${use_heapless})
list(APPEND __ocloc__options__ "-DWIDE_STATELESS=1")
endif()
list(APPEND __ocloc__options__ "-I${NEO_SOURCE_DIR}/shared/source/built_ins/kernels")
add_custom_command(
OUTPUT ${output_files}
COMMAND ${ocloc_cmd_prefix} -file ${absolute_filepath} -device ${device} -heapless_mode ${heapless_mode} -internal_options ${internal_options} -${NEO_BITS} -out_dir ${outputdir} -output_no_suffix -output ${outputname_base} -revision_id ${revision_id} -options "$<JOIN:${__ocloc__options__}, >"
COMMAND ${ocloc_cmd_prefix} -file ${absolute_filepath} -device ${device} -heapless_mode ${heapless_mode} -internal_options ${internal_options} -${NEO_BITS} -out_dir ${outputdir} -output_no_suffix -output ${outputname_base} -revision_id ${revision_id}
WORKING_DIRECTORY ${workdir}
DEPENDS ${filepath} ocloc copy_compiler_files
)
@@ -84,7 +79,6 @@ add_dependencies(prepare_test_kernels_for_shared copy_compiler_files)
macro(macro_for_each_platform)
set(KERNELS_TO_COMPILE ${TEST_KERNELS})
set(IMAGE_SUPPORT FALSE)
CORE_CONTAINS_PLATFORM("SUPPORTED_IMAGES" ${CORE_TYPE} ${PLATFORM_IT} IMAGE_SUPPORT)
if(NOT ${IMAGE_SUPPORT})

View File

@@ -84,7 +84,7 @@ HWTEST2_F(BuiltInSharedTest, GivenStatelessBuiltinWhenGettingResourceNameThenAdd
auto resourceNames = getBuiltinResourceNames(EBuiltInOps::copyBufferToBufferStateless, BuiltinCode::ECodeType::binary, *pDevice);
std::string expectedResourceNameGeneric = "stateless_copy_buffer_to_buffer.builtin_kernel.bin";
std::string expectedResourceNameGeneric = "stateless_copy_buffer_to_buffer_stateless.builtin_kernel.bin";
std::string expectedResourceNameForRelease = deviceIpString + "_" + expectedResourceNameGeneric;
EXPECT_EQ(1u, resourceNames.size());
@@ -103,7 +103,7 @@ HWTEST2_F(BuiltInSharedTest, GivenPlatformWithoutStatefulAddresingSupportWhenGet
{
auto resourceNames = getBuiltinResourceNames(EBuiltInOps::copyBufferToBufferStateless, BuiltinCode::ECodeType::binary, *pDevice);
std::string expectedResourceName = deviceIpString + "_stateless_copy_buffer_to_buffer.builtin_kernel.bin";
std::string expectedResourceName = deviceIpString + "_stateless_copy_buffer_to_buffer_stateless.builtin_kernel.bin";
EXPECT_EQ(1u, resourceNames.size());
EXPECT_EQ(resourceNames[0], expectedResourceName);
}
@@ -130,20 +130,20 @@ TEST_F(BuiltInSharedTest, GivenValidBuiltinTypeAndExtensionWhenCreatingBuiltinRe
const std::pair<EBuiltInOps::Type, const char *> testCases[] = {
{EBuiltInOps::auxTranslation, "aux_translation.builtin_kernel"},
{EBuiltInOps::copyBufferToBuffer, "copy_buffer_to_buffer.builtin_kernel"},
{EBuiltInOps::copyBufferToBufferStateless, "copy_buffer_to_buffer.builtin_kernel"},
{EBuiltInOps::copyBufferToBufferStatelessHeapless, "copy_buffer_to_buffer.builtin_kernel"},
{EBuiltInOps::copyBufferToBufferStateless, "copy_buffer_to_buffer_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferToBufferStatelessHeapless, "copy_buffer_to_buffer_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferRect, "copy_buffer_rect.builtin_kernel"},
{EBuiltInOps::copyBufferRectStateless, "copy_buffer_rect.builtin_kernel"},
{EBuiltInOps::copyBufferRectStatelessHeapless, "copy_buffer_rect.builtin_kernel"},
{EBuiltInOps::copyBufferRectStateless, "copy_buffer_rect_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferRectStatelessHeapless, "copy_buffer_rect_stateless.builtin_kernel"},
{EBuiltInOps::fillBuffer, "fill_buffer.builtin_kernel"},
{EBuiltInOps::fillBufferStateless, "fill_buffer.builtin_kernel"},
{EBuiltInOps::fillBufferStatelessHeapless, "fill_buffer.builtin_kernel"},
{EBuiltInOps::fillBufferStateless, "fill_buffer_stateless.builtin_kernel"},
{EBuiltInOps::fillBufferStatelessHeapless, "fill_buffer_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3d, "copy_buffer_to_image3d.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3dStateless, "copy_buffer_to_image3d.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3dHeapless, "copy_buffer_to_image3d.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3dStateless, "copy_buffer_to_image3d_stateless.builtin_kernel"},
{EBuiltInOps::copyBufferToImage3dHeapless, "copy_buffer_to_image3d_stateless.builtin_kernel"},
{EBuiltInOps::copyImage3dToBuffer, "copy_image3d_to_buffer.builtin_kernel"},
{EBuiltInOps::copyImage3dToBufferStateless, "copy_image3d_to_buffer.builtin_kernel"},
{EBuiltInOps::copyImage3dToBufferHeapless, "copy_image3d_to_buffer.builtin_kernel"},
{EBuiltInOps::copyImage3dToBufferStateless, "copy_image3d_to_buffer_stateless.builtin_kernel"},
{EBuiltInOps::copyImage3dToBufferHeapless, "copy_image3d_to_buffer_stateless.builtin_kernel"},
{EBuiltInOps::copyImageToImage1d, "copy_image_to_image1d.builtin_kernel"},
{EBuiltInOps::copyImageToImage1dHeapless, "copy_image_to_image1d.builtin_kernel"},
{EBuiltInOps::copyImageToImage2d, "copy_image_to_image2d.builtin_kernel"},
@@ -195,9 +195,9 @@ TEST_F(BuiltInSharedTest, GivenHeaplessModeEnabledWhenGetBuiltinResourceNamesIsC
};
TestParam params[] = {
{"copy_buffer_to_buffer", EBuiltInOps::copyBufferToBufferStatelessHeapless},
{"copy_buffer_rect", EBuiltInOps::copyBufferRectStatelessHeapless},
{"fill_buffer", EBuiltInOps::fillBufferStatelessHeapless}};
{"copy_buffer_to_buffer_stateless", EBuiltInOps::copyBufferToBufferStatelessHeapless},
{"copy_buffer_rect_stateless", EBuiltInOps::copyBufferRectStatelessHeapless},
{"fill_buffer_stateless", EBuiltInOps::fillBufferStatelessHeapless}};
for (auto &[builtInTypeAsString, builtInType] : params) {