diff --git a/runtime/built_ins/CMakeLists.txt b/runtime/built_ins/CMakeLists.txt index 85f6d1755f..d260228e42 100644 --- a/runtime/built_ins/CMakeLists.txt +++ b/runtime/built_ins/CMakeLists.txt @@ -26,6 +26,7 @@ set(RUNTIME_SRCS_BUILT_IN_KERNELS ${CMAKE_CURRENT_SOURCE_DIR}/kernels/aux_translation.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_rect.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_buffer.igdrcl_built_in + ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_buffer_stateless.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_image3d.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_image3d_to_buffer.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_image_to_image1d.igdrcl_built_in @@ -35,9 +36,9 @@ set(RUNTIME_SRCS_BUILT_IN_KERNELS ${CMAKE_CURRENT_SOURCE_DIR}/kernels/fill_image1d.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/fill_image2d.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/fill_image3d.igdrcl_built_in - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_motion_estimate_intel.igdrcl_built_in - ${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.igdrcl_built_in + ${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in + ${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_motion_estimate_intel.igdrcl_built_in ) target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_BUILT_IN_KERNELS}) diff --git a/runtime/built_ins/built_in_ops_base.h b/runtime/built_ins/built_in_ops_base.h index 9340242eec..e2f6055b42 100644 --- a/runtime/built_ins/built_in_ops_base.h +++ b/runtime/built_ins/built_in_ops_base.h @@ -14,20 +14,22 @@ using Type = uint32_t; constexpr Type AuxTranslation{0}; constexpr Type CopyBufferToBuffer{1}; -constexpr Type CopyBufferRect{2}; -constexpr Type FillBuffer{3}; -constexpr Type CopyBufferToImage3d{4}; -constexpr Type CopyImage3dToBuffer{5}; -constexpr Type CopyImageToImage1d{6}; -constexpr Type CopyImageToImage2d{7}; -constexpr Type CopyImageToImage3d{8}; -constexpr Type FillImage1d{9}; -constexpr Type FillImage2d{10}; -constexpr Type FillImage3d{11}; -constexpr Type VmeBlockMotionEstimateIntel{12}; -constexpr Type VmeBlockAdvancedMotionEstimateCheckIntel{13}; -constexpr Type VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel{14}; -constexpr Type Scheduler{15}; -constexpr uint32_t MaxBaseValue{15}; +constexpr Type CopyBufferToBufferStateless{2}; +constexpr Type CopyBufferRect{3}; +constexpr Type FillBuffer{4}; +constexpr Type CopyBufferToImage3d{5}; +constexpr Type CopyImage3dToBuffer{6}; +constexpr Type CopyImageToImage1d{7}; +constexpr Type CopyImageToImage2d{8}; +constexpr Type CopyImageToImage3d{9}; +constexpr Type FillImage1d{10}; +constexpr Type FillImage2d{11}; +constexpr Type FillImage3d{12}; +constexpr Type VmeBlockMotionEstimateIntel{13}; +constexpr Type VmeBlockAdvancedMotionEstimateCheckIntel{14}; +constexpr Type VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel{15}; +constexpr Type Scheduler{16}; + +constexpr uint32_t MaxBaseValue{16}; } // namespace EBuiltInOps } // namespace NEO diff --git a/runtime/built_ins/built_ins.cpp b/runtime/built_ins/built_ins.cpp index b04ef2db5d..da11849320 100644 --- a/runtime/built_ins/built_ins.cpp +++ b/runtime/built_ins/built_ins.cpp @@ -200,7 +200,7 @@ template <> class BuiltInOp : public BuiltinDispatchInfoBuilder { public: BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device) - : BuiltinDispatchInfoBuilder(kernelsLib), kernLeftLeftover(nullptr), kernMiddle(nullptr), kernRightLeftover(nullptr) { + : BuiltinDispatchInfoBuilder(kernelsLib) { populate(context, device, EBuiltInOps::CopyBufferToBuffer, "", @@ -278,9 +278,26 @@ class BuiltInOp : public BuiltinDispatchInfoBui } protected: - Kernel *kernLeftLeftover; - Kernel *kernMiddle; - Kernel *kernRightLeftover; + Kernel *kernLeftLeftover = nullptr; + Kernel *kernMiddle = nullptr; + Kernel *kernRightLeftover = nullptr; + BuiltInOp(BuiltIns &kernelsLib) + : BuiltinDispatchInfoBuilder(kernelsLib) { + } +}; + +template <> +class BuiltInOp : public BuiltInOp { + public: + BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device) + : BuiltInOp(kernelsLib) { + populate(context, device, + EBuiltInOps::CopyBufferToBufferStateless, + "-cl-intel-greater-than-4GB-buffer-required", + "CopyBufferToBufferLeftLeftover", kernLeftLeftover, + "CopyBufferToBufferMiddle", kernMiddle, + "CopyBufferToBufferRightLeftover", kernRightLeftover); + } }; template <> @@ -763,6 +780,9 @@ BuiltinDispatchInfoBuilder &BuiltIns::getBuiltinDispatchInfoBuilder(EBuiltInOps: case EBuiltInOps::CopyBufferToBuffer: std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique>(*this, context, device); }); break; + case EBuiltInOps::CopyBufferToBufferStateless: + std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique>(*this, context, device); }); + break; case EBuiltInOps::CopyBufferRect: std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique>(*this, context, device); }); break; diff --git a/runtime/built_ins/built_ins_storage.cpp b/runtime/built_ins/built_ins_storage.cpp index 45c2e10fda..91625c2dd5 100644 --- a/runtime/built_ins/built_ins_storage.cpp +++ b/runtime/built_ins/built_ins_storage.cpp @@ -24,6 +24,8 @@ const char *getBuiltinAsString(EBuiltInOps::Type builtin) { return "aux_translation.igdrcl_built_in"; case EBuiltInOps::CopyBufferToBuffer: return "copy_buffer_to_buffer.igdrcl_built_in"; + case EBuiltInOps::CopyBufferToBufferStateless: + return "copy_buffer_to_buffer_stateless.igdrcl_built_in"; case EBuiltInOps::CopyBufferRect: return "copy_buffer_rect.igdrcl_built_in"; case EBuiltInOps::FillBuffer: diff --git a/runtime/built_ins/builtins_binary.cmake b/runtime/built_ins/builtins_binary.cmake index 6745c5ecf8..1d20b56275 100644 --- a/runtime/built_ins/builtins_binary.cmake +++ b/runtime/built_ins/builtins_binary.cmake @@ -24,6 +24,10 @@ set(GENERATED_BUILTINS "fill_image3d" ) +set(GENERATED_BUILTINS_STATELESS + "copy_buffer_to_buffer_stateless" +) + # Generate builtins cpps if(COMPILE_BUILT_INS) add_subdirectory(kernels) @@ -35,6 +39,9 @@ macro(macro_for_each_gen) foreach(GENERATED_BUILTIN ${GENERATED_BUILTINS}) list(APPEND GENERATED_BUILTINS_CPPS ${BUILTINS_INCLUDE_DIR}/${RUNTIME_GENERATED_${GENERATED_BUILTIN}_${family_name_with_type}}) endforeach() + foreach(GENERATED_BUILTIN_STATELESS ${GENERATED_BUILTINS_STATELESS}) + list(APPEND GENERATED_BUILTINS_CPPS ${BUILTINS_INCLUDE_DIR}/${RUNTIME_GENERATED_${GENERATED_BUILTIN_STATELESS}_${family_name_with_type}}) + endforeach() endforeach() source_group("generated files\\${GEN_TYPE_LOWER}" FILES ${GENERATED_BUILTINS_CPPS}) endmacro() diff --git a/runtime/built_ins/kernels/CMakeLists.txt b/runtime/built_ins/kernels/CMakeLists.txt index 7ac437f210..8b00e0b5a1 100644 --- a/runtime/built_ins/kernels/CMakeLists.txt +++ b/runtime/built_ins/kernels/CMakeLists.txt @@ -10,6 +10,7 @@ set(BUILTINS_OUTDIR_WITH_ARCH "${TargetDir}/built_ins/${NEO_ARCH}") add_dependencies(${BUILTINS_BINARIES_LIB_NAME} builtins) add_subdirectories() set(GENERATED_BUILTINS ${GENERATED_BUILTINS} PARENT_SCOPE) +set(GENERATED_BUILTINS_STATELESS ${GENERATED_BUILTINS_STATELESS} PARENT_SCOPE) if("${NEO_ARCH}" STREQUAL "x32") set(BUILTIN_OPTIONS "-cl-intel-greater-than-4GB-buffer-required") @@ -17,6 +18,10 @@ else() set(BUILTIN_OPTIONS "") endif() +set(BUILTIN_OPTIONS_STATELESS + "-cl-intel-greater-than-4GB-buffer-required" +) + if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug" ) list(APPEND __cloc__options__ "-D DEBUG") endif() @@ -24,8 +29,17 @@ endif() set(BUILTINS_INCLUDE_DIR ${TargetDir} PARENT_SCOPE) set(BUILTIN_CPP "") +function(get_bits_for_stateless gen_type platform_type) + # Force 32bits compiling on gen9lp for stateless builtins + if( (${GEN_TYPE} STREQUAL "GEN9" ) AND ( ${PLATFORM_TYPE} STREQUAL "LP")) + set(BITS "32" PARENT_SCOPE) + else() + set(BITS ${NEO_BITS} PARENT_SCOPE) + endif() +endfunction() + # Define function for compiling built-ins (with ocloc) -function(compile_builtin gen_type platform_type builtin) +function(compile_builtin gen_type platform_type builtin bits builtin_options) string(TOLOWER ${gen_type} gen_type_lower) get_family_name_with_type(${gen_type} ${platform_type}) set(OUTPUTDIR "${BUILTINS_OUTDIR_WITH_ARCH}/${gen_type_lower}") @@ -58,7 +72,7 @@ function(compile_builtin gen_type platform_type builtin) list(APPEND __cloc__options__ "-cl-kernel-arg-info") add_custom_command( OUTPUT ${OUTPUT_FILES} - COMMAND ${cloc_cmd_prefix} -q -file ${FILENAME} -device ${DEFAULT_SUPPORTED_${gen_type}_${platform_type}_PLATFORM} ${BUILTIN_OPTIONS} -${NEO_BITS} -out_dir ${OUTPUTDIR} -cpp_file -options "$" + COMMAND ${cloc_cmd_prefix} -q -file ${FILENAME} -device ${DEFAULT_SUPPORTED_${gen_type}_${platform_type}_PLATFORM} ${builtin_options} -${bits} -out_dir ${OUTPUTDIR} -cpp_file -options "$" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${builtin} ocloc copy_compiler_files ) @@ -71,11 +85,16 @@ macro(macro_for_each_gen) string(TOLOWER ${PLATFORM_TYPE} PLATFORM_TYPE_LOWER) unset(BUILTINS_COMMANDS) foreach(GENERATED_BUILTIN ${GENERATED_BUILTINS}) - compile_builtin(${GEN_TYPE} ${PLATFORM_TYPE} ${GENERATED_BUILTIN}.igdrcl_built_in) + compile_builtin(${GEN_TYPE} ${PLATFORM_TYPE} ${GENERATED_BUILTIN}.igdrcl_built_in ${NEO_BITS} "${BUILTIN_OPTIONS}") list(APPEND BUILTINS_COMMANDS ${TargetDir}/${BUILTIN_CPP}) set(RUNTIME_GENERATED_${GENERATED_BUILTIN}_${family_name_with_type} ${BUILTIN_CPP} PARENT_SCOPE) endforeach() - + get_bits_for_stateless(${GEN_TYPE} ${PLATFORM_TYPE}) + foreach(GENERATED_BUILTIN_STATELESS ${GENERATED_BUILTINS_STATELESS}) + compile_builtin(${GEN_TYPE} ${PLATFORM_TYPE} ${GENERATED_BUILTIN_STATELESS}.igdrcl_built_in ${BITS} "${BUILTIN_OPTIONS_STATELESS}") + list(APPEND BUILTINS_COMMANDS ${TargetDir}/${BUILTIN_CPP}) + set(RUNTIME_GENERATED_${GENERATED_BUILTIN_STATELESS}_${family_name_with_type} ${BUILTIN_CPP} PARENT_SCOPE) + endforeach() set(target_name builtins_${family_name_with_type}) add_custom_target(${target_name} DEPENDS ${BUILTINS_COMMANDS}) add_dependencies(builtins ${target_name}) diff --git a/runtime/built_ins/kernels/copy_buffer_to_buffer_stateless.igdrcl_built_in b/runtime/built_ins/kernels/copy_buffer_to_buffer_stateless.igdrcl_built_in new file mode 100644 index 0000000000..5332c7dc5e --- /dev/null +++ b/runtime/built_ins/kernels/copy_buffer_to_buffer_stateless.igdrcl_built_in @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2019 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +R"===( +__kernel void CopyBufferToBufferBytes( + const __global uchar* pSrc, + __global uchar* pDst, + ulong srcOffsetInBytes, + ulong dstOffsetInBytes, + ulong bytesToRead ) +{ + pSrc += ( srcOffsetInBytes + get_global_id(0) ); + pDst += ( dstOffsetInBytes + get_global_id(0) ); + pDst[ 0 ] = pSrc[ 0 ]; +} + +__kernel void CopyBufferToBufferLeftLeftover( + const __global uchar* pSrc, + __global uchar* pDst, + ulong srcOffsetInBytes, + ulong dstOffsetInBytes) +{ + size_t gid = get_global_id(0); + pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ]; +} + +__kernel void CopyBufferToBufferMiddle( + const __global uint* pSrc, + __global uint* pDst, + ulong srcOffsetInBytes, + ulong dstOffsetInBytes) +{ + size_t gid = get_global_id(0); + pDst += dstOffsetInBytes >> 2; + pSrc += srcOffsetInBytes >> 2; + uint4 loaded = vload4(gid, pSrc); + vstore4(loaded, gid, pDst); +} + +__kernel void CopyBufferToBufferRightLeftover( + const __global uchar* pSrc, + __global uchar* pDst, + ulong srcOffsetInBytes, + ulong dstOffsetInBytes) +{ + size_t gid = get_global_id(0); + pDst[ gid + dstOffsetInBytes ] = pSrc[ gid + srcOffsetInBytes ]; +} + +)===" \ No newline at end of file diff --git a/runtime/built_ins/registry/register_copy_kernels_source.cpp b/runtime/built_ins/registry/register_copy_kernels_source.cpp index 8d625f4983..a4e8ccc089 100644 --- a/runtime/built_ins/registry/register_copy_kernels_source.cpp +++ b/runtime/built_ins/registry/register_copy_kernels_source.cpp @@ -20,6 +20,15 @@ static RegisterEmbeddedResource registerCopyBufferToBufferSrc( #include "runtime/built_ins/kernels/copy_buffer_to_buffer.igdrcl_built_in" )); +static RegisterEmbeddedResource registerCopyBufferToBufferStatelessSrc( + createBuiltinResourceName( + EBuiltInOps::CopyBufferToBufferStateless, + BuiltinCode::getExtension(BuiltinCode::ECodeType::Source)) + .c_str(), + std::string( +#include "runtime/built_ins/kernels/copy_buffer_to_buffer_stateless.igdrcl_built_in" + )); + static RegisterEmbeddedResource registerCopyBufferRectSrc( createBuiltinResourceName( EBuiltInOps::CopyBufferRect, diff --git a/runtime/command_queue/command_queue_hw.h b/runtime/command_queue/command_queue_hw.h index dfb1a78c5e..ae2a356946 100644 --- a/runtime/command_queue/command_queue_hw.h +++ b/runtime/command_queue/command_queue_hw.h @@ -395,6 +395,8 @@ class CommandQueueHw : public CommandQueue { MOCKABLE_VIRTUAL void dispatchAuxTranslation(MultiDispatchInfo &multiDispatchInfo, MemObjsForAuxTranslation &memObjsForAuxTranslation, AuxTranslationDirection auxTranslationDirection); + MOCKABLE_VIRTUAL bool forceStateless(size_t size); + template LinearStream *obtainCommandStream(const CsrDependencies &csrDependencies, bool blitEnqueue, bool blockedQueue, const MultiDispatchInfo &multiDispatchInfo, const EventsRequest &eventsRequest, diff --git a/runtime/command_queue/command_queue_hw_base.inl b/runtime/command_queue/command_queue_hw_base.inl index d0288da2fc..c5082cae28 100644 --- a/runtime/command_queue/command_queue_hw_base.inl +++ b/runtime/command_queue/command_queue_hw_base.inl @@ -113,4 +113,10 @@ void CommandQueueHw::dispatchAuxTranslation(MultiDispatchInfo &multiDisp auxTranslationBuilder.buildDispatchInfosForAuxTranslation(multiDispatchInfo, dispatchParams); } + +template +bool CommandQueueHw::forceStateless(size_t size) { + return size >= 4ull * MemoryConstants::gigaByte; +} + } // namespace NEO diff --git a/runtime/command_queue/enqueue_copy_buffer.h b/runtime/command_queue/enqueue_copy_buffer.h index 183ebca242..9bdeba526e 100644 --- a/runtime/command_queue/enqueue_copy_buffer.h +++ b/runtime/command_queue/enqueue_copy_buffer.h @@ -32,9 +32,16 @@ cl_int CommandQueueHw::enqueueCopyBuffer( cl_event *event) { MultiDispatchInfo dispatchInfo; + auto eBuiltInOpsType = EBuiltInOps::CopyBufferToBuffer; + + if (forceStateless(size)) { + eBuiltInOpsType = EBuiltInOps::CopyBufferToBufferStateless; + } + + auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(eBuiltInOpsType, + this->getContext(), + this->getDevice()); - auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBuffer, - this->getContext(), this->getDevice()); BuiltInOwnershipWrapper builtInLock(builder, this->context); BuiltinOpParams dc; diff --git a/unit_tests/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp b/unit_tests/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp index 7fa92771e6..4dc04f2f93 100644 --- a/unit_tests/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp +++ b/unit_tests/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp @@ -506,6 +506,7 @@ HWTEST_F(AUBSimpleKernelStatelessTest, givenSimpleKernelWhenStatelessPathIsUsedT } EXPECT_FALSE(this->kernel->getKernelInfo().kernelArgInfo[0].pureStatefulBufferAccess); + EXPECT_TRUE(this->kernel->getKernelInfo().patchInfo.executionEnvironment->CompiledForGreaterThan4GBBuffers); this->pCmdQ->flush(); expectMemory(reinterpret_cast(pBuffer->getGraphicsAllocation()->getGpuAddress()), diff --git a/unit_tests/command_queue/enqueue_copy_buffer_tests.cpp b/unit_tests/command_queue/enqueue_copy_buffer_tests.cpp index f09eca502d..02ab029408 100644 --- a/unit_tests/command_queue/enqueue_copy_buffer_tests.cpp +++ b/unit_tests/command_queue/enqueue_copy_buffer_tests.cpp @@ -8,6 +8,7 @@ #include "core/helpers/ptr_math.h" #include "runtime/built_ins/built_ins.h" #include "runtime/built_ins/builtins_dispatch_builder.h" +#include "runtime/command_queue/command_queue_hw.h" #include "runtime/helpers/dispatch_info.h" #include "runtime/kernel/kernel.h" #include "test.h" @@ -138,7 +139,7 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenIndirectDataGetsAdded) { BuiltinOpParams dc; dc.srcMemObj = srcBuffer; - dc.srcMemObj = dstBuffer; + dc.dstMemObj = dstBuffer; dc.srcOffset = {EnqueueCopyBufferTraits::srcOffset, 0, 0}; dc.dstOffset = {EnqueueCopyBufferTraits::dstOffset, 0, 0}; dc.size = {EnqueueCopyBufferTraits::size, 0, 0}; @@ -154,6 +155,30 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenIndirectDataGetsAdded) { } } +HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferStatelessThenStatelessKernelIsUsed) { + + auto srcBuffer = std::unique_ptr(BufferHelper<>::create()); + auto dstBuffer = std::unique_ptr(BufferHelper<>::create()); + + MultiDispatchInfo multiDispatchInfo; + auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToBufferStateless, + pCmdQ->getContext(), pCmdQ->getDevice()); + + ASSERT_NE(nullptr, &builder); + BuiltinOpParams dc; + dc.srcMemObj = srcBuffer.get(); + dc.dstMemObj = dstBuffer.get(); + dc.srcOffset = {EnqueueCopyBufferTraits::srcOffset, 0, 0}; + dc.dstOffset = {EnqueueCopyBufferTraits::dstOffset, 0, 0}; + dc.size = {EnqueueCopyBufferTraits::size, 0, 0}; + builder.buildDispatchInfos(multiDispatchInfo, dc); + EXPECT_NE(0u, multiDispatchInfo.size()); + + auto kernel = multiDispatchInfo.begin()->getKernel(); + EXPECT_TRUE(kernel->getKernelInfo().patchInfo.executionEnvironment->CompiledForGreaterThan4GBBuffers); + EXPECT_FALSE(kernel->getKernelInfo().kernelArgInfo[0].pureStatefulBufferAccess); +} + HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenL3ProgrammingIsCorrect) { enqueueCopyBufferAndParse(); validateL3Programming(cmdList, itorWalker); @@ -273,3 +298,83 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenArgumentOneMatchesDestinati EXPECT_EQ((void *)((uintptr_t)dstBuffer->getGraphicsAllocation()->getGpuAddress()), *pArgument); } + +struct EnqueueCopyBufferHw : public ::testing::Test { + template + struct MyCmdQStateless : public CommandQueueHw { + using CommandQueueHw::commandStream; + MyCmdQStateless(Context *context, Device *device) : CommandQueueHw(context, device, nullptr){}; + + bool forceStateless(size_t size) override { + return true; + } + }; + + template + struct MyCmdQStatefull : public CommandQueueHw { + using CommandQueueHw::commandStream; + MyCmdQStatefull(Context *context, Device *device) : CommandQueueHw(context, device, nullptr){}; + + bool forceStateless(size_t size) override { + return false; + } + }; + + void SetUp() override { + device.reset(MockDevice::createWithNewExecutionEnvironment(*platformDevices)); + context.reset(new MockContext(device.get())); + + srcBuffer = std::unique_ptr(BufferHelper<>::create(context.get())); + dstBuffer = std::unique_ptr(BufferHelper<>::create(context.get())); + } + + std::unique_ptr device; + std::unique_ptr context; + std::unique_ptr srcBuffer; + std::unique_ptr dstBuffer; +}; + +using EnqueueCopyBufferStatelessTest = EnqueueCopyBufferHw; + +HWTEST_F(EnqueueCopyBufferStatelessTest, givenBuffersWhenCopyingBufferStatelessThenSuccessIsReturned) { + + if (is32bit) { + GTEST_SKIP(); + } + auto cmdQ = std::make_unique>(context.get(), device.get()); + + auto retVal = cmdQ->enqueueCopyBuffer( + srcBuffer.get(), + dstBuffer.get(), + 0, + 0, + sizeof(float), + 0, + nullptr, + nullptr); + + EXPECT_EQ(CL_SUCCESS, retVal); +} + +using EnqueueCopyBufferStatefullTest = EnqueueCopyBufferHw; + +HWTEST_F(EnqueueCopyBufferStatefullTest, givenBuffersWhenCopyingBufferStatefullThenSuccessIsReturned) { + + if (is32bit) { + GTEST_SKIP(); + } + + auto cmdQ = std::make_unique>(context.get(), device.get()); + + auto retVal = cmdQ->enqueueCopyBuffer( + srcBuffer.get(), + dstBuffer.get(), + 0, + 0, + sizeof(float), + 0, + nullptr, + nullptr); + + EXPECT_EQ(CL_SUCCESS, retVal); +}