diff --git a/runtime/built_ins/CMakeLists.txt b/runtime/built_ins/CMakeLists.txt index d260228e42..8a2904ec7e 100644 --- a/runtime/built_ins/CMakeLists.txt +++ b/runtime/built_ins/CMakeLists.txt @@ -25,6 +25,7 @@ set_property(GLOBAL PROPERTY RUNTIME_SRCS_BUILT_INS ${RUNTIME_SRCS_BUILT_INS}) set(RUNTIME_SRCS_BUILT_IN_KERNELS ${CMAKE_CURRENT_SOURCE_DIR}/kernels/aux_translation.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_rect.igdrcl_built_in + ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_rect_stateless.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_buffer.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_buffer_stateless.igdrcl_built_in ${CMAKE_CURRENT_SOURCE_DIR}/kernels/copy_buffer_to_image3d.igdrcl_built_in diff --git a/runtime/built_ins/built_in_ops_base.h b/runtime/built_ins/built_in_ops_base.h index e2f6055b42..eb06840518 100644 --- a/runtime/built_ins/built_in_ops_base.h +++ b/runtime/built_ins/built_in_ops_base.h @@ -16,20 +16,21 @@ constexpr Type AuxTranslation{0}; constexpr Type CopyBufferToBuffer{1}; constexpr Type CopyBufferToBufferStateless{2}; constexpr Type CopyBufferRect{3}; -constexpr Type FillBuffer{4}; -constexpr Type CopyBufferToImage3d{5}; -constexpr Type CopyImage3dToBuffer{6}; -constexpr Type CopyImageToImage1d{7}; -constexpr Type CopyImageToImage2d{8}; -constexpr Type CopyImageToImage3d{9}; -constexpr Type FillImage1d{10}; -constexpr Type FillImage2d{11}; -constexpr Type FillImage3d{12}; -constexpr Type VmeBlockMotionEstimateIntel{13}; -constexpr Type VmeBlockAdvancedMotionEstimateCheckIntel{14}; -constexpr Type VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel{15}; -constexpr Type Scheduler{16}; +constexpr Type CopyBufferRectStateless{4}; +constexpr Type FillBuffer{5}; +constexpr Type CopyBufferToImage3d{6}; +constexpr Type CopyImage3dToBuffer{7}; +constexpr Type CopyImageToImage1d{8}; +constexpr Type CopyImageToImage2d{9}; +constexpr Type CopyImageToImage3d{10}; +constexpr Type FillImage1d{11}; +constexpr Type FillImage2d{12}; +constexpr Type FillImage3d{13}; +constexpr Type VmeBlockMotionEstimateIntel{14}; +constexpr Type VmeBlockAdvancedMotionEstimateCheckIntel{15}; +constexpr Type VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel{16}; +constexpr Type Scheduler{17}; -constexpr uint32_t MaxBaseValue{16}; +constexpr uint32_t MaxBaseValue{17}; } // namespace EBuiltInOps } // namespace NEO diff --git a/runtime/built_ins/built_ins.cpp b/runtime/built_ins/built_ins.cpp index da3d776222..4cb1f5b0ce 100644 --- a/runtime/built_ins/built_ins.cpp +++ b/runtime/built_ins/built_ins.cpp @@ -396,6 +396,21 @@ class BuiltInOp : public BuiltinDispatchInfoBuilder protected: Kernel *kernelBytes[3]; + BuiltInOp(BuiltIns &kernelsLib) : BuiltinDispatchInfoBuilder(kernelsLib), kernelBytes{nullptr} {}; +}; + +template <> +class BuiltInOp : public BuiltInOp { + public: + BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device) + : BuiltInOp(kernelsLib) { + populate(context, device, + EBuiltInOps::CopyBufferRectStateless, + "-cl-intel-greater-than-4GB-buffer-required", + "CopyBufferRectBytes2d", kernelBytes[0], + "CopyBufferRectBytes2d", kernelBytes[1], + "CopyBufferRectBytes3d", kernelBytes[2]); + } }; template <> @@ -786,6 +801,9 @@ BuiltinDispatchInfoBuilder &BuiltIns::getBuiltinDispatchInfoBuilder(EBuiltInOps: case EBuiltInOps::CopyBufferRect: std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique>(*this, context, device); }); break; + case EBuiltInOps::CopyBufferRectStateless: + std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique>(*this, context, device); }); + break; case EBuiltInOps::FillBuffer: std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique>(*this, context, device); }); break; diff --git a/runtime/built_ins/built_ins_storage.cpp b/runtime/built_ins/built_ins_storage.cpp index 91625c2dd5..664f062611 100644 --- a/runtime/built_ins/built_ins_storage.cpp +++ b/runtime/built_ins/built_ins_storage.cpp @@ -28,6 +28,8 @@ const char *getBuiltinAsString(EBuiltInOps::Type builtin) { return "copy_buffer_to_buffer_stateless.igdrcl_built_in"; case EBuiltInOps::CopyBufferRect: return "copy_buffer_rect.igdrcl_built_in"; + case EBuiltInOps::CopyBufferRectStateless: + return "copy_buffer_rect_stateless.igdrcl_built_in"; case EBuiltInOps::FillBuffer: return "fill_buffer.igdrcl_built_in"; case EBuiltInOps::CopyBufferToImage3d: diff --git a/runtime/built_ins/builtins_binary.cmake b/runtime/built_ins/builtins_binary.cmake index 1d20b56275..8c926a80da 100644 --- a/runtime/built_ins/builtins_binary.cmake +++ b/runtime/built_ins/builtins_binary.cmake @@ -26,6 +26,7 @@ set(GENERATED_BUILTINS set(GENERATED_BUILTINS_STATELESS "copy_buffer_to_buffer_stateless" + "copy_buffer_rect_stateless" ) # Generate builtins cpps diff --git a/runtime/built_ins/kernels/copy_buffer_rect_stateless.igdrcl_built_in b/runtime/built_ins/kernels/copy_buffer_rect_stateless.igdrcl_built_in new file mode 100644 index 0000000000..f7246c713c --- /dev/null +++ b/runtime/built_ins/kernels/copy_buffer_rect_stateless.igdrcl_built_in @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2019 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +R"===( +////////////////////////////////////////////////////////////////////////////// +__kernel void CopyBufferRectBytes2d( + __global const char* src, + __global char* dst, + ulong4 SrcOrigin, + ulong4 DstOrigin, + ulong2 SrcPitch, + ulong2 DstPitch ) + +{ + size_t x = get_global_id(0); + size_t y = get_global_id(1); + + size_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ); + size_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ); + + *( dst + LDstOffset ) = *( src + LSrcOffset ); + +} +////////////////////////////////////////////////////////////////////////////// +__kernel void CopyBufferRectBytes3d( + __global const char* src, + __global char* dst, + ulong4 SrcOrigin, + ulong4 DstOrigin, + ulong2 SrcPitch, + ulong2 DstPitch ) + +{ + size_t x = get_global_id(0); + size_t y = get_global_id(1); + size_t z = get_global_id(2); + + size_t LSrcOffset = x + SrcOrigin.x + ( ( y + SrcOrigin.y ) * SrcPitch.x ) + ( ( z + SrcOrigin.z ) * SrcPitch.y ); + size_t LDstOffset = x + DstOrigin.x + ( ( y + DstOrigin.y ) * DstPitch.x ) + ( ( z + DstOrigin.z ) * DstPitch.y ); + + *( dst + LDstOffset ) = *( src + LSrcOffset ); + +} +)===" diff --git a/runtime/built_ins/registry/register_copy_kernels_source.cpp b/runtime/built_ins/registry/register_copy_kernels_source.cpp index a4e8ccc089..7d8f24e9a7 100644 --- a/runtime/built_ins/registry/register_copy_kernels_source.cpp +++ b/runtime/built_ins/registry/register_copy_kernels_source.cpp @@ -38,6 +38,15 @@ static RegisterEmbeddedResource registerCopyBufferRectSrc( #include "runtime/built_ins/kernels/copy_buffer_rect.igdrcl_built_in" )); +static RegisterEmbeddedResource registerCopyBufferRectStatelessSrc( + createBuiltinResourceName( + EBuiltInOps::CopyBufferRectStateless, + BuiltinCode::getExtension(BuiltinCode::ECodeType::Source)) + .c_str(), + std::string( +#include "runtime/built_ins/kernels/copy_buffer_rect_stateless.igdrcl_built_in" + )); + static RegisterEmbeddedResource registerFillBufferSrc( createBuiltinResourceName( EBuiltInOps::FillBuffer, diff --git a/runtime/command_queue/enqueue_copy_buffer_rect.h b/runtime/command_queue/enqueue_copy_buffer_rect.h index 7e3555ad10..f9f9e2cdab 100644 --- a/runtime/command_queue/enqueue_copy_buffer_rect.h +++ b/runtime/command_queue/enqueue_copy_buffer_rect.h @@ -34,8 +34,15 @@ cl_int CommandQueueHw::enqueueCopyBufferRect( MultiDispatchInfo dispatchInfo; - auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferRect, - this->getContext(), this->getDevice()); + auto eBuiltInOps = EBuiltInOps::CopyBufferRect; + auto size = region[0] * region[1] * region[2]; + if (forceStateless(size)) { + eBuiltInOps = EBuiltInOps::CopyBufferRectStateless; + } + + auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(eBuiltInOps, + this->getContext(), + this->getDevice()); BuiltInOwnershipWrapper builtInLock(builder, this->context); MemObjSurface srcBufferSurf(srcBuffer); diff --git a/unit_tests/command_queue/command_enqueue_fixture.h b/unit_tests/command_queue/command_enqueue_fixture.h index aa38cd3413..c6e94982f0 100644 --- a/unit_tests/command_queue/command_enqueue_fixture.h +++ b/unit_tests/command_queue/command_enqueue_fixture.h @@ -6,6 +6,7 @@ */ #pragma once +#include "runtime/command_queue/command_queue_hw.h" #include "unit_tests/command_queue/command_queue_fixture.h" #include "unit_tests/command_stream/command_stream_fixture.h" #include "unit_tests/fixtures/buffer_fixture.h" @@ -92,4 +93,39 @@ struct NegativeFailAllocationCommandEnqueueBaseFixture : public CommandEnqueueBa MemoryManager *oldMemManager; }; +template +struct CommandQueueStateless : public CommandQueueHw { + CommandQueueStateless(Context *context, Device *device) : CommandQueueHw(context, device, nullptr){}; + + bool forceStateless(size_t size) override { + return true; + } + + void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo) override { + auto kernel = dispatchInfo.begin()->getKernel(); + EXPECT_TRUE(kernel->getKernelInfo().patchInfo.executionEnvironment->CompiledForGreaterThan4GBBuffers); + EXPECT_FALSE(kernel->getKernelInfo().kernelArgInfo[0].pureStatefulBufferAccess); + } +}; + +template +struct CommandQueueStateful : public CommandQueueHw { + CommandQueueStateful(Context *context, Device *device) : CommandQueueHw(context, device, nullptr){}; + + bool forceStateless(size_t size) override { + return false; + } + + void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo) override { + auto kernel = dispatchInfo.begin()->getKernel(); + if (!kernel->getDevice().areSharedSystemAllocationsAllowed()) { + EXPECT_FALSE(kernel->getKernelInfo().patchInfo.executionEnvironment->CompiledForGreaterThan4GBBuffers); + EXPECT_TRUE(kernel->getKernelInfo().kernelArgInfo[0].pureStatefulBufferAccess); + } else { + EXPECT_TRUE(kernel->getKernelInfo().patchInfo.executionEnvironment->CompiledForGreaterThan4GBBuffers); + EXPECT_FALSE(kernel->getKernelInfo().kernelArgInfo[0].pureStatefulBufferAccess); + } + } +}; + } // namespace NEO diff --git a/unit_tests/command_queue/enqueue_copy_buffer_rect_tests.cpp b/unit_tests/command_queue/enqueue_copy_buffer_rect_tests.cpp index 142e4b23d3..84e87b95d4 100644 --- a/unit_tests/command_queue/enqueue_copy_buffer_rect_tests.cpp +++ b/unit_tests/command_queue/enqueue_copy_buffer_rect_tests.cpp @@ -368,3 +368,107 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueCopyBufferRectTest, WhenCopyingBufferRect3DTh enqueueCopyBufferRect3D(); validateMediaVFEState(&pDevice->getHardwareInfo(), cmdMediaVfeState, cmdList, itorMediaVfeState); } + +struct EnqueueCopyBufferRectHw : public ::testing::Test { + void SetUp() override { + device.reset(MockDevice::createWithNewExecutionEnvironment(*platformDevices)); + context.reset(new MockContext(device.get())); + srcBuffer = std::unique_ptr(BufferHelper::create(context.get())); + dstBuffer = std::unique_ptr(BufferHelper::create(context.get())); + } + + std::unique_ptr device; + std::unique_ptr context; + std::unique_ptr srcBuffer; + std::unique_ptr dstBuffer; + const size_t rowPitch = 100; + const size_t slicePitch = 100 * 100; + + std::array srcOrigin = {{0, 0, 0}}; + std::array dstOrigin = {{0, 0, 0}}; + std::array region = {{50, 50, 1}}; + + protected: + template + cl_int enqueueCopyBufferRectHw(CommandQueueHw *cmdQ) { + + auto retVal = CL_SUCCESS; + + retVal = clEnqueueCopyBufferRect( + cmdQ, + srcBuffer.get(), + dstBuffer.get(), + srcOrigin.data(), + dstOrigin.data(), + region.data(), + rowPitch, + slicePitch, + rowPitch, + slicePitch, + 0, + nullptr, + nullptr); + + return retVal; + } +}; + +using EnqueueCopyBufferRectStateless = EnqueueCopyBufferRectHw; + +HWTEST_F(EnqueueCopyBufferRectStateless, GivenValidParametersWhenCopyingBufferRectStatelessThenSuccessIsReturned) { + + if (is32bit) { + GTEST_SKIP(); + } + + std::unique_ptr> cmdQ(new CommandQueueStateless(context.get(), device.get())); + + auto retVal = enqueueCopyBufferRectHw(cmdQ.get()); + + EXPECT_EQ(CL_SUCCESS, retVal); +} + +using EnqueueCopyBufferRectStateful = EnqueueCopyBufferRectHw; + +HWTEST_F(EnqueueCopyBufferRectStateful, GivenValidParametersWhenCopyingBufferRectStatefulThenSuccessIsReturned) { + + std::unique_ptr> cmdQ(new CommandQueueStateful(context.get(), device.get())); + + auto retVal = enqueueCopyBufferRectHw(cmdQ.get()); + + EXPECT_EQ(CL_SUCCESS, retVal); +} + +HWTEST_F(EnqueueCopyBufferRectStateless, WhenCopyingBufferRectStatelessThenStatelessKernelIsUsed) { + + if (is32bit) { + GTEST_SKIP(); + } + + std::unique_ptr> cmdQ(new CommandQueueStateless(context.get(), device.get())); + + // Extract the kernel used + MultiDispatchInfo multiDispatchInfo; + auto &builder = cmdQ->getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferRectStateless, + cmdQ->getContext(), + cmdQ->getDevice()); + ASSERT_NE(nullptr, &builder); + + BuiltinOpParams dc; + dc.srcMemObj = srcBuffer.get(); + dc.dstMemObj = dstBuffer.get(); + dc.srcOffset = srcOrigin.data(); + dc.dstOffset = dstOrigin.data(); + dc.size = region.data(); + dc.srcRowPitch = rowPitch; + dc.srcSlicePitch = slicePitch; + dc.dstRowPitch = rowPitch; + dc.dstSlicePitch = slicePitch; + builder.buildDispatchInfos(multiDispatchInfo, dc); + EXPECT_NE(0u, multiDispatchInfo.size()); + + auto kernel = multiDispatchInfo.begin()->getKernel(); + ASSERT_NE(nullptr, kernel); + EXPECT_TRUE(kernel->getKernelInfo().patchInfo.executionEnvironment->CompiledForGreaterThan4GBBuffers); + EXPECT_FALSE(kernel->getKernelInfo().kernelArgInfo[0].pureStatefulBufferAccess); +} diff --git a/unit_tests/command_queue/enqueue_copy_buffer_tests.cpp b/unit_tests/command_queue/enqueue_copy_buffer_tests.cpp index 02ab029408..fd580b94ec 100644 --- a/unit_tests/command_queue/enqueue_copy_buffer_tests.cpp +++ b/unit_tests/command_queue/enqueue_copy_buffer_tests.cpp @@ -300,25 +300,6 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenArgumentOneMatchesDestinati } struct EnqueueCopyBufferHw : public ::testing::Test { - template - struct MyCmdQStateless : public CommandQueueHw { - using CommandQueueHw::commandStream; - MyCmdQStateless(Context *context, Device *device) : CommandQueueHw(context, device, nullptr){}; - - bool forceStateless(size_t size) override { - return true; - } - }; - - template - struct MyCmdQStatefull : public CommandQueueHw { - using CommandQueueHw::commandStream; - MyCmdQStatefull(Context *context, Device *device) : CommandQueueHw(context, device, nullptr){}; - - bool forceStateless(size_t size) override { - return false; - } - }; void SetUp() override { device.reset(MockDevice::createWithNewExecutionEnvironment(*platformDevices)); @@ -341,8 +322,8 @@ HWTEST_F(EnqueueCopyBufferStatelessTest, givenBuffersWhenCopyingBufferStatelessT if (is32bit) { GTEST_SKIP(); } - auto cmdQ = std::make_unique>(context.get(), device.get()); + auto cmdQ = std::make_unique>(context.get(), device.get()); auto retVal = cmdQ->enqueueCopyBuffer( srcBuffer.get(), dstBuffer.get(), @@ -356,16 +337,15 @@ HWTEST_F(EnqueueCopyBufferStatelessTest, givenBuffersWhenCopyingBufferStatelessT EXPECT_EQ(CL_SUCCESS, retVal); } -using EnqueueCopyBufferStatefullTest = EnqueueCopyBufferHw; +using EnqueueCopyBufferStatefulTest = EnqueueCopyBufferHw; -HWTEST_F(EnqueueCopyBufferStatefullTest, givenBuffersWhenCopyingBufferStatefullThenSuccessIsReturned) { +HWTEST_F(EnqueueCopyBufferStatefulTest, givenBuffersWhenCopyingBufferStatefulThenSuccessIsReturned) { if (is32bit) { GTEST_SKIP(); } - auto cmdQ = std::make_unique>(context.get(), device.get()); - + auto cmdQ = std::make_unique>(context.get(), device.get()); auto retVal = cmdQ->enqueueCopyBuffer( srcBuffer.get(), dstBuffer.get(),