From 9066192dc3d40999d62d0c8b46fd9c503d7a05dc Mon Sep 17 00:00:00 2001 From: Lindong Wu Date: Fri, 11 Oct 2019 12:54:10 +0800 Subject: [PATCH] add dispatch hints for more flixable control on workload dispatch Change-Id: Iecfe3031172fd108a1ef0d77d2fff8ad3cef22b3 --- runtime/command_queue/command_queue.h | 3 ++ runtime/command_queue/enqueue_common.h | 4 +++ runtime/command_stream/CMakeLists.txt | 1 + .../command_stream_receiver_hw.h | 3 ++ .../command_stream_receiver_hw_base.inl | 3 ++ .../command_stream_receiver_hw_bdw_plus.inl | 5 +++- runtime/command_stream/csr_definitions.h | 1 + .../command_stream_receiver_hw_ext.inl | 28 +++++++++++++++++++ runtime/mem_obj/CMakeLists.txt | 2 ++ runtime/mem_obj/buffer.h | 1 + runtime/mem_obj/buffer_base.inl | 2 ++ runtime/mem_obj/definitions/buffer_ext.inl | 14 ++++++++++ runtime/mem_obj/definitions/image_ext.inl | 14 ++++++++++ runtime/mem_obj/image.h | 1 + runtime/mem_obj/image.inl | 2 ++ .../command_queue/enqueue_kernel_1_tests.cpp | 13 +++++++++ 16 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 runtime/command_stream/definitions/command_stream_receiver_hw_ext.inl create mode 100644 runtime/mem_obj/definitions/buffer_ext.inl create mode 100644 runtime/mem_obj/definitions/image_ext.inl diff --git a/runtime/command_queue/command_queue.h b/runtime/command_queue/command_queue.h index 52dea10e5c..fbaf33f6e8 100644 --- a/runtime/command_queue/command_queue.h +++ b/runtime/command_queue/command_queue.h @@ -423,6 +423,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> { uint64_t getSliceCount() const { return sliceCount; } + // extend dispatch hints + uint64_t dispatchHints = 0; + protected: void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest); diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 44682d726b..8fa42ebba9 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -709,6 +709,10 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( dispatchFlags.l3CacheSettings = L3CachingSettings::l3AndL1On; } + if (this->dispatchHints != 0) { + dispatchFlags.epilogueRequired = true; + } + if (gtpinIsGTPinInitialized()) { gtpinNotifyPreFlushTask(this); } diff --git a/runtime/command_stream/CMakeLists.txt b/runtime/command_stream/CMakeLists.txt index 627b65b37a..34245f6351 100644 --- a/runtime/command_stream/CMakeLists.txt +++ b/runtime/command_stream/CMakeLists.txt @@ -48,6 +48,7 @@ set(RUNTIME_SRCS_COMMAND_STREAM ${CMAKE_CURRENT_SOURCE_DIR}/tbx_command_stream_receiver_hw.inl ${CMAKE_CURRENT_SOURCE_DIR}/tbx_stream.cpp ${CMAKE_CURRENT_SOURCE_DIR}/thread_arbitration_policy.h + ${CMAKE_CURRENT_SOURCE_DIR}/definitions${BRANCH_DIR_SUFFIX}/command_stream_receiver_hw_ext.inl ) get_property(NEO_CORE_COMMAND_STREAM GLOBAL PROPERTY NEO_CORE_COMMAND_STREAM) diff --git a/runtime/command_stream/command_stream_receiver_hw.h b/runtime/command_stream/command_stream_receiver_hw.h index 9c3c752732..89c8734356 100644 --- a/runtime/command_stream/command_stream_receiver_hw.h +++ b/runtime/command_stream/command_stream_receiver_hw.h @@ -53,6 +53,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t getCmdSizeForPipelineSelect() const; size_t getCmdSizeForComputeMode(); size_t getCmdSizeForMediaSampler(bool mediaSamplerRequired) const; + size_t getCmdSizeForEngineMode(const DispatchFlags &dispatchFlags) const; void programComputeMode(LinearStream &csr, DispatchFlags &dispatchFlags); void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) override; @@ -88,6 +89,8 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads); void programStallingPipeControlForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags); virtual void initPageTableManagerRegisters(LinearStream &csr){}; + void programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags); + void programEngineModeEpliogue(LinearStream &csr, const DispatchFlags &dispatchFlags); void addClearSLMWorkAround(typename GfxFamily::PIPE_CONTROL *pCmd); PIPE_CONTROL *addPipeControlCmd(LinearStream &commandStream); diff --git a/runtime/command_stream/command_stream_receiver_hw_base.inl b/runtime/command_stream/command_stream_receiver_hw_base.inl index bb396f69c5..36ab0f55bb 100644 --- a/runtime/command_stream/command_stream_receiver_hw_base.inl +++ b/runtime/command_stream/command_stream_receiver_hw_base.inl @@ -31,6 +31,8 @@ #include "runtime/os_interface/os_context.h" #include "runtime/utilities/tag_allocator.h" +#include "command_stream_receiver_hw_ext.inl" + namespace NEO { template @@ -258,6 +260,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( programStallingPipeControlForBarrier(commandStreamCSR, dispatchFlags); } + programEngineModeCommands(commandStreamCSR, dispatchFlags); initPageTableManagerRegisters(commandStreamCSR); programComputeMode(commandStreamCSR, dispatchFlags); programL3(commandStreamCSR, dispatchFlags, newL3Config); diff --git a/runtime/command_stream/command_stream_receiver_hw_bdw_plus.inl b/runtime/command_stream/command_stream_receiver_hw_bdw_plus.inl index 7ed2ba39ec..27b24a1512 100644 --- a/runtime/command_stream/command_stream_receiver_hw_bdw_plus.inl +++ b/runtime/command_stream/command_stream_receiver_hw_bdw_plus.inl @@ -57,12 +57,15 @@ template void CommandStreamReceiverHw::createScratchSpaceController() { scratchSpaceController = std::make_unique(executionEnvironment, *internalAllocationStorage.get()); } + template void CommandStreamReceiverHw::programEpliogueCommands(LinearStream &csr, const DispatchFlags &dispatchFlags) { + this->programEngineModeEpliogue(csr, dispatchFlags); } + template size_t CommandStreamReceiverHw::getCmdSizeForEpilogueCommands(const DispatchFlags &dispatchFlags) const { - return 0u; + return this->getCmdSizeForEngineMode(dispatchFlags); } template diff --git a/runtime/command_stream/csr_definitions.h b/runtime/command_stream/csr_definitions.h index 4723e86292..3650bb1c28 100644 --- a/runtime/command_stream/csr_definitions.h +++ b/runtime/command_stream/csr_definitions.h @@ -78,6 +78,7 @@ struct DispatchFlags { uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber; uint32_t l3CacheSettings = L3CachingSettings::l3CacheOn; uint64_t sliceCount = QueueSliceCount::defaultSliceCount; + uint64_t engineHints = 0; bool blocking = false; bool dcFlush = false; bool useSLM = false; diff --git a/runtime/command_stream/definitions/command_stream_receiver_hw_ext.inl b/runtime/command_stream/definitions/command_stream_receiver_hw_ext.inl new file mode 100644 index 0000000000..f32cc75cd2 --- /dev/null +++ b/runtime/command_stream/definitions/command_stream_receiver_hw_ext.inl @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2019 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +#include "core/command_stream/linear_stream.h" +#include "runtime/command_stream/command_stream_receiver_hw.h" +#include "runtime/command_stream/csr_definitions.h" + +namespace NEO { + +template +void CommandStreamReceiverHw::programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags) { +} + +template +void CommandStreamReceiverHw::programEngineModeEpliogue(LinearStream &csr, const DispatchFlags &dispatchFlags) { +} + +template +size_t CommandStreamReceiverHw::getCmdSizeForEngineMode(const DispatchFlags &dispatchFlags) const { + return 0u; +} + +} // namespace NEO diff --git a/runtime/mem_obj/CMakeLists.txt b/runtime/mem_obj/CMakeLists.txt index 388a869805..25a807362a 100644 --- a/runtime/mem_obj/CMakeLists.txt +++ b/runtime/mem_obj/CMakeLists.txt @@ -27,6 +27,8 @@ set(RUNTIME_SRCS_MEM_OBJ ${CMAKE_CURRENT_SOURCE_DIR}/mem_obj_helper_common.inl ${CMAKE_CURRENT_SOURCE_DIR}/pipe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/pipe.h + ${CMAKE_CURRENT_SOURCE_DIR}/definitions${BRANCH_DIR_SUFFIX}/buffer_ext.inl + ${CMAKE_CURRENT_SOURCE_DIR}/definitions${BRANCH_DIR_SUFFIX}/image_ext.inl ) target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_MEM_OBJ}) diff --git a/runtime/mem_obj/buffer.h b/runtime/mem_obj/buffer.h index a78b656843..791779bc5a 100644 --- a/runtime/mem_obj/buffer.h +++ b/runtime/mem_obj/buffer.h @@ -182,6 +182,7 @@ class BufferHw : public Buffer { void setArgStateful(void *memory, bool forceNonAuxMode, bool disableL3, bool alignSizeForAuxTranslation, bool isReadOnlyArgument) override; void appendBufferState(void *memory, Context *context, GraphicsAllocation *gfxAllocation, bool isReadOnlyArgument); + void appendSurfaceStateExt(void *memory); static Buffer *create(Context *context, MemoryProperties properties, diff --git a/runtime/mem_obj/buffer_base.inl b/runtime/mem_obj/buffer_base.inl index 26c05886ec..709c2b74fa 100644 --- a/runtime/mem_obj/buffer_base.inl +++ b/runtime/mem_obj/buffer_base.inl @@ -14,6 +14,7 @@ #include "runtime/helpers/surface_formats.h" #include "runtime/mem_obj/buffer.h" +#include "buffer_ext.inl" #include "hw_cmds.h" namespace NEO { @@ -79,6 +80,7 @@ void BufferHw::setArgStateful(void *memory, bool forceNonAuxMode, boo } appendBufferState(memory, context, getGraphicsAllocation(), isReadOnlyArgument); + appendSurfaceStateExt(memory); } } // namespace NEO diff --git a/runtime/mem_obj/definitions/buffer_ext.inl b/runtime/mem_obj/definitions/buffer_ext.inl new file mode 100644 index 0000000000..dfda78f8b4 --- /dev/null +++ b/runtime/mem_obj/definitions/buffer_ext.inl @@ -0,0 +1,14 @@ +/* + * Copyright (C) 2019 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +namespace NEO { + +template +void BufferHw::appendSurfaceStateExt(void *memory) { +} + +} // namespace NEO diff --git a/runtime/mem_obj/definitions/image_ext.inl b/runtime/mem_obj/definitions/image_ext.inl new file mode 100644 index 0000000000..863d221fa2 --- /dev/null +++ b/runtime/mem_obj/definitions/image_ext.inl @@ -0,0 +1,14 @@ +/* + * Copyright (C) 2019 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +namespace NEO { + +template +void ImageHw::appendSurfaceStateExt(void *memory) { +} + +} // namespace NEO diff --git a/runtime/mem_obj/image.h b/runtime/mem_obj/image.h index 7fcff44fc3..b91bf98612 100644 --- a/runtime/mem_obj/image.h +++ b/runtime/mem_obj/image.h @@ -290,6 +290,7 @@ class ImageHw : public Image { void setMediaSurfaceRotation(void *memory) override; void setSurfaceMemoryObjectControlStateIndexToMocsTable(void *memory, uint32_t value) override; void appendSurfaceStateParams(RENDER_SURFACE_STATE *surfaceState); + void appendSurfaceStateExt(void *memory); void setFlagsForMediaCompression(RENDER_SURFACE_STATE *surfaceState, Gmm *gmm); void transformImage2dArrayTo3d(void *memory) override; void transformImage3dTo2dArray(void *memory) override; diff --git a/runtime/mem_obj/image.inl b/runtime/mem_obj/image.inl index 62112fa7d9..f6f4f5b53d 100644 --- a/runtime/mem_obj/image.inl +++ b/runtime/mem_obj/image.inl @@ -14,6 +14,7 @@ #include "runtime/mem_obj/image.h" #include "hw_cmds.h" +#include "image_ext.inl" namespace NEO { @@ -155,6 +156,7 @@ void ImageHw::setImageArg(void *memory, bool setAsMediaBlockImage, ui setAuxParamsForCCS(surfaceState, gmm); } appendSurfaceStateParams(surfaceState); + appendSurfaceStateExt(surfaceState); } template diff --git a/unit_tests/command_queue/enqueue_kernel_1_tests.cpp b/unit_tests/command_queue/enqueue_kernel_1_tests.cpp index 9892c29b9c..85ecdb6617 100644 --- a/unit_tests/command_queue/enqueue_kernel_1_tests.cpp +++ b/unit_tests/command_queue/enqueue_kernel_1_tests.cpp @@ -1091,3 +1091,16 @@ HWTEST_F(EnqueueKernelTest, givenNonVMEKernelWhenEnqueueKernelThenDispatchFlagsD clEnqueueNDRangeKernel(this->pCmdQ, mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); EXPECT_FALSE(mockCsr->passedDispatchFlags.pipelineSelectArgs.mediaSamplerRequired); } + +HWTEST_F(EnqueueKernelTest, whenEnqueueKernelWithEngineHintsThenEpilogRequiredIsSet) { + auto &csr = pDevice->getUltCommandStreamReceiver(); + size_t off[3] = {0, 0, 0}; + size_t gws[3] = {1, 1, 1}; + + MockKernelWithInternals mockKernel(*pDevice); + pCmdQ->dispatchHints = 1; + + pCmdQ->enqueueKernel(mockKernel.mockKernel, 1, off, gws, nullptr, 0, nullptr, nullptr); + + EXPECT_EQ(csr.recordedDispatchFlags.epilogueRequired, true); +}