From b6b92ae808a2590af4cccf30709dcbc652652ab5 Mon Sep 17 00:00:00 2001 From: "Zdanowicz, Zbigniew" Date: Fri, 30 Mar 2018 17:57:51 +0200 Subject: [PATCH] Create GpgpuWalkerHelper class Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b --- runtime/command_queue/CMakeLists.txt | 5 +- runtime/command_queue/command_queue_hw.inl | 4 +- .../command_queue/dispatch_walker_helper.h | 56 --- .../command_queue/dispatch_walker_helper.inl | 99 ----- runtime/command_queue/enqueue_common.h | 6 +- runtime/command_queue/enqueue_kernel.h | 6 +- runtime/command_queue/enqueue_marker.h | 4 +- .../enqueue_migrate_mem_objects.h | 4 +- runtime/command_queue/gpgpu_walker.h | 371 ++++++++++++++++++ .../{dispatch_walker.h => gpgpu_walker.inl} | 332 +++++----------- .../command_stream_receiver_hw.inl | 2 +- runtime/command_stream/preemption.inl | 2 +- runtime/device_queue/device_queue_hw.inl | 13 +- runtime/enable_gens.cmake | 1 + runtime/gen8/command_queue.cpp | 43 +- runtime/gen8/gpgpu_walker.cpp | 71 ++++ runtime/gen9/command_queue.cpp | 43 +- runtime/gen9/gpgpu_walker.cpp | 71 ++++ runtime/helpers/dispatch_info_builder.h | 8 +- .../command_queue/dispatch_walker_tests.cpp | 63 +-- .../get_size_required_buffer_tests.cpp | 4 +- .../get_size_required_image_tests.cpp | 2 +- .../command_queue/local_work_size_tests.cpp | 4 +- .../command_queue/work_group_size_tests.cpp | 4 +- .../command_stream_receiver_hw_tests.cpp | 2 +- unit_tests/context/driver_diagnostics_tests.h | 2 +- .../device_queue/device_queue_hw_tests.cpp | 8 +- .../enqueue_execution_model_kernel_tests.cpp | 4 +- .../parent_kernel_dispatch_tests.cpp | 208 +++++----- .../scheduler_dispatch_tests.cpp | 6 +- unit_tests/gen8/scheduler_dispatch_tests.cpp | 4 +- unit_tests/gen9/test_device_queue_hw.cpp | 4 +- unit_tests/libult/mock_gfx_family.cpp | 96 +++++ unit_tests/libult/mock_gfx_family.h | 303 ++++++++++++++ 34 files changed, 1209 insertions(+), 646 deletions(-) delete mode 100644 runtime/command_queue/dispatch_walker_helper.h delete mode 100644 runtime/command_queue/dispatch_walker_helper.inl create mode 100644 runtime/command_queue/gpgpu_walker.h rename runtime/command_queue/{dispatch_walker.h => gpgpu_walker.inl} (70%) create mode 100644 runtime/gen8/gpgpu_walker.cpp create mode 100644 runtime/gen9/gpgpu_walker.cpp diff --git a/runtime/command_queue/CMakeLists.txt b/runtime/command_queue/CMakeLists.txt index bd395ee888..4d1d481d4d 100644 --- a/runtime/command_queue/CMakeLists.txt +++ b/runtime/command_queue/CMakeLists.txt @@ -25,9 +25,6 @@ set(RUNTIME_SRCS_COMMAND_QUEUE ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.h ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.inl ${CMAKE_CURRENT_SOURCE_DIR}/cpu_data_transfer_handler.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker.h - ${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_helper.h - ${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_helper.inl ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_barrier.h ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_common.h ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_copy_buffer.h @@ -49,6 +46,8 @@ set(RUNTIME_SRCS_COMMAND_QUEUE ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_image.h ${CMAKE_CURRENT_SOURCE_DIR}/finish.h ${CMAKE_CURRENT_SOURCE_DIR}/flush.h + ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.h + ${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl diff --git a/runtime/command_queue/command_queue_hw.inl b/runtime/command_queue/command_queue_hw.inl index 0a54a00811..5ded683e3d 100644 --- a/runtime/command_queue/command_queue_hw.inl +++ b/runtime/command_queue/command_queue_hw.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -20,7 +20,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/command_queue/enqueue_barrier.h" #include "runtime/command_queue/enqueue_copy_buffer.h" #include "runtime/command_queue/enqueue_copy_buffer_rect.h" diff --git a/runtime/command_queue/dispatch_walker_helper.h b/runtime/command_queue/dispatch_walker_helper.h deleted file mode 100644 index 3be20da5b9..0000000000 --- a/runtime/command_queue/dispatch_walker_helper.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2017, Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#pragma once - -namespace OCLRT { - -constexpr int NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4; - -constexpr int L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000; -constexpr int L3SQC_REG4 = 0xB118; - -constexpr int GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = 0xFFFFFFFF; -constexpr int GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000; - -constexpr int CS_GPR_R0 = 0x2600; -constexpr int CS_GPR_R1 = 0x2608; - -constexpr int ALU_OPCODE_LOAD = 0x080; -constexpr int ALU_OPCODE_STORE = 0x180; -constexpr int ALU_OPCODE_OR = 0x103; -constexpr int ALU_OPCODE_AND = 0x102; - -constexpr int ALU_REGISTER_R_0 = 0x0; -constexpr int ALU_REGISTER_R_1 = 0x1; -constexpr int ALU_REGISTER_R_SRCA = 0x20; -constexpr int ALU_REGISTER_R_SRCB = 0x21; -constexpr int ALU_REGISTER_R_ACCU = 0x31; - -constexpr unsigned int GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8; - -template -void applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode); - -template -size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel); -} // namespace OCLRT diff --git a/runtime/command_queue/dispatch_walker_helper.inl b/runtime/command_queue/dispatch_walker_helper.inl deleted file mode 100644 index 6f663208a0..0000000000 --- a/runtime/command_queue/dispatch_walker_helper.inl +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2017, Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "runtime/command_queue/dispatch_walker_helper.h" - -namespace OCLRT { - -// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask -template -void addAluReadModifyWriteRegister( - OCLRT::LinearStream *pCommandStream, - uint32_t aluRegister, - uint32_t operation, - uint32_t mask) { - // Load "Register" value into CS_GPR_R0 - typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; - typedef typename GfxFamily::MI_MATH MI_MATH; - typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; - auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG))); - *pCmd = MI_LOAD_REGISTER_REG::sInit(); - pCmd->setSourceRegisterAddress(aluRegister); - pCmd->setDestinationRegisterAddress(CS_GPR_R0); - - // Load "Mask" into CS_GPR_R1 - typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; - auto pCmd2 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM))); - *pCmd2 = MI_LOAD_REGISTER_IMM::sInit(); - pCmd2->setRegisterOffset(CS_GPR_R1); - pCmd2->setDataDword(mask); - - // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands - auto pCmd3 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE))); - reinterpret_cast(pCmd3)->DW0.Value = 0x0; - reinterpret_cast(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; - reinterpret_cast(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; - // 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE - reinterpret_cast(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; - pCmd3++; - MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(pCmd3); - - // Setup first operand of MI_MATH - load CS_GPR_R0 into register A - pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD; - pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA; - pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0; - pAluParam++; - - // Setup second operand of MI_MATH - load CS_GPR_R1 into register B - pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD; - pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB; - pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1; - pAluParam++; - - // Setup third operand of MI_MATH - "Operation" on registers A and B - pAluParam->DW0.BitField.ALUOpcode = operation; - pAluParam->DW0.BitField.Operand1 = 0; - pAluParam->DW0.BitField.Operand2 = 0; - pAluParam++; - - // Setup fourth operand of MI_MATH - store result into CS_GPR_R0 - pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE; - pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0; - pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU; - - // LOAD value of CS_GPR_R0 into "Register" - auto pCmd4 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG))); - *pCmd4 = MI_LOAD_REGISTER_REG::sInit(); - pCmd4->setSourceRegisterAddress(CS_GPR_R0); - pCmd4->setDestinationRegisterAddress(aluRegister); - - // Add PIPE_CONTROL to flush caches - typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL; - auto pCmd5 = reinterpret_cast(pCommandStream->getSpace(sizeof(PIPE_CONTROL))); - *pCmd5 = PIPE_CONTROL::sInit(); - pCmd5->setCommandStreamerStallEnable(true); - pCmd5->setDcFlushEnable(true); - pCmd5->setTextureCacheInvalidationEnable(true); - pCmd5->setPipeControlFlushEnable(true); - pCmd5->setStateCacheInvalidationEnable(true); -} -} // namespace OCLRT diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index c271e08a2b..6529099fac 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -24,7 +24,7 @@ #include "runtime/builtin_kernels_simulation/scheduler_simulation.h" #include "hw_cmds.h" #include "runtime/command_queue/command_queue_hw.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/event/event_builder.h" #include "runtime/gtpin/gtpin_notify.h" @@ -243,7 +243,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } } - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *this, multiDispatchInfo, numEventsInWaitList, @@ -293,7 +293,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation(), devQueueHw->getDebugQueue()); - dispatchScheduler( + GpgpuWalkerHelper::dispatchScheduler( *this, *devQueueHw, preemption, diff --git a/runtime/command_queue/enqueue_kernel.h b/runtime/command_queue/enqueue_kernel.h index 0dc3f0e88e..52c09264f8 100644 --- a/runtime/command_queue/enqueue_kernel.h +++ b/runtime/command_queue/enqueue_kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,7 +24,7 @@ #include "hw_cmds.h" #include "runtime/command_queue/command_queue_hw.h" #include "runtime/command_stream/command_stream_receiver.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/helpers/kernel_commands.h" #include "runtime/helpers/task_information.h" #include "runtime/mem_obj/buffer.h" @@ -69,7 +69,7 @@ struct EnqueueOperation { //user registers size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); } - size += getSizeForWADisableLSQCROPERFforOCL(pKernel); + size += GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(pKernel); return size; } diff --git a/runtime/command_queue/enqueue_marker.h b/runtime/command_queue/enqueue_marker.h index de28fda17c..29eef91778 100644 --- a/runtime/command_queue/enqueue_marker.h +++ b/runtime/command_queue/enqueue_marker.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,7 +24,7 @@ #include "hw_cmds.h" #include "runtime/command_queue/command_queue_hw.h" #include "runtime/command_stream/command_stream_receiver.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/device/device.h" #include "runtime/event/event.h" #include "runtime/memory_manager/surface.h" diff --git a/runtime/command_queue/enqueue_migrate_mem_objects.h b/runtime/command_queue/enqueue_migrate_mem_objects.h index ad3971f1f7..28accd667e 100644 --- a/runtime/command_queue/enqueue_migrate_mem_objects.h +++ b/runtime/command_queue/enqueue_migrate_mem_objects.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,7 +24,7 @@ #include "hw_cmds.h" #include "runtime/command_queue/command_queue_hw.h" #include "runtime/command_stream/command_stream_receiver.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/device/device.h" #include "runtime/event/event.h" #include "runtime/memory_manager/surface.h" diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h new file mode 100644 index 0000000000..37f8e0ceee --- /dev/null +++ b/runtime/command_queue/gpgpu_walker.h @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include "runtime/built_ins/built_ins.h" +#include "runtime/context/context.h" +#include "runtime/command_queue/command_queue.h" +#include "runtime/command_stream/linear_stream.h" +#include "runtime/command_stream/preemption.h" +#include "runtime/device_queue/device_queue_hw.h" +#include "runtime/event/hw_timestamps.h" +#include "runtime/event/perf_counter.h" +#include "runtime/helpers/dispatch_info.h" +#include "runtime/helpers/kernel_commands.h" +#include "runtime/helpers/task_information.h" +#include "runtime/indirect_heap/indirect_heap.h" +#include "runtime/kernel/kernel.h" +#include "runtime/program/kernel_info.h" +#include "runtime/utilities/vec.h" + +namespace OCLRT { + +constexpr int32_t NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4; + +constexpr int32_t L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000; +constexpr int32_t L3SQC_REG4 = 0xB118; + +constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = 0xFFFFFFFF; +constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000; + +constexpr int32_t CS_GPR_R0 = 0x2600; +constexpr int32_t CS_GPR_R1 = 0x2608; + +constexpr int32_t ALU_OPCODE_LOAD = 0x080; +constexpr int32_t ALU_OPCODE_STORE = 0x180; +constexpr int32_t ALU_OPCODE_OR = 0x103; +constexpr int32_t ALU_OPCODE_AND = 0x102; + +constexpr int32_t ALU_REGISTER_R_0 = 0x0; +constexpr int32_t ALU_REGISTER_R_1 = 0x1; +constexpr int32_t ALU_REGISTER_R_SRCA = 0x20; +constexpr int32_t ALU_REGISTER_R_SRCB = 0x21; +constexpr int32_t ALU_REGISTER_R_ACCU = 0x31; + +constexpr uint32_t GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8; + +void computeWorkgroupSize1D( + uint32_t maxWorkGroupSize, + size_t workGroupSize[3], + const size_t workItems[3], + size_t simdSize); + +void computeWorkgroupSizeND( + WorkSizeInfo wsInfo, + size_t workGroupSize[3], + const size_t workItems[3], + const uint32_t workDim); + +void computeWorkgroupSize2D( + uint32_t maxWorkGroupSize, + size_t workGroupSize[3], + const size_t workItems[3], + size_t simdSize); + +void computeWorkgroupSizeSquared( + uint32_t maxWorkGroupSize, + size_t workGroupSize[3], + const size_t workItems[3], + size_t simdSize, + const uint32_t workDim); + +Vec3 computeWorkgroupSize( + const DispatchInfo &dispatchInfo); + +Vec3 generateWorkgroupSize( + const DispatchInfo &dispatchInfo); + +Vec3 computeWorkgroupsNumber( + const Vec3 gws, + const Vec3 lws); + +Vec3 generateWorkgroupsNumber( + const Vec3 gws, + const Vec3 lws); + +Vec3 generateWorkgroupsNumber( + const DispatchInfo &dispatchInfo); + +inline uint32_t calculateDispatchDim(Vec3 dispatchSize, Vec3 dispatchOffset) { + return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim())); +} + +Vec3 canonizeWorkgroup( + Vec3 workgroup); + +void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo); + +inline cl_uint computeDimensions(const size_t workItems[3]) { + return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1; +} + +template +IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) { + size_t alignment = MemoryConstants::pageSize; + size_t size = calc(std::forward(args)...); + return new IndirectHeap(alignedMalloc(size, alignment), size); +} + +template +class GpgpuWalkerHelper { + public: + static void addAluReadModifyWriteRegister( + LinearStream *pCommandStream, + uint32_t aluRegister, + uint32_t operation, + uint32_t mask); + + static void applyWADisableLSQCROPERFforOCL(LinearStream *pCommandStream, + const Kernel &kernel, + bool disablePerfMode); + + static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel); + + static size_t setGpgpuWalkerThreadData( + typename GfxFamily::GPGPU_WALKER *pCmd, + const size_t globalOffsets[3], + const size_t startWorkGroups[3], + const size_t numWorkGroups[3], + const size_t localWorkSizesIn[3], + uint32_t simd); + + static void dispatchProfilingCommandsStart( + HwTimeStamps &hwTimeStamps, + OCLRT::LinearStream *commandStream); + + static void dispatchProfilingCommandsEnd( + HwTimeStamps &hwTimeStamps, + OCLRT::LinearStream *commandStream); + + static void dispatchPerfCountersNoopidRegisterCommands( + CommandQueue &commandQueue, + OCLRT::HwPerfCounter &hwPerfCounter, + OCLRT::LinearStream *commandStream, + bool start); + + static void dispatchPerfCountersReadFreqRegisterCommands( + CommandQueue &commandQueue, + OCLRT::HwPerfCounter &hwPerfCounter, + OCLRT::LinearStream *commandStream, + bool start); + + static void dispatchPerfCountersGeneralPurposeCounterCommands( + CommandQueue &commandQueue, + OCLRT::HwPerfCounter &hwPerfCounter, + OCLRT::LinearStream *commandStream, + bool start); + + static void dispatchPerfCountersUserCounterCommands( + CommandQueue &commandQueue, + OCLRT::HwPerfCounter &hwPerfCounter, + OCLRT::LinearStream *commandStream, + bool start); + + static void dispatchPerfCountersOABufferStateCommands( + CommandQueue &commandQueue, + OCLRT::HwPerfCounter &hwPerfCounter, + OCLRT::LinearStream *commandStream); + + static void dispatchPerfCountersCommandsStart( + CommandQueue &commandQueue, + OCLRT::HwPerfCounter &hwPerfCounter, + OCLRT::LinearStream *commandStream); + + static void dispatchPerfCountersCommandsEnd( + CommandQueue &commandQueue, + OCLRT::HwPerfCounter &hwPerfCounter, + OCLRT::LinearStream *commandStream); + + static void dispatchWalker( + CommandQueue &commandQueue, + const MultiDispatchInfo &multiDispatchInfo, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + KernelOperation **blockedCommandsData, + HwTimeStamps *hwTimeStamps, + OCLRT::HwPerfCounter *hwPerfCounter, + PreemptionMode preemptionMode, + bool blockQueue, + unsigned int commandType = 0); + + static void dispatchWalker( + CommandQueue &commandQueue, + const Kernel &kernel, + cl_uint workDim, + const size_t globalOffsets[3], + const size_t workItems[3], + const size_t *localWorkSizesIn, + cl_uint numEventsInWaitList, + const cl_event *eventWaitList, + KernelOperation **blockedCommandsData, + HwTimeStamps *hwTimeStamps, + HwPerfCounter *hwPerfCounter, + PreemptionMode preemptionMode, + bool blockQueue); + + static void dispatchScheduler( + CommandQueue &commandQueue, + DeviceQueueHw &devQueueHw, + PreemptionMode preemptionMode, + SchedulerKernel &scheduler); +}; + +template +struct EnqueueOperation { + static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class"); + static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class"); + static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class"); + static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) { + size_t size = KernelCommandsHelper::getSizeRequiredCS() + + sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper::isPipeControlWArequired() ? 2 : 1); + if (reserveProfilingCmdsSpace) { + size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + } + if (reservePerfCounters) { + //start cmds + //P_C: flush CS & TimeStamp BEGIN + size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL); + //SRM NOOPID & Frequency + size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //gp registers + size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //report perf count + size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); + //user registers + size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + + //end cmds + //P_C: flush CS & TimeStamp END; + size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL); + //OA buffer (status head, tail) + size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //report perf count + size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); + //gp registers + size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //SRM NOOPID & Frequency + size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //user registers + size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + } + Device &device = commandQueue.getDevice(); + for (auto &dispatchInfo : multiDispatchInfo) { + auto &kernel = *dispatchInfo.getKernel(); + size += sizeof(typename GfxFamily::GPGPU_WALKER); + size += GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(&kernel); + size += PreemptionHelper::getPreemptionWaCsSize(device); + } + return size; + } + + static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) { + size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper::getSizeRequiredCS() + + sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper::isPipeControlWArequired() ? 2 : 1); + size += PreemptionHelper::getPreemptionWaCsSize(commandQueue.getDevice()); + if (reserveProfilingCmdsSpace) { + size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + } + if (reservePerfCounters) { + //start cmds + //P_C: flush CS & TimeStamp BEGIN + size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL); + //SRM NOOPID & Frequency + size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //gp registers + size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //report perf count + size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); + //user registers + size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + + //end cmds + //P_C: flush CS & TimeStamp END; + size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL); + //OA buffer (status head, tail) + size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //report perf count + size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); + //gp registers + size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //SRM NOOPID & Frequency + size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + //user registers + size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + } + size += GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(pKernel); + + return size; + } +}; + +template +LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) { + auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel); + return commandQueue.getCS(expectedSizeCS); +} + +template +LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) { + size_t expectedSizeCS = 0; + Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr; + for (auto &dispatchInfo : multiDispatchInfo) { + expectedSizeCS += EnqueueOperation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel()); + } + if (parentKernel && parentKernel->isParentKernel) { + SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(parentKernel->getContext()); + expectedSizeCS += EnqueueOperation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler); + } + return commandQueue.getCS(expectedSizeCS); +} + +template +IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) { + size_t expectedSize = 0; + IndirectHeap *ih = nullptr; + + // clang-format off + switch (heapType) { + case IndirectHeap::DYNAMIC_STATE: expectedSize = KernelCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); break; + case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); break; + case IndirectHeap::SURFACE_STATE: expectedSize = KernelCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); break; + } + // clang-format on + + if (multiDispatchInfo.begin()->getKernel()->isParentKernel) { + if (heapType == IndirectHeap::SURFACE_STATE) { + expectedSize += KernelCommandsHelper::template getSizeRequiredForExecutionModel(const_cast(*(multiDispatchInfo.begin()->getKernel()))); + } else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT) + { + DeviceQueueHw *pDevQueue = castToObject>(commandQueue.getContext().getDefaultDeviceQueue()); + DEBUG_BREAK_IF(pDevQueue == nullptr); + ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE); + } + } + + if (ih == nullptr) + ih = &commandQueue.getIndirectHeap(heapType, expectedSize); + + return *ih; +} + +} // namespace OCLRT diff --git a/runtime/command_queue/dispatch_walker.h b/runtime/command_queue/gpgpu_walker.inl similarity index 70% rename from runtime/command_queue/dispatch_walker.h rename to runtime/command_queue/gpgpu_walker.inl index 40bbb13bd0..c178bb2ac9 100644 --- a/runtime/command_queue/dispatch_walker.h +++ b/runtime/command_queue/gpgpu_walker.inl @@ -21,24 +21,17 @@ */ #pragma once -#include "runtime/context/context.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/command_queue/local_id_gen.h" -#include "runtime/command_queue/command_queue.h" -#include "runtime/command_queue/dispatch_walker_helper.h" #include "runtime/command_stream/command_stream_receiver.h" -#include "runtime/command_stream/preemption.h" #include "runtime/device/device_info.h" -#include "runtime/device_queue/device_queue_hw.h" #include "runtime/event/perf_counter.h" #include "runtime/event/user_event.h" #include "runtime/indirect_heap/indirect_heap.h" #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/debug_helpers.h" #include "runtime/helpers/kernel_commands.h" -#include "runtime/helpers/task_information.h" #include "runtime/helpers/validators.h" -#include "runtime/helpers/dispatch_info.h" -#include "runtime/kernel/kernel.h" #include "runtime/mem_obj/mem_obj.h" #include "runtime/memory_manager/graphics_allocation.h" #include @@ -46,57 +39,81 @@ namespace OCLRT { -void computeWorkgroupSize1D( - uint32_t maxWorkGroupSize, - size_t workGroupSize[3], - const size_t workItems[3], - size_t simdSize); +// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask +template +void GpgpuWalkerHelper::addAluReadModifyWriteRegister( + OCLRT::LinearStream *pCommandStream, + uint32_t aluRegister, + uint32_t operation, + uint32_t mask) { + // Load "Register" value into CS_GPR_R0 + typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; + typedef typename GfxFamily::MI_MATH MI_MATH; + typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; + auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG))); + *pCmd = MI_LOAD_REGISTER_REG::sInit(); + pCmd->setSourceRegisterAddress(aluRegister); + pCmd->setDestinationRegisterAddress(CS_GPR_R0); -void computeWorkgroupSizeND( - WorkSizeInfo wsInfo, - size_t workGroupSize[3], - const size_t workItems[3], - const uint32_t workDim); + // Load "Mask" into CS_GPR_R1 + typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; + auto pCmd2 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM))); + *pCmd2 = MI_LOAD_REGISTER_IMM::sInit(); + pCmd2->setRegisterOffset(CS_GPR_R1); + pCmd2->setDataDword(mask); -void computeWorkgroupSize2D( - uint32_t maxWorkGroupSize, - size_t workGroupSize[3], - const size_t workItems[3], - size_t simdSize); + // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands + auto pCmd3 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE))); + reinterpret_cast(pCmd3)->DW0.Value = 0x0; + reinterpret_cast(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; + reinterpret_cast(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; + // 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE + reinterpret_cast(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; + pCmd3++; + MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(pCmd3); -void computeWorkgroupSizeSquared( - uint32_t maxWorkGroupSize, - size_t workGroupSize[3], - const size_t workItems[3], - size_t simdSize, - const uint32_t workDim); + // Setup first operand of MI_MATH - load CS_GPR_R0 into register A + pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD; + pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA; + pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0; + pAluParam++; -Vec3 computeWorkgroupSize( - const DispatchInfo &dispatchInfo); + // Setup second operand of MI_MATH - load CS_GPR_R1 into register B + pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD; + pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB; + pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1; + pAluParam++; -Vec3 generateWorkgroupSize( - const DispatchInfo &dispatchInfo); + // Setup third operand of MI_MATH - "Operation" on registers A and B + pAluParam->DW0.BitField.ALUOpcode = operation; + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; -Vec3 computeWorkgroupsNumber( - const Vec3 gws, - const Vec3 lws); + // Setup fourth operand of MI_MATH - store result into CS_GPR_R0 + pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE; + pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0; + pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU; -Vec3 generateWorkgroupsNumber( - const Vec3 gws, - const Vec3 lws); + // LOAD value of CS_GPR_R0 into "Register" + auto pCmd4 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG))); + *pCmd4 = MI_LOAD_REGISTER_REG::sInit(); + pCmd4->setSourceRegisterAddress(CS_GPR_R0); + pCmd4->setDestinationRegisterAddress(aluRegister); -Vec3 generateWorkgroupsNumber( - const DispatchInfo &dispatchInfo); - -Vec3 canonizeWorkgroup( - Vec3 workgroup); - -inline uint32_t calculateDispatchDim(Vec3 dispatchSize, Vec3 dispatchOffset) { - return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim())); + // Add PIPE_CONTROL to flush caches + typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL; + auto pCmd5 = reinterpret_cast(pCommandStream->getSpace(sizeof(PIPE_CONTROL))); + *pCmd5 = PIPE_CONTROL::sInit(); + pCmd5->setCommandStreamerStallEnable(true); + pCmd5->setDcFlushEnable(true); + pCmd5->setTextureCacheInvalidationEnable(true); + pCmd5->setPipeControlFlushEnable(true); + pCmd5->setStateCacheInvalidationEnable(true); } template -inline size_t setGpgpuWalkerThreadData( +inline size_t GpgpuWalkerHelper::setGpgpuWalkerThreadData( typename GfxFamily::GPGPU_WALKER *pCmd, const size_t globalOffsets[3], const size_t startWorkGroups[3], @@ -132,21 +149,8 @@ inline size_t setGpgpuWalkerThreadData( return localWorkSize; } -inline cl_uint computeDimensions(const size_t workItems[3]) { - return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1; -} - -void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo); - -template -IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) { - size_t alignment = MemoryConstants::pageSize; - size_t size = calc(std::forward(args)...); - return new IndirectHeap(alignedMalloc(size, alignment), size); -} - template -void dispatchProfilingCommandsStart( +void GpgpuWalkerHelper::dispatchProfilingCommandsStart( HwTimeStamps &hwTimeStamps, OCLRT::LinearStream *commandStream) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; @@ -173,7 +177,7 @@ void dispatchProfilingCommandsStart( } template -void dispatchProfilingCommandsEnd( +void GpgpuWalkerHelper::dispatchProfilingCommandsEnd( HwTimeStamps &hwTimeStamps, OCLRT::LinearStream *commandStream) { @@ -196,7 +200,7 @@ void dispatchProfilingCommandsEnd( } template -void dispatchPerfCountersNoopidRegisterCommands( +void GpgpuWalkerHelper::dispatchPerfCountersNoopidRegisterCommands( CommandQueue &commandQueue, OCLRT::HwPerfCounter &hwPerfCounter, OCLRT::LinearStream *commandStream, @@ -214,7 +218,7 @@ void dispatchPerfCountersNoopidRegisterCommands( } template -void dispatchPerfCountersReadFreqRegisterCommands( +void GpgpuWalkerHelper::dispatchPerfCountersReadFreqRegisterCommands( CommandQueue &commandQueue, OCLRT::HwPerfCounter &hwPerfCounter, OCLRT::LinearStream *commandStream, @@ -232,7 +236,7 @@ void dispatchPerfCountersReadFreqRegisterCommands( } template -void dispatchPerfCountersGeneralPurposeCounterCommands( +void GpgpuWalkerHelper::dispatchPerfCountersGeneralPurposeCounterCommands( CommandQueue &commandQueue, OCLRT::HwPerfCounter &hwPerfCounter, OCLRT::LinearStream *commandStream, @@ -256,7 +260,7 @@ void dispatchPerfCountersGeneralPurposeCounterCommands( } template -void dispatchPerfCountersUserCounterCommands( +void GpgpuWalkerHelper::dispatchPerfCountersUserCounterCommands( CommandQueue &commandQueue, OCLRT::HwPerfCounter &hwPerfCounter, OCLRT::LinearStream *commandStream, @@ -297,7 +301,7 @@ void dispatchPerfCountersUserCounterCommands( } template -void dispatchPerfCountersOABufferStateCommands( +void GpgpuWalkerHelper::dispatchPerfCountersOABufferStateCommands( CommandQueue &commandQueue, OCLRT::HwPerfCounter &hwPerfCounter, OCLRT::LinearStream *commandStream) { @@ -328,7 +332,7 @@ void dispatchPerfCountersOABufferStateCommands( } template -void dispatchPerfCountersCommandsStart( +void GpgpuWalkerHelper::dispatchPerfCountersCommandsStart( CommandQueue &commandQueue, OCLRT::HwPerfCounter &hwPerfCounter, OCLRT::LinearStream *commandStream) { @@ -347,12 +351,12 @@ void dispatchPerfCountersCommandsStart( pPipeControlCmd->setCommandStreamerStallEnable(true); //Store value of NOOPID register - dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, true); + GpgpuWalkerHelper::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, true); //Read Core Frequency - dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, true); + GpgpuWalkerHelper::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, true); - dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, true); + GpgpuWalkerHelper::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, true); auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT)); *pReportPerfCount = MI_REPORT_PERF_COUNT::sInit(); @@ -369,13 +373,13 @@ void dispatchPerfCountersCommandsStart( pPipeControlCmd->setAddress(static_cast(address & ((uint64_t)UINT32_MAX))); pPipeControlCmd->setAddressHigh(static_cast(address >> 32)); - dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, true); + GpgpuWalkerHelper::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, true); commandQueue.sendPerfCountersConfig(); } template -void dispatchPerfCountersCommandsEnd( +void GpgpuWalkerHelper::dispatchPerfCountersCommandsEnd( CommandQueue &commandQueue, OCLRT::HwPerfCounter &hwPerfCounter, OCLRT::LinearStream *commandStream) { @@ -394,7 +398,7 @@ void dispatchPerfCountersCommandsEnd( *pPipeControlCmd = PIPE_CONTROL::sInit(); pPipeControlCmd->setCommandStreamerStallEnable(true); - dispatchPerfCountersOABufferStateCommands(commandQueue, hwPerfCounter, commandStream); + GpgpuWalkerHelper::dispatchPerfCountersOABufferStateCommands(commandQueue, hwPerfCounter, commandStream); //Timestamp: Global End pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL)); @@ -411,21 +415,21 @@ void dispatchPerfCountersCommandsEnd( address = reinterpret_cast(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Oa)); pReportPerfCount->setMemoryAddress(address); - dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, false); + GpgpuWalkerHelper::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, false); //Store value of NOOPID register - dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, false); + GpgpuWalkerHelper::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, false); //Read Core Frequency - dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, false); + GpgpuWalkerHelper::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, false); - dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, false); + GpgpuWalkerHelper::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, false); perfCounters->setCpuTimestamp(); } template -void dispatchWalker( +void GpgpuWalkerHelper::dispatchWalker( CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, @@ -435,7 +439,7 @@ void dispatchWalker( OCLRT::HwPerfCounter *hwPerfCounter, PreemptionMode preemptionMode, bool blockQueue, - unsigned int commandType = 0) { + unsigned int commandType) { OCLRT::LinearStream *commandStream = nullptr; OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr; @@ -586,17 +590,17 @@ void dispatchWalker( if (&dispatchInfo == &*multiDispatchInfo.begin()) { // If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled if (hwTimeStamps != nullptr) { - dispatchProfilingCommandsStart(*hwTimeStamps, commandStream); + GpgpuWalkerHelper::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream); } if (hwPerfCounter != nullptr) { - dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream); + GpgpuWalkerHelper::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream); } } PreemptionHelper::applyPreemptionWaCmdsBegin(commandStream, commandQueue.getDevice()); // Implement enabling special WA DisableLSQCROPERFforOCL if needed - applyWADisableLSQCROPERFforOCL(commandStream, kernel, true); + GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, kernel, true); // Program the walker. Invokes execution so all state should already be programmed typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER; @@ -606,7 +610,7 @@ void dispatchWalker( size_t globalOffsets[3] = {offset.x, offset.y, offset.z}; size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z}; size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z}; - auto localWorkSize = setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd); + auto localWorkSize = GpgpuWalkerHelper::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd); pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData); DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0); @@ -627,22 +631,22 @@ void dispatchWalker( pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength); // Implement disabling special WA DisableLSQCROPERFforOCL if needed - applyWADisableLSQCROPERFforOCL(commandStream, kernel, false); + GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, kernel, false); PreemptionHelper::applyPreemptionWaCmdsEnd(commandStream, commandQueue.getDevice()); } // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled if (hwTimeStamps != nullptr) { - dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream); + GpgpuWalkerHelper::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream); } if (hwPerfCounter != nullptr) { - dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream); + GpgpuWalkerHelper::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream); } } template -void dispatchWalker( +void GpgpuWalkerHelper::dispatchWalker( CommandQueue &commandQueue, const Kernel &kernel, cl_uint workDim, @@ -658,12 +662,12 @@ void dispatchWalker( bool blockQueue) { DispatchInfo dispatchInfo(const_cast(&kernel), workDim, workItems, localWorkSizesIn, globalOffsets); - dispatchWalker(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList, - blockedCommandsData, hwTimeStamps, hwPerfCounter, preemptionMode, blockQueue); + GpgpuWalkerHelper::dispatchWalker(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList, + blockedCommandsData, hwTimeStamps, hwPerfCounter, preemptionMode, blockQueue); } template -void dispatchScheduler( +void GpgpuWalkerHelper::dispatchScheduler( CommandQueue &commandQueue, DeviceQueueHw &devQueueHw, PreemptionMode preemptionMode, @@ -752,7 +756,7 @@ void dispatchScheduler( preemptionMode); // Implement enabling special WA DisableLSQCROPERFforOCL if needed - applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true); + GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true); // Program the walker. Invokes execution so all state should already be programmed auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER)); @@ -760,7 +764,7 @@ void dispatchScheduler( size_t globalOffsets[3] = {0, 0, 0}; size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1}; - auto localWorkSize = setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd); + auto localWorkSize = GpgpuWalkerHelper::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd); pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData); DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0); @@ -781,7 +785,7 @@ void dispatchScheduler( pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength); // Implement disabling special WA DisableLSQCROPERFforOCL if needed - applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false); + GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false); // Do not put BB_START only when returning in first Scheduler run if (devQueueHw.getSchedulerReturnInstance() != 1) { @@ -797,141 +801,13 @@ void dispatchScheduler( } } -template -struct EnqueueOperation { - static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class"); - static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class"); - static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class"); - static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) { - size_t size = KernelCommandsHelper::getSizeRequiredCS() + - sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper::isPipeControlWArequired() ? 2 : 1); - if (reserveProfilingCmdsSpace) { - size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - } - if (reservePerfCounters) { - //start cmds - //P_C: flush CS & TimeStamp BEGIN - size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL); - //SRM NOOPID & Frequency - size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //gp registers - size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //report perf count - size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); - //user registers - size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - - //end cmds - //P_C: flush CS & TimeStamp END; - size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL); - //OA buffer (status head, tail) - size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //report perf count - size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); - //gp registers - size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //SRM NOOPID & Frequency - size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //user registers - size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - } - Device &device = commandQueue.getDevice(); - for (auto &dispatchInfo : multiDispatchInfo) { - auto &kernel = *dispatchInfo.getKernel(); - size += sizeof(typename GfxFamily::GPGPU_WALKER); - size += getSizeForWADisableLSQCROPERFforOCL(&kernel); - size += PreemptionHelper::getPreemptionWaCsSize(device); - } - return size; - } - - static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) { - size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper::getSizeRequiredCS() + - sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper::isPipeControlWArequired() ? 2 : 1); - size += PreemptionHelper::getPreemptionWaCsSize(commandQueue.getDevice()); - if (reserveProfilingCmdsSpace) { - size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - } - if (reservePerfCounters) { - //start cmds - //P_C: flush CS & TimeStamp BEGIN - size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL); - //SRM NOOPID & Frequency - size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //gp registers - size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //report perf count - size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); - //user registers - size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - - //end cmds - //P_C: flush CS & TimeStamp END; - size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL); - //OA buffer (status head, tail) - size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //report perf count - size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT); - //gp registers - size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //SRM NOOPID & Frequency - size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - //user registers - size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); - } - size += getSizeForWADisableLSQCROPERFforOCL(pKernel); - - return size; - } -}; - -template -LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) { - auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel); - return commandQueue.getCS(expectedSizeCS); +template +void GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { } -template -LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) { - size_t expectedSizeCS = 0; - Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr; - for (auto &dispatchInfo : multiDispatchInfo) { - expectedSizeCS += EnqueueOperation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel()); - } - if (parentKernel && parentKernel->isParentKernel) { - SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(parentKernel->getContext()); - expectedSizeCS += EnqueueOperation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler); - } - return commandQueue.getCS(expectedSizeCS); +template +size_t GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) { + return (size_t)0; } -template -IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) { - size_t expectedSize = 0; - IndirectHeap *ih = nullptr; - - // clang-format off - switch(heapType) { - case IndirectHeap::DYNAMIC_STATE: expectedSize = KernelCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); break; - case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo); break; - case IndirectHeap::SURFACE_STATE: expectedSize = KernelCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); break; - } - // clang-format on - - if (multiDispatchInfo.begin()->getKernel()->isParentKernel) { - if (heapType == IndirectHeap::SURFACE_STATE) { - expectedSize += KernelCommandsHelper::template getSizeRequiredForExecutionModel(const_cast(*(multiDispatchInfo.begin()->getKernel()))); - } else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT) - { - DeviceQueueHw *pDevQueue = castToObject>(commandQueue.getContext().getDefaultDeviceQueue()); - DEBUG_BREAK_IF(pDevQueue == nullptr); - ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE); - } - } - - if (ih == nullptr) - ih = &commandQueue.getIndirectHeap(heapType, expectedSize); - - return *ih; -} } // namespace OCLRT diff --git a/runtime/command_stream/command_stream_receiver_hw.inl b/runtime/command_stream/command_stream_receiver_hw.inl index 2c20ef9930..bbf857ca93 100644 --- a/runtime/command_stream/command_stream_receiver_hw.inl +++ b/runtime/command_stream/command_stream_receiver_hw.inl @@ -32,7 +32,7 @@ #include "runtime/memory_manager/memory_manager.h" #include "runtime/os_interface/debug_settings_manager.h" #include "runtime/command_stream/preemption.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "command_stream_receiver_hw.h" namespace OCLRT { diff --git a/runtime/command_stream/preemption.inl b/runtime/command_stream/preemption.inl index fe5c978b1b..8ed136d2fe 100644 --- a/runtime/command_stream/preemption.inl +++ b/runtime/command_stream/preemption.inl @@ -24,7 +24,7 @@ #include "runtime/built_ins/sip.h" #include "runtime/command_stream/preemption.h" #include "runtime/device/device.h" -#include "runtime/command_queue/dispatch_walker_helper.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/memory_manager/graphics_allocation.h" namespace OCLRT { diff --git a/runtime/device_queue/device_queue_hw.inl b/runtime/device_queue/device_queue_hw.inl index 5a0d953af9..2535a8e3d6 100644 --- a/runtime/device_queue/device_queue_hw.inl +++ b/runtime/device_queue/device_queue_hw.inl @@ -22,8 +22,7 @@ #pragma once #include "runtime/device_queue/device_queue_hw.h" -#include "runtime/command_queue/dispatch_walker.h" -#include "runtime/command_queue/dispatch_walker_helper.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/helpers/kernel_commands.h" #include "runtime/helpers/preamble.h" #include "runtime/helpers/string.h" @@ -217,7 +216,7 @@ void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKer offset = slbCS.getUsed(); igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed()); - applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true); + GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true); using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; @@ -388,10 +387,10 @@ size_t DeviceQueueHw::setSchedulerCrossThreadData(SchedulerKernel &sc template void DeviceQueueHw::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) { - OCLRT::dispatchScheduler(cmdQ, - *this, - preemptionMode, - scheduler); + GpgpuWalkerHelper::dispatchScheduler(cmdQ, + *this, + preemptionMode, + scheduler); return; } diff --git a/runtime/enable_gens.cmake b/runtime/enable_gens.cmake index 4f0873ab97..b487c38020 100644 --- a/runtime/enable_gens.cmake +++ b/runtime/enable_gens.cmake @@ -37,6 +37,7 @@ set(RUNTIME_SRCS_GENX_BASE device_enqueue.h device_queue.cpp command_stream_receiver_hw.cpp + gpgpu_walker.cpp hw_cmds.h hw_cmds_generated.h hw_helper.cpp diff --git a/runtime/gen8/command_queue.cpp b/runtime/gen8/command_queue.cpp index 2e1ed96d25..d66f48ae84 100644 --- a/runtime/gen8/command_queue.cpp +++ b/runtime/gen8/command_queue.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,8 +23,6 @@ #include "runtime/memory_manager/svm_memory_manager.h" #include "runtime/command_queue/command_queue_hw.h" #include "runtime/command_queue/command_queue_hw.inl" -#include "runtime/command_queue/dispatch_walker_helper.h" -#include "runtime/command_queue/dispatch_walker_helper.inl" namespace OCLRT { @@ -37,43 +35,4 @@ void populateFactoryTable>() { commandQueueFactory[gfxCore] = CommandQueueHw::create; } -template <> -void applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { - if (disablePerfMode) { - if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { - // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4 - addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS); - } - } else { - if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { - // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work - typedef typename Family::PIPE_CONTROL PIPE_CONTROL; - auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(PIPE_CONTROL))); - *pCmd = PIPE_CONTROL::sInit(); - pCmd->setCommandStreamerStallEnable(true); - // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4 - addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS); - } - } -} - -template <> -size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) { - typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; - typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; - typedef typename Family::PIPE_CONTROL PIPE_CONTROL; - typedef typename Family::MI_MATH MI_MATH; - typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; - size_t n = 0; - if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { - n += sizeof(PIPE_CONTROL) + - (2 * sizeof(MI_LOAD_REGISTER_REG) + - sizeof(MI_LOAD_REGISTER_IMM) + - sizeof(PIPE_CONTROL) + - sizeof(MI_MATH) + - NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) * - 2; // For 2 WADisableLSQCROPERFforOCL WAs - } - return n; -} } // namespace OCLRT diff --git a/runtime/gen8/gpgpu_walker.cpp b/runtime/gen8/gpgpu_walker.cpp new file mode 100644 index 0000000000..c8ee90c25d --- /dev/null +++ b/runtime/gen8/gpgpu_walker.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "runtime/gen8/hw_info.h" +#include "runtime/command_queue/gpgpu_walker.h" +#include "runtime/command_queue/gpgpu_walker.inl" + +namespace OCLRT { + +template <> +void GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { + if (disablePerfMode) { + if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { + // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4 + GpgpuWalkerHelper::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS); + } + } else { + if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { + // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work + typedef typename BDWFamily::PIPE_CONTROL PIPE_CONTROL; + auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(PIPE_CONTROL))); + *pCmd = PIPE_CONTROL::sInit(); + pCmd->setCommandStreamerStallEnable(true); + // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4 + GpgpuWalkerHelper::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS); + } + } +} + +template <> +size_t GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) { + typedef typename BDWFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; + typedef typename BDWFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; + typedef typename BDWFamily::PIPE_CONTROL PIPE_CONTROL; + typedef typename BDWFamily::MI_MATH MI_MATH; + typedef typename BDWFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; + size_t n = 0; + if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { + n += sizeof(PIPE_CONTROL) + + (2 * sizeof(MI_LOAD_REGISTER_REG) + + sizeof(MI_LOAD_REGISTER_IMM) + + sizeof(PIPE_CONTROL) + + sizeof(MI_MATH) + + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) * + 2; // For 2 WADisableLSQCROPERFforOCL WAs + } + return n; +} + +template class GpgpuWalkerHelper; + +} // namespace OCLRT diff --git a/runtime/gen9/command_queue.cpp b/runtime/gen9/command_queue.cpp index e5fab4e824..42ab5dd0db 100644 --- a/runtime/gen9/command_queue.cpp +++ b/runtime/gen9/command_queue.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,8 +23,6 @@ #include "runtime/memory_manager/svm_memory_manager.h" #include "runtime/command_queue/command_queue_hw.h" #include "runtime/command_queue/command_queue_hw.inl" -#include "runtime/command_queue/dispatch_walker_helper.h" -#include "runtime/command_queue/dispatch_walker_helper.inl" namespace OCLRT { @@ -37,43 +35,4 @@ void populateFactoryTable>() { commandQueueFactory[gfxCore] = CommandQueueHw::create; } -template <> -void applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { - if (disablePerfMode) { - if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { - // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4 - addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS); - } - } else { - if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { - // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work - typedef typename Family::PIPE_CONTROL PIPE_CONTROL; - auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(PIPE_CONTROL))); - *pCmd = PIPE_CONTROL::sInit(); - pCmd->setCommandStreamerStallEnable(true); - // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4 - addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS); - } - } -} - -template <> -size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) { - typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; - typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; - typedef typename Family::PIPE_CONTROL PIPE_CONTROL; - typedef typename Family::MI_MATH MI_MATH; - typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; - size_t n = 0; - if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { - n += sizeof(PIPE_CONTROL) + - (2 * sizeof(MI_LOAD_REGISTER_REG) + - sizeof(MI_LOAD_REGISTER_IMM) + - sizeof(PIPE_CONTROL) + - sizeof(MI_MATH) + - NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) * - 2; // For 2 WADisableLSQCROPERFforOCL WAs - } - return n; -} } // namespace OCLRT diff --git a/runtime/gen9/gpgpu_walker.cpp b/runtime/gen9/gpgpu_walker.cpp new file mode 100644 index 0000000000..668ee963ae --- /dev/null +++ b/runtime/gen9/gpgpu_walker.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "runtime/gen9/hw_cmds_base.h" +#include "runtime/command_queue/gpgpu_walker.h" +#include "runtime/command_queue/gpgpu_walker.inl" + +namespace OCLRT { + +template <> +void GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) { + if (disablePerfMode) { + if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { + // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4 + GpgpuWalkerHelper::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS); + } + } else { + if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { + // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work + typedef typename SKLFamily::PIPE_CONTROL PIPE_CONTROL; + auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(PIPE_CONTROL))); + *pCmd = PIPE_CONTROL::sInit(); + pCmd->setCommandStreamerStallEnable(true); + // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4 + GpgpuWalkerHelper::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS); + } + } +} + +template <> +size_t GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) { + typedef typename SKLFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; + typedef typename SKLFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; + typedef typename SKLFamily::PIPE_CONTROL PIPE_CONTROL; + typedef typename SKLFamily::MI_MATH MI_MATH; + typedef typename SKLFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; + size_t n = 0; + if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { + n += sizeof(PIPE_CONTROL) + + (2 * sizeof(MI_LOAD_REGISTER_REG) + + sizeof(MI_LOAD_REGISTER_IMM) + + sizeof(PIPE_CONTROL) + + sizeof(MI_MATH) + + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) * + 2; // For 2 WADisableLSQCROPERFforOCL WAs + } + return n; +} + +template class GpgpuWalkerHelper; + +} // namespace OCLRT diff --git a/runtime/helpers/dispatch_info_builder.h b/runtime/helpers/dispatch_info_builder.h index d71009260e..c0eb7b3c52 100644 --- a/runtime/helpers/dispatch_info_builder.h +++ b/runtime/helpers/dispatch_info_builder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,7 +24,7 @@ #include "runtime/helpers/dispatch_info.h" #include "runtime/kernel/kernel.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" namespace OCLRT { @@ -67,7 +67,7 @@ enum class RegionCoordZ : uint32_t { Middle = 1, Back = 2 }; -} +} // namespace SplitDispatch // Compute power in compile time static constexpr uint32_t powConst(uint32_t base, uint32_t currExp) { @@ -453,4 +453,4 @@ class DispatchInfoBuilder { return x % y ? 1 : 0; } }; -} +} // namespace OCLRT diff --git a/unit_tests/command_queue/dispatch_walker_tests.cpp b/unit_tests/command_queue/dispatch_walker_tests.cpp index dc1647ba0d..f6827c553c 100644 --- a/unit_tests/command_queue/dispatch_walker_tests.cpp +++ b/unit_tests/command_queue/dispatch_walker_tests.cpp @@ -21,13 +21,14 @@ */ #include "test.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/event/perf_counter.h" #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/kernel_commands.h" #include "runtime/helpers/task_information.h" #include "unit_tests/fixtures/device_fixture.h" #include "unit_tests/command_queue/command_queue_fixture.h" +#include "unit_tests/libult/mock_gfx_family.h" #include "unit_tests/helpers/hw_parse.h" #include "unit_tests/helpers/debug_manager_state_restore.h" #include "unit_tests/mocks/mock_kernel.h" @@ -137,7 +138,7 @@ HWTEST_F(DispatchWalkerTest, shouldntChangeCommandStreamMemory) { size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; cl_uint dimensions = 1; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -185,7 +186,7 @@ HWTEST_F(DispatchWalkerTest, noLocalIdsShouldntCrash) { size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; cl_uint dimensions = 1; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -214,7 +215,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithDefaultLwsAlgorithm) size_t workItems[3] = {1, 1, 1}; for (uint32_t dimension = 1; dimension <= 3; ++dimension) { workItems[dimension - 1] = 256; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimension, @@ -244,7 +245,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithSquaredLwsAlgorithm) size_t workItems[3] = {1, 1, 1}; for (uint32_t dimension = 1; dimension <= 3; ++dimension) { workItems[dimension - 1] = 256; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimension, @@ -273,7 +274,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithNDLwsAlgorithm) { size_t workItems[3] = {1, 1, 1}; for (uint32_t dimension = 1; dimension <= 3; ++dimension) { workItems[dimension - 1] = 256; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimension, @@ -303,7 +304,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithOldLwsAlgorithm) { size_t workItems[3] = {1, 1, 1}; for (uint32_t dimension = 1; dimension <= 3; ++dimension) { workItems[dimension - 1] = 256; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimension, @@ -332,7 +333,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNumWorkGroups) { size_t workItems[3] = {2, 5, 10}; size_t workGroupSize[3] = {1, 1, 1}; cl_uint dimensions = 3; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -363,7 +364,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeND) { size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {2, 5, 10}; cl_uint dimensions = 3; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -394,7 +395,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeND) { size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {2, 5, 10}; cl_uint dimensions = 3; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -426,7 +427,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeSquared) { size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {2, 5, 10}; cl_uint dimensions = 3; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -458,7 +459,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeSquaredAn size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {2, 5, 10}; cl_uint dimensions = 3; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -488,7 +489,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSize) { size_t workItems[3] = {2, 5, 10}; size_t workGroupSize[3] = {1, 2, 3}; cl_uint dimensions = 3; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -521,7 +522,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizes) { size_t workItems[3] = {2, 5, 10}; size_t workGroupSize[3] = {1, 2, 3}; cl_uint dimensions = 3; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -561,7 +562,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizeForSplitKernel) { MockMultiDispatchInfo multiDispatchInfo(std::vector({&di1, &di2})); - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, multiDispatchInfo, 0, @@ -604,7 +605,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizesForSplitWalker) { MockMultiDispatchInfo multiDispatchInfo(std::vector({&di1, &di2})); - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, multiDispatchInfo, 0, @@ -646,7 +647,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerDoesntConsumeCommandStreamWhenQueueIs KernelOperation *blockedCommandsData = nullptr; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -686,7 +687,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromKernelW KernelOperation *blockedCommandsData = nullptr; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, kernel, dimensions, @@ -727,7 +728,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromMdiWhen KernelOperation *blockedCommandsData = nullptr; - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, multiDispatchInfo, 0, @@ -759,7 +760,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfo) { MockMultiDispatchInfo multiDispatchInfo(std::vector({&kernel1, &kernel2})); - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, multiDispatchInfo, 0, @@ -800,7 +801,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoCorrectlyProg indirectHeap.align(KernelCommandsHelper::alignInterfaceDescriptorData); auto dshBeforeMultiDisptach = indirectHeap.getUsed(); - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, multiDispatchInfo, 0, @@ -884,7 +885,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoCorrectlyProg // create commandStream auto &cmdStream = pCmdQ->getCS(0); - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, multiDispatchInfo, 0, @@ -929,7 +930,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoAndDifferentK // create commandStream auto &cmdStream = pCmdQ->getCS(0); - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, multiDispatchInfo, 0, @@ -979,7 +980,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoButSameKernel // create commandStream auto &cmdStream = pCmdQ->getCS(0); - dispatchWalker( + GpgpuWalkerHelper::dispatchWalker( *pCmdQ, multiDispatchInfo, 0, @@ -1030,7 +1031,7 @@ HWTEST_F(DispatchWalkerTest, givenMultiDispatchWhenWhitelistedRegisterForCoheren DispatchInfo di2(&kernel, 1, Vec3(1, 1, 1), Vec3(1, 1, 1), Vec3(0, 0, 0)); MockMultiDispatchInfo multiDispatchInfo(std::vector({&di1, &di2})); - dispatchWalker(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false); + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false); hwParser.parseCommands(cmdStream, 0); @@ -1056,3 +1057,15 @@ TEST(DispatchWalker, calculateDispatchDim) { } } } + +HWTEST_F(DispatchWalkerTest, WhenCallingDefaultWaMethodsThenExpectNothing) { + auto &cmdStream = pCmdQ->getCS(0); + MockKernel kernel(&program, kernelInfo, *pDevice); + EXPECT_EQ(CL_SUCCESS, kernel.initialize()); + + GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(&cmdStream, kernel, false); + + size_t expectedSize = 0; + size_t actualSize = GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(&kernel); + EXPECT_EQ(expectedSize, actualSize); +} diff --git a/unit_tests/command_queue/get_size_required_buffer_tests.cpp b/unit_tests/command_queue/get_size_required_buffer_tests.cpp index beaf35e03e..57d3da77fa 100644 --- a/unit_tests/command_queue/get_size_required_buffer_tests.cpp +++ b/unit_tests/command_queue/get_size_required_buffer_tests.cpp @@ -20,7 +20,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/command_queue/enqueue_fill_buffer.h" #include "runtime/command_queue/enqueue_kernel.h" #include "runtime/command_queue/enqueue_read_buffer.h" @@ -43,8 +43,8 @@ struct GetSizeRequiredBufferTest : public CommandEnqueueFixture, public HelloWorldKernelFixture, public ::testing::Test { - using SimpleArgKernelFixture::SetUp; using HelloWorldKernelFixture::SetUp; + using SimpleArgKernelFixture::SetUp; GetSizeRequiredBufferTest() { } diff --git a/unit_tests/command_queue/get_size_required_image_tests.cpp b/unit_tests/command_queue/get_size_required_image_tests.cpp index a95762630e..56185df77f 100644 --- a/unit_tests/command_queue/get_size_required_image_tests.cpp +++ b/unit_tests/command_queue/get_size_required_image_tests.cpp @@ -22,7 +22,7 @@ #include "runtime/built_ins/built_ins.h" #include "runtime/command_queue/command_queue_hw.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/command_queue/enqueue_copy_image.h" #include "runtime/command_queue/enqueue_fill_image.h" #include "runtime/command_queue/enqueue_read_image.h" diff --git a/unit_tests/command_queue/local_work_size_tests.cpp b/unit_tests/command_queue/local_work_size_tests.cpp index 3fea0011e0..ecccbda145 100644 --- a/unit_tests/command_queue/local_work_size_tests.cpp +++ b/unit_tests/command_queue/local_work_size_tests.cpp @@ -1,5 +1,5 @@ /* -* Copyright (c) 2017, Intel Corporation +* Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -20,7 +20,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/helpers/options.h" #include "unit_tests/mocks/mock_kernel.h" #include "unit_tests/mocks/mock_device.h" diff --git a/unit_tests/command_queue/work_group_size_tests.cpp b/unit_tests/command_queue/work_group_size_tests.cpp index 0bf8ef3859..6e56c78a3c 100644 --- a/unit_tests/command_queue/work_group_size_tests.cpp +++ b/unit_tests/command_queue/work_group_size_tests.cpp @@ -21,7 +21,7 @@ */ #include "hw_cmds.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "unit_tests/fixtures/device_fixture.h" #include "unit_tests/helpers/debug_manager_state_restore.h" #include "test.h" @@ -109,7 +109,7 @@ struct WorkGroupSizeBase : public DeviceFixture { (workItems[0] + workGroupSize[0] - 1) / workGroupSize[0], (workItems[1] + workGroupSize[1] - 1) / workGroupSize[1], (workItems[2] + workGroupSize[2] - 1) / workGroupSize[2]}; - setGpgpuWalkerThreadData(&pCmd, globalOffsets, workGroupsStart, workGroupsNum, workGroupSize, simdSize); + GpgpuWalkerHelper::setGpgpuWalkerThreadData(&pCmd, globalOffsets, workGroupsStart, workGroupsNum, workGroupSize, simdSize); //And check if it is programmed correctly auto numWorkItems = computeWalkerWorkItems(pCmd); diff --git a/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp b/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp index 71d21d8e1a..014f7346b1 100644 --- a/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp +++ b/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp @@ -52,7 +52,7 @@ #include "gtest/gtest.h" #include "runtime/utilities/linux/debug_env_reader.h" #include "runtime/gmm_helper/gmm_helper.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" using namespace OCLRT; diff --git a/unit_tests/context/driver_diagnostics_tests.h b/unit_tests/context/driver_diagnostics_tests.h index ff34b9cead..5b9b8f190c 100644 --- a/unit_tests/context/driver_diagnostics_tests.h +++ b/unit_tests/context/driver_diagnostics_tests.h @@ -21,7 +21,7 @@ */ #pragma once -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/context/context.h" #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/options.h" diff --git a/unit_tests/device_queue/device_queue_hw_tests.cpp b/unit_tests/device_queue/device_queue_hw_tests.cpp index 6676ed6924..b2e2711d39 100644 --- a/unit_tests/device_queue/device_queue_hw_tests.cpp +++ b/unit_tests/device_queue/device_queue_hw_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -31,7 +31,7 @@ #include "unit_tests/mocks/mock_kernel.h" #include "unit_tests/helpers/debug_manager_state_restore.h" -#include "runtime/command_queue/dispatch_walker_helper.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/helpers/kernel_commands.h" #include @@ -330,7 +330,7 @@ HWTEST_F(DeviceQueueSlb, cleanupSection) { if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) { - cleanupSectionOffsetToParse += getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) / 2; + cleanupSectionOffsetToParse += GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) / 2; } hwParser.parseCommands(*slbCS, cleanupSectionOffsetToParse); @@ -394,7 +394,7 @@ HWTEST_F(DeviceQueueSlb, AddEMCleanupSectionWithProfiling) { auto pipeControlItor = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); - if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages && getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) > 0) { + if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages && GpgpuWalkerHelper::getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) > 0) { auto loadRegImmItor = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); EXPECT_NE(hwParser.cmdList.end(), loadRegImmItor); diff --git a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp index b1f19f7e65..bc717aab42 100644 --- a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp +++ b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -20,7 +20,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/command_queue/local_id_gen.h" #include "runtime/device_queue/device_queue_hw.h" #include "runtime/helpers/per_thread_data.h" diff --git a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp index 8c7f783c3b..62be275e86 100644 --- a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp +++ b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp @@ -53,19 +53,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDev size_t executionModelDSHUsedBefore = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE)->getUsed(); - dispatchWalker(*pCmdQ, - *pKernel, - 1, - globalOffsets, - workItems, - nullptr, - 0, - nullptr, - &blockedCommandsData, - nullptr, - nullptr, - pDevice->getPreemptionMode(), - false); + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, + *pKernel, + 1, + globalOffsets, + workItems, + nullptr, + 0, + nullptr, + &blockedCommandsData, + nullptr, + nullptr, + pDevice->getPreemptionMode(), + false); size_t dshUsedAfter = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE).getUsed(); EXPECT_EQ(0u, dshUsedAfter); @@ -109,19 +109,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDef auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT); - dispatchWalker(*pCmdQ, - *pKernel, - 1, - globalOffsets, - workItems, - nullptr, - 0, - nullptr, - &blockedCommandsData, - nullptr, - nullptr, - pDevice->getPreemptionMode(), - false); + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, + *pKernel, + 1, + globalOffsets, + workItems, + nullptr, + 0, + nullptr, + &blockedCommandsData, + nullptr, + nullptr, + pDevice->getPreemptionMode(), + false); auto iohUsed = ioh.getUsed(); EXPECT_EQ(0u, iohUsed); @@ -136,19 +136,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenSSH MockMultiDispatchInfo multiDispatchInfo(pKernel); - dispatchWalker(*pCmdQ, - *pKernel, - 1, - globalOffsets, - workItems, - nullptr, - 0, - nullptr, - &blockedCommandsData, - nullptr, - nullptr, - pDevice->getPreemptionMode(), - false); + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, + *pKernel, + 1, + globalOffsets, + workItems, + nullptr, + 0, + nullptr, + &blockedCommandsData, + nullptr, + nullptr, + pDevice->getPreemptionMode(), + false); auto &ssh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE); @@ -172,19 +172,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsBlockedThenSSHSiz MockMultiDispatchInfo multiDispatchInfo(pKernel); - dispatchWalker(*pCmdQ, - *pKernel, - 1, - globalOffsets, - workItems, - nullptr, - 0, - nullptr, - &blockedCommandsData, - nullptr, - nullptr, - pDevice->getPreemptionMode(), - true); // blockQueue + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, + *pKernel, + 1, + globalOffsets, + workItems, + nullptr, + 0, + nullptr, + &blockedCommandsData, + nullptr, + nullptr, + pDevice->getPreemptionMode(), + true); // blockQueue ASSERT_NE(nullptr, blockedCommandsData); size_t minRequiredSize = KernelCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); @@ -269,19 +269,19 @@ HWTEST_F(MockParentKernelDispatch, GivenBlockedQueueWhenParentKernelIsDispatched const size_t globalOffsets[3] = {0, 0, 0}; const size_t workItems[3] = {1, 1, 1}; - dispatchWalker(*pCmdQ, - *mockParentKernel, - 1, - globalOffsets, - workItems, - nullptr, - 0, - nullptr, - &blockedCommandsData, - nullptr, - nullptr, - pDevice->getPreemptionMode(), - true); // blockQueue + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, + *mockParentKernel, + 1, + globalOffsets, + workItems, + nullptr, + 0, + nullptr, + &blockedCommandsData, + nullptr, + nullptr, + pDevice->getPreemptionMode(), + true); // blockQueue ASSERT_NE(nullptr, blockedCommandsData); @@ -302,19 +302,19 @@ HWTEST_F(MockParentKernelDispatch, GivenParentKernelWhenDispatchedThenMediaInter const size_t globalOffsets[3] = {0, 0, 0}; const size_t workItems[3] = {1, 1, 1}; - dispatchWalker(*pCmdQ, - *mockParentKernel, - 1, - globalOffsets, - workItems, - nullptr, - 0, - nullptr, - &blockedCommandsData, - nullptr, - nullptr, - pDevice->getPreemptionMode(), - false); // blockQueue + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, + *mockParentKernel, + 1, + globalOffsets, + workItems, + nullptr, + 0, + nullptr, + &blockedCommandsData, + nullptr, + nullptr, + pDevice->getPreemptionMode(), + false); // blockQueue LinearStream *commandStream = &pCmdQ->getCS(0); @@ -358,19 +358,19 @@ HWTEST_F(MockParentKernelDispatch, GivenUsedSSHHeapWhenParentKernelIsDispatchedT // If parent is not using SSH, then heap obtained has zero usage and the same buffer ASSERT_EQ(0u, mockParentKernel->getKernelInfo().heapInfo.pKernelHeader->SurfaceStateHeapSize); - dispatchWalker(*pCmdQ, - *mockParentKernel, - 1, - globalOffsets, - workItems, - nullptr, - 0, - nullptr, - &blockedCommandsData, - nullptr, - nullptr, - pDevice->getPreemptionMode(), - false); // blockQueue + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, + *mockParentKernel, + 1, + globalOffsets, + workItems, + nullptr, + 0, + nullptr, + &blockedCommandsData, + nullptr, + nullptr, + pDevice->getPreemptionMode(), + false); // blockQueue EXPECT_EQ(0u, ssh.getUsed()); @@ -393,19 +393,19 @@ HWTEST_F(MockParentKernelDispatch, GivenNotUsedSSHHeapWhenParentKernelIsDispatch auto *bufferMemory = ssh.getCpuBase(); - dispatchWalker(*pCmdQ, - *mockParentKernel, - 1, - globalOffsets, - workItems, - nullptr, - 0, - nullptr, - &blockedCommandsData, - nullptr, - nullptr, - pDevice->getPreemptionMode(), - false); // blockQueue + GpgpuWalkerHelper::dispatchWalker(*pCmdQ, + *mockParentKernel, + 1, + globalOffsets, + workItems, + nullptr, + 0, + nullptr, + &blockedCommandsData, + nullptr, + nullptr, + pDevice->getPreemptionMode(), + false); // blockQueue EXPECT_EQ(bufferMemory, ssh.getCpuBase()); diff --git a/unit_tests/execution_model/scheduler_dispatch_tests.cpp b/unit_tests/execution_model/scheduler_dispatch_tests.cpp index c5ddb59834..adb9b36ace 100644 --- a/unit_tests/execution_model/scheduler_dispatch_tests.cpp +++ b/unit_tests/execution_model/scheduler_dispatch_tests.cpp @@ -72,7 +72,7 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchScheduler) { LinearStream &commandStream = getCommandStream(*pCmdQ, false, false, &scheduler); pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH); - dispatchScheduler( + GpgpuWalkerHelper::dispatchScheduler( *pCmdQ, *pDevQueueHw, pDevice->getPreemptionMode(), @@ -188,7 +188,7 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchSchedulerDoesNotUseStandardCmdQ getCommandStream(*pCmdQ, false, false, &scheduler); pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH); - dispatchScheduler( + GpgpuWalkerHelper::dispatchScheduler( *pCmdQ, *pDevQueueHw, pDevice->getPreemptionMode(), @@ -219,7 +219,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, dispatchSchedulerWithEarlyReturnSetToF LinearStream &commandStream = getCommandStream(*pCmdQ, false, false, &scheduler); pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH); - dispatchScheduler( + GpgpuWalkerHelper::dispatchScheduler( *pCmdQ, mockDevQueue, device->getPreemptionMode(), diff --git a/unit_tests/gen8/scheduler_dispatch_tests.cpp b/unit_tests/gen8/scheduler_dispatch_tests.cpp index e4256d1c17..d27d76ed84 100644 --- a/unit_tests/gen8/scheduler_dispatch_tests.cpp +++ b/unit_tests/gen8/scheduler_dispatch_tests.cpp @@ -22,7 +22,7 @@ #include "runtime/built_ins/built_ins.h" #include "runtime/command_queue/enqueue_kernel.h" -#include "runtime/command_queue/dispatch_walker.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "runtime/device_queue/device_queue.h" #include "runtime/device_queue/device_queue_hw.h" #include "runtime/helpers/kernel_commands.h" @@ -51,7 +51,7 @@ BDWTEST_F(BdwSchedulerTest, givenCallToDispatchSchedulerWhenPipeControlWithCSSta LinearStream &commandStream = getCommandStream(*pCmdQ, false, false, &scheduler); pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH); - dispatchScheduler( + GpgpuWalkerHelper::dispatchScheduler( *pCmdQ, *pDevQueueHw, pDevice->getPreemptionMode(), diff --git a/unit_tests/gen9/test_device_queue_hw.cpp b/unit_tests/gen9/test_device_queue_hw.cpp index 64b7b636fd..4f9365a122 100644 --- a/unit_tests/gen9/test_device_queue_hw.cpp +++ b/unit_tests/gen9/test_device_queue_hw.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -21,7 +21,7 @@ */ #include "runtime/context/context.h" -#include "runtime/command_queue/dispatch_walker_helper.h" +#include "runtime/command_queue/gpgpu_walker.h" #include "unit_tests/fixtures/device_host_queue_fixture.h" #include "unit_tests/helpers/hw_parse.h" #include "unit_tests/mocks/mock_device_queue.h" diff --git a/unit_tests/libult/mock_gfx_family.cpp b/unit_tests/libult/mock_gfx_family.cpp index 4f0a834749..f31c39eecb 100644 --- a/unit_tests/libult/mock_gfx_family.cpp +++ b/unit_tests/libult/mock_gfx_family.cpp @@ -21,12 +21,23 @@ */ #include "unit_tests/libult/mock_gfx_family.h" +#include "runtime/command_queue/gpgpu_walker.inl" +#include "runtime/command_stream/preemption.inl" +#include "runtime/device_queue/device_queue_hw.h" +#include "runtime/device_queue/device_queue_hw.inl" #include "runtime/helpers/hw_helper.inl" +#include "runtime/helpers/kernel_commands.inl" +#include "runtime/helpers/preamble.inl" namespace OCLRT { bool (*GENX::isSimulationFcn)(unsigned short) = nullptr; +GENX::GPGPU_WALKER GENX::cmdInitGpgpuWalker = GENX::GPGPU_WALKER::sInit(); +GENX::INTERFACE_DESCRIPTOR_DATA GENX::cmdInitInterfaceDescriptorData = GENX::INTERFACE_DESCRIPTOR_DATA::sInit(); +GENX::MEDIA_STATE_FLUSH GENX::cmdInitMediaStateFlush = GENX::MEDIA_STATE_FLUSH::sInit(); +GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD GENX::cmdInitMediaInterfaceDescriptorLoad = GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD::sInit(); + template <> size_t HwHelperHw::getMaxBarrierRegisterPerSlice() const { return 32; @@ -57,4 +68,89 @@ struct hw_helper_static_init { template class HwHelperHw; hw_helper_static_init si; + +template class GpgpuWalkerHelper; + +template <> +bool KernelCommandsHelper::isPipeControlWArequired() { + return false; +} + +template struct KernelCommandsHelper; + +template <> +size_t PreemptionHelper::getRequiredCmdStreamSize(PreemptionMode newPreemptionMode, PreemptionMode oldPreemptionMode) { + return 0; +} + +template <> +void PreemptionHelper::programCmdStream(LinearStream &cmdStream, PreemptionMode newPreemptionMode, PreemptionMode oldPreemptionMode, + GraphicsAllocation *preemptionCsr, Device &device) { +} + +template <> +size_t PreemptionHelper::getRequiredPreambleSize(const Device &device) { + return 0; +} + +template <> +void PreemptionHelper::programPreamble(LinearStream &preambleCmdStream, Device &device, + const GraphicsAllocation *preemptionCsr) { +} + +template <> +size_t PreemptionHelper::getPreemptionWaCsSize(const Device &device) { + return 0; +} + +template void PreemptionHelper::programInterfaceDescriptorDataPreemption(INTERFACE_DESCRIPTOR_DATA *idd, PreemptionMode preemptionMode); + +template <> +size_t DeviceQueueHw::getWaCommandsSize() { + return (size_t)0; +} + +template <> +void DeviceQueueHw::addArbCheckCmdWa() { +} + +template <> +void DeviceQueueHw::addMiAtomicCmdWa(uint64_t atomicOpPlaceholder) { +} + +template <> +void DeviceQueueHw::addLriCmdWa(bool setArbCheck) { +} + +template <> +void DeviceQueueHw::addPipeControlCmdWa(bool isNoopCmd) { +} + +template <> +void DeviceQueueHw::addProfilingEndCmds(uint64_t timestampAddress) { +} + +template class DeviceQueueHw; + +template <> +void PreambleHelper::addPipeControlBeforeVfeCmd(LinearStream *pCommandStream, const HardwareInfo *hwInfo) { +} + +template <> +uint32_t PreambleHelper::getL3Config(const HardwareInfo &hwInfo, bool useSLM) { + uint32_t l3Config = 0; + return l3Config; +} + +template <> +void PreambleHelper::programPipelineSelect(LinearStream *pCommandStream, bool mediaSamplerRequired) { +} + +template <> +struct L3CNTLRegisterOffset { + static const uint32_t registerOffset = 0x7034; +}; + +template struct PreambleHelper; + } // namespace OCLRT diff --git a/unit_tests/libult/mock_gfx_family.h b/unit_tests/libult/mock_gfx_family.h index 7b723bd687..2bd8049279 100644 --- a/unit_tests/libult/mock_gfx_family.h +++ b/unit_tests/libult/mock_gfx_family.h @@ -31,9 +31,71 @@ extern HwHelper *hwHelperFactory[IGFX_MAX_CORE]; struct GENX { static bool (*isSimulationFcn)(unsigned short); typedef struct tagINTERFACE_DESCRIPTOR_DATA { + typedef enum tagDENORM_MODE { + DENORM_MODE_FTZ = 0x0, + DENORM_MODE_SETBYKERNEL = 0x1, + } DENORM_MODE; + typedef enum tagSAMPLERSTATEPOINTER { + SAMPLERSTATEPOINTER_BIT_SHIFT = 0x5, + SAMPLERSTATEPOINTER_ALIGN_SIZE = 0x20, + } SAMPLERSTATEPOINTER; + typedef enum tagSAMPLER_COUNT { + SAMPLER_COUNT_NO_SAMPLERS_USED = 0x0, + SAMPLER_COUNT_BETWEEN_1_AND_4_SAMPLERS_USED = 0x1, + SAMPLER_COUNT_BETWEEN_5_AND_8_SAMPLERS_USED = 0x2, + SAMPLER_COUNT_BETWEEN_9_AND_12_SAMPLERS_USED = 0x3, + SAMPLER_COUNT_BETWEEN_13_AND_16_SAMPLERS_USED = 0x4, + } SAMPLER_COUNT; + typedef enum tagSHARED_LOCAL_MEMORY_SIZE { + SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K = 0x0, + SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K = 0x1, + SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K = 0x2, + SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K = 0x3, + SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K = 0x4, + SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K = 0x5, + SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K = 0x6, + SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K = 0x7, + } SHARED_LOCAL_MEMORY_SIZE; + typedef enum tagBINDINGTABLEPOINTER { + BINDINGTABLEPOINTER_BIT_SHIFT = 0x5, + BINDINGTABLEPOINTER_ALIGN_SIZE = 0x20, + } BINDINGTABLEPOINTER; + static tagINTERFACE_DESCRIPTOR_DATA sInit(void) { + INTERFACE_DESCRIPTOR_DATA state; + return state; + } + inline void setKernelStartPointerHigh(const uint32_t value) { + } + inline void setKernelStartPointer(const uint64_t value) { + } + inline void setNumberOfThreadsInGpgpuThreadGroup(const uint32_t value) { + } + inline void setCrossThreadConstantDataReadLength(const uint32_t value) { + } + inline void setDenormMode(const DENORM_MODE value) { + } + inline void setConstantIndirectUrbEntryReadLength(const uint32_t value) { + } + inline void setBindingTablePointer(const uint64_t value) { + } + inline void setSamplerStatePointer(const uint64_t value) { + } + inline void setSamplerCount(const SAMPLER_COUNT value) { + } + inline void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE value) { + } + inline void setBarrierEnable(const bool value) { + } } INTERFACE_DESCRIPTOR_DATA; typedef struct tagBINDING_TABLE_STATE { + inline void init(void) { + } + inline uint32_t getSurfaceStatePointer(void) const { + return 0u; + } + inline void setSurfaceStatePointer(const uint64_t value) { + } inline uint32_t getRawData(const uint32_t index) { return 0; } @@ -42,6 +104,247 @@ struct GENX { SURFACESTATEPOINTER_ALIGN_SIZE = 0x40, } SURFACESTATEPOINTER; } BINDING_TABLE_STATE; + + typedef struct tagGPGPU_WALKER { + typedef enum tagSIMD_SIZE { + SIMD_SIZE_SIMD8 = 0x0, + SIMD_SIZE_SIMD16 = 0x1, + SIMD_SIZE_SIMD32 = 0x2, + } SIMD_SIZE; + typedef enum tagINDIRECTDATASTARTADDRESS { + INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6, + INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40, + } INDIRECTDATASTARTADDRESS; + static tagGPGPU_WALKER sInit(void) { + GPGPU_WALKER state; + return state; + } + inline void setThreadWidthCounterMaximum(const uint32_t value) { + } + inline void setThreadGroupIdXDimension(const uint32_t value) { + } + inline void setThreadGroupIdYDimension(const uint32_t value) { + } + inline void setThreadGroupIdZDimension(const uint32_t value) { + } + inline void setRightExecutionMask(const uint32_t value) { + } + inline void setBottomExecutionMask(const uint32_t value) { + } + inline void setSimdSize(const SIMD_SIZE value) { + } + inline void setThreadGroupIdStartingX(const uint32_t value) { + } + inline void setThreadGroupIdStartingY(const uint32_t value) { + } + inline void setThreadGroupIdStartingResumeZ(const uint32_t value) { + } + inline void setIndirectDataStartAddress(const uint32_t value) { + } + inline void setInterfaceDescriptorOffset(const uint32_t value) { + } + inline void setIndirectDataLength(const uint32_t value) { + } + } GPGPU_WALKER; + + typedef struct tagPIPE_CONTROL { + typedef enum tagPOST_SYNC_OPERATION { + POST_SYNC_OPERATION_NO_WRITE = 0x0, + POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA = 0x1, + POST_SYNC_OPERATION_WRITE_PS_DEPTH_COUNT = 0x2, + POST_SYNC_OPERATION_WRITE_TIMESTAMP = 0x3, + } POST_SYNC_OPERATION; + static tagPIPE_CONTROL sInit(void) { + PIPE_CONTROL state; + return state; + } + inline void setCommandStreamerStallEnable(const uint32_t value) { + } + inline void setDcFlushEnable(const bool value) { + } + inline void setStateCacheInvalidationEnable(const bool value) { + } + inline void setPipeControlFlushEnable(const bool value) { + } + inline void setTextureCacheInvalidationEnable(const bool value) { + } + inline void setPostSyncOperation(const POST_SYNC_OPERATION value) { + } + inline void setAddress(const uint32_t value) { + } + inline void setAddressHigh(const uint32_t value) { + } + inline void setImmediateData(const uint64_t value) { + } + inline void setGenericMediaStateClear(const bool value) { + } + } PIPE_CONTROL; + + typedef struct tagMI_LOAD_REGISTER_IMM { + static tagMI_LOAD_REGISTER_IMM sInit(void) { + MI_LOAD_REGISTER_IMM state; + return state; + } + inline void setRegisterOffset(const uint32_t value) { + } + inline void setDataDword(const uint32_t value) { + } + } MI_LOAD_REGISTER_IMM; + + typedef struct tagMI_LOAD_REGISTER_REG { + static tagMI_LOAD_REGISTER_REG sInit(void) { + MI_LOAD_REGISTER_REG state; + return state; + } + inline void setSourceRegisterAddress(const uint32_t value) { + } + inline void setDestinationRegisterAddress(const uint32_t value) { + } + } MI_LOAD_REGISTER_REG; + + typedef struct tagMI_MATH { + union _DW0 { + struct _BitField { + uint32_t DwordLength : BITFIELD_RANGE(0, 5); + uint32_t Reserved : BITFIELD_RANGE(6, 22); + uint32_t InstructionOpcode : BITFIELD_RANGE(23, 28); + uint32_t InstructionType : BITFIELD_RANGE(29, 31); + } BitField; + uint32_t Value; + } DW0; + typedef enum tagMI_COMMAND_OPCODE { + MI_COMMAND_OPCODE_MI_MATH = 0x0, + } MI_COMMAND_OPCODE; + typedef enum tagCOMMAND_TYPE { + COMMAND_TYPE_MI_COMMAND = 0x0, + } COMMAND_TYPE; + } MI_MATH; + + typedef struct tagMI_MATH_ALU_INST_INLINE { + union _DW0 { + struct _BitField { + uint32_t Operand2 : BITFIELD_RANGE(0, 9); + uint32_t Operand1 : BITFIELD_RANGE(10, 19); + uint32_t ALUOpcode : BITFIELD_RANGE(20, 31); + } BitField; + uint32_t Value; + } DW0; + } MI_MATH_ALU_INST_INLINE; + + typedef struct tagMI_COMMAND_OPCODE_MI_MATH { + } MI_COMMAND_OPCODE_MI_MATH; + + typedef struct tagMI_STORE_REGISTER_MEM { + static tagMI_STORE_REGISTER_MEM sInit(void) { + MI_STORE_REGISTER_MEM state; + return state; + } + inline void setRegisterAddress(const uint32_t value) { + } + inline void setMemoryAddress(const uint64_t value) { + } + } MI_STORE_REGISTER_MEM; + + typedef struct tagMI_REPORT_PERF_COUNT { + static tagMI_REPORT_PERF_COUNT sInit(void) { + MI_REPORT_PERF_COUNT state; + return state; + } + inline void setReportId(const uint32_t value) { + } + inline void setMemoryAddress(const uint64_t value) { + } + } MI_REPORT_PERF_COUNT; + + typedef struct tagMI_BATCH_BUFFER_START { + typedef enum tagSECOND_LEVEL_BATCH_BUFFER { + SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH = 0x0, + SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH = 0x1, + } SECOND_LEVEL_BATCH_BUFFER; + static tagMI_BATCH_BUFFER_START sInit(void) { + MI_BATCH_BUFFER_START state; + return state; + } + inline void setSecondLevelBatchBuffer(const SECOND_LEVEL_BATCH_BUFFER value) { + } + inline void setBatchBufferStartAddressGraphicsaddress472(const uint64_t value) { + } + } MI_BATCH_BUFFER_START; + + typedef struct tagMEDIA_STATE_FLUSH { + static tagMEDIA_STATE_FLUSH sInit(void) { + MEDIA_STATE_FLUSH state; + return state; + } + inline void setInterfaceDescriptorOffset(const uint32_t value) { + } + } MEDIA_STATE_FLUSH; + + typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD { + static tagMEDIA_INTERFACE_DESCRIPTOR_LOAD sInit(void) { + MEDIA_INTERFACE_DESCRIPTOR_LOAD state; + return state; + } + inline void setInterfaceDescriptorDataStartAddress(const uint32_t value) { + } + inline void setInterfaceDescriptorTotalLength(const uint32_t value) { + } + } MEDIA_INTERFACE_DESCRIPTOR_LOAD; + + typedef struct tagMI_BATCH_BUFFER_END { + static tagMI_BATCH_BUFFER_END sInit(void) { + MI_BATCH_BUFFER_END state; + return state; + } + } MI_BATCH_BUFFER_END; + + typedef struct tagRENDER_SURFACE_STATE { + } RENDER_SURFACE_STATE; + + typedef struct tagMEDIA_VFE_STATE { + static tagMEDIA_VFE_STATE sInit(void) { + MEDIA_VFE_STATE state; + return state; + } + inline void setMaximumNumberOfThreads(const uint32_t value) { + } + inline void setNumberOfUrbEntries(const uint32_t value) { + } + inline void setUrbEntryAllocationSize(const uint32_t value) { + } + inline void setPerThreadScratchSpace(const uint32_t value) { + } + inline void setStackSize(const uint32_t value) { + } + inline void setScratchSpaceBasePointer(const uint32_t value) { + } + inline void setScratchSpaceBasePointerHigh(const uint32_t value) { + } + } MEDIA_VFE_STATE; + + typedef struct tagSAMPLER_STATE { + inline void setIndirectStatePointer(const uint32_t indirectStatePointerValue) { + } + } SAMPLER_STATE; + + typedef struct tagGPGPU_CSR_BASE_ADDRESS { + inline void init(void) { + } + inline void setGpgpuCsrBaseAddress(uint64_t value) { + } + } GPGPU_CSR_BASE_ADDRESS; + + typedef struct tagSTATE_SIP { + inline void init(void) { + } + inline void setSystemInstructionPointer(uint64_t value) { + } + } STATE_SIP; + + static GPGPU_WALKER cmdInitGpgpuWalker; + static INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData; + static MEDIA_STATE_FLUSH cmdInitMediaStateFlush; + static MEDIA_INTERFACE_DESCRIPTOR_LOAD cmdInitMediaInterfaceDescriptorLoad; }; } // namespace OCLRT