Create GpgpuWalkerHelper class
Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b
This commit is contained in:
parent
d51f2cd1ec
commit
b6b92ae808
|
@ -25,9 +25,6 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
|
|||
${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cpu_data_transfer_handler.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_helper.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_helper.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_barrier.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_common.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_copy_buffer.h
|
||||
|
@ -49,6 +46,8 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
|
|||
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_image.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/finish.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/flush.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -20,7 +20,7 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_queue/enqueue_barrier.h"
|
||||
#include "runtime/command_queue/enqueue_copy_buffer.h"
|
||||
#include "runtime/command_queue/enqueue_copy_buffer_rect.h"
|
||||
|
|
|
@ -1,56 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
// Number of MI_MATH_ALU_INST_INLINE entries that follow the MI_MATH header in
// one register read-modify-write sequence.
constexpr int NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4;

// L3SQC_REG4 register offset and the LQSC_RO_PERF_DIS bit mask
// (meaning taken from the identifiers; confirm against the hardware spec).
constexpr int L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000;
constexpr int L3SQC_REG4 = 0xB118;

// Cookie values written around a walker. 0xFFFFFFFF does not fit in a signed
// int, so the conversion is spelled out instead of relying on an implicit
// value-changing conversion; the stored value is still -1 (all bits set).
constexpr int GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = static_cast<int>(0xFFFFFFFFu);
constexpr int GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000;

// Register offsets of the command streamer general purpose registers R0 and R1.
constexpr int CS_GPR_R0 = 0x2600;
constexpr int CS_GPR_R1 = 0x2608;

// Opcodes placed in the ALUOpcode field of an MI_MATH ALU instruction.
constexpr int ALU_OPCODE_LOAD = 0x080;
constexpr int ALU_OPCODE_STORE = 0x180;
constexpr int ALU_OPCODE_OR = 0x103;
constexpr int ALU_OPCODE_AND = 0x102;

// Register selectors placed in the Operand1/Operand2 fields of an ALU instruction.
constexpr int ALU_REGISTER_R_0 = 0x0;
constexpr int ALU_REGISTER_R_1 = 0x1;
constexpr int ALU_REGISTER_R_SRCA = 0x20;
constexpr int ALU_REGISTER_R_SRCB = 0x21;
constexpr int ALU_REGISTER_R_ACCU = 0x31;

// NOTE(review): presumably the low dword of the GP thread timestamp register - confirm.
constexpr unsigned int GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8;
|
||||
|
||||
template <typename GfxFamily>
|
||||
void applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode);
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
|
||||
} // namespace OCLRT
|
|
@ -1,99 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "runtime/command_queue/dispatch_walker_helper.h"
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask
|
||||
template <typename GfxFamily>
|
||||
void addAluReadModifyWriteRegister(
|
||||
OCLRT::LinearStream *pCommandStream,
|
||||
uint32_t aluRegister,
|
||||
uint32_t operation,
|
||||
uint32_t mask) {
|
||||
// Load "Register" value into CS_GPR_R0
|
||||
typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename GfxFamily::MI_MATH MI_MATH;
|
||||
typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
auto pCmd = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
|
||||
*pCmd = MI_LOAD_REGISTER_REG::sInit();
|
||||
pCmd->setSourceRegisterAddress(aluRegister);
|
||||
pCmd->setDestinationRegisterAddress(CS_GPR_R0);
|
||||
|
||||
// Load "Mask" into CS_GPR_R1
|
||||
typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
auto pCmd2 = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM)));
|
||||
*pCmd2 = MI_LOAD_REGISTER_IMM::sInit();
|
||||
pCmd2->setRegisterOffset(CS_GPR_R1);
|
||||
pCmd2->setDataDword(mask);
|
||||
|
||||
// Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands
|
||||
auto pCmd3 = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.Value = 0x0;
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
|
||||
// 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
|
||||
pCmd3++;
|
||||
MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(pCmd3);
|
||||
|
||||
// Setup first operand of MI_MATH - load CS_GPR_R0 into register A
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0;
|
||||
pAluParam++;
|
||||
|
||||
// Setup second operand of MI_MATH - load CS_GPR_R1 into register B
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1;
|
||||
pAluParam++;
|
||||
|
||||
// Setup third operand of MI_MATH - "Operation" on registers A and B
|
||||
pAluParam->DW0.BitField.ALUOpcode = operation;
|
||||
pAluParam->DW0.BitField.Operand1 = 0;
|
||||
pAluParam->DW0.BitField.Operand2 = 0;
|
||||
pAluParam++;
|
||||
|
||||
// Setup fourth operand of MI_MATH - store result into CS_GPR_R0
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;
|
||||
|
||||
// LOAD value of CS_GPR_R0 into "Register"
|
||||
auto pCmd4 = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
|
||||
*pCmd4 = MI_LOAD_REGISTER_REG::sInit();
|
||||
pCmd4->setSourceRegisterAddress(CS_GPR_R0);
|
||||
pCmd4->setDestinationRegisterAddress(aluRegister);
|
||||
|
||||
// Add PIPE_CONTROL to flush caches
|
||||
typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
|
||||
auto pCmd5 = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pCmd5 = PIPE_CONTROL::sInit();
|
||||
pCmd5->setCommandStreamerStallEnable(true);
|
||||
pCmd5->setDcFlushEnable(true);
|
||||
pCmd5->setTextureCacheInvalidationEnable(true);
|
||||
pCmd5->setPipeControlFlushEnable(true);
|
||||
pCmd5->setStateCacheInvalidationEnable(true);
|
||||
}
|
||||
} // namespace OCLRT
|
|
@ -24,7 +24,7 @@
|
|||
#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
|
||||
#include "hw_cmds.h"
|
||||
#include "runtime/command_queue/command_queue_hw.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_stream/command_stream_receiver.h"
|
||||
#include "runtime/event/event_builder.h"
|
||||
#include "runtime/gtpin/gtpin_notify.h"
|
||||
|
@ -243,7 +243,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
|||
}
|
||||
}
|
||||
|
||||
dispatchWalker<GfxFamily>(
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
|
||||
*this,
|
||||
multiDispatchInfo,
|
||||
numEventsInWaitList,
|
||||
|
@ -293,7 +293,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
|||
this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation(),
|
||||
devQueueHw->getDebugQueue());
|
||||
|
||||
dispatchScheduler<GfxFamily>(
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
|
||||
*this,
|
||||
*devQueueHw,
|
||||
preemption,
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -24,7 +24,7 @@
|
|||
#include "hw_cmds.h"
|
||||
#include "runtime/command_queue/command_queue_hw.h"
|
||||
#include "runtime/command_stream/command_stream_receiver.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
#include "runtime/helpers/task_information.h"
|
||||
#include "runtime/mem_obj/buffer.h"
|
||||
|
@ -69,7 +69,7 @@ struct EnqueueOperation<GfxFamily, CL_COMMAND_NDRANGE_KERNEL> {
|
|||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
|
||||
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -24,7 +24,7 @@
|
|||
#include "hw_cmds.h"
|
||||
#include "runtime/command_queue/command_queue_hw.h"
|
||||
#include "runtime/command_stream/command_stream_receiver.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/device/device.h"
|
||||
#include "runtime/event/event.h"
|
||||
#include "runtime/memory_manager/surface.h"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -24,7 +24,7 @@
|
|||
#include "hw_cmds.h"
|
||||
#include "runtime/command_queue/command_queue_hw.h"
|
||||
#include "runtime/command_stream/command_stream_receiver.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/device/device.h"
|
||||
#include "runtime/event/event.h"
|
||||
#include "runtime/memory_manager/surface.h"
|
||||
|
|
|
@ -0,0 +1,371 @@
|
|||
/*
|
||||
* Copyright (c) 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "runtime/built_ins/built_ins.h"
|
||||
#include "runtime/context/context.h"
|
||||
#include "runtime/command_queue/command_queue.h"
|
||||
#include "runtime/command_stream/linear_stream.h"
|
||||
#include "runtime/command_stream/preemption.h"
|
||||
#include "runtime/device_queue/device_queue_hw.h"
|
||||
#include "runtime/event/hw_timestamps.h"
|
||||
#include "runtime/event/perf_counter.h"
|
||||
#include "runtime/helpers/dispatch_info.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
#include "runtime/helpers/task_information.h"
|
||||
#include "runtime/indirect_heap/indirect_heap.h"
|
||||
#include "runtime/kernel/kernel.h"
|
||||
#include "runtime/program/kernel_info.h"
|
||||
#include "runtime/utilities/vec.h"
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
// Number of MI_MATH_ALU_INST_INLINE entries that follow the MI_MATH header in
// one register read-modify-write sequence.
constexpr int32_t NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4;

// L3SQC_REG4 register offset and the LQSC_RO_PERF_DIS bit mask
// (meaning taken from the identifiers; confirm against the hardware spec).
constexpr int32_t L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000;
constexpr int32_t L3SQC_REG4 = 0xB118;

// Cookie values written around a walker. 0xFFFFFFFF does not fit in int32_t,
// so the conversion is spelled out instead of relying on an implicit
// value-changing conversion; the stored value is still -1 (all bits set).
constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = static_cast<int32_t>(0xFFFFFFFFu);
constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000;

// Register offsets of the command streamer general purpose registers R0 and R1.
constexpr int32_t CS_GPR_R0 = 0x2600;
constexpr int32_t CS_GPR_R1 = 0x2608;

// Opcodes placed in the ALUOpcode field of an MI_MATH ALU instruction.
constexpr int32_t ALU_OPCODE_LOAD = 0x080;
constexpr int32_t ALU_OPCODE_STORE = 0x180;
constexpr int32_t ALU_OPCODE_OR = 0x103;
constexpr int32_t ALU_OPCODE_AND = 0x102;

// Register selectors placed in the Operand1/Operand2 fields of an ALU instruction.
constexpr int32_t ALU_REGISTER_R_0 = 0x0;
constexpr int32_t ALU_REGISTER_R_1 = 0x1;
constexpr int32_t ALU_REGISTER_R_SRCA = 0x20;
constexpr int32_t ALU_REGISTER_R_SRCB = 0x21;
constexpr int32_t ALU_REGISTER_R_ACCU = 0x31;

// NOTE(review): presumably the low dword of the GP thread timestamp register - confirm.
constexpr uint32_t GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8;
|
||||
|
||||
void computeWorkgroupSize1D(
|
||||
uint32_t maxWorkGroupSize,
|
||||
size_t workGroupSize[3],
|
||||
const size_t workItems[3],
|
||||
size_t simdSize);
|
||||
|
||||
void computeWorkgroupSizeND(
|
||||
WorkSizeInfo wsInfo,
|
||||
size_t workGroupSize[3],
|
||||
const size_t workItems[3],
|
||||
const uint32_t workDim);
|
||||
|
||||
void computeWorkgroupSize2D(
|
||||
uint32_t maxWorkGroupSize,
|
||||
size_t workGroupSize[3],
|
||||
const size_t workItems[3],
|
||||
size_t simdSize);
|
||||
|
||||
void computeWorkgroupSizeSquared(
|
||||
uint32_t maxWorkGroupSize,
|
||||
size_t workGroupSize[3],
|
||||
const size_t workItems[3],
|
||||
size_t simdSize,
|
||||
const uint32_t workDim);
|
||||
|
||||
Vec3<size_t> computeWorkgroupSize(
|
||||
const DispatchInfo &dispatchInfo);
|
||||
|
||||
Vec3<size_t> generateWorkgroupSize(
|
||||
const DispatchInfo &dispatchInfo);
|
||||
|
||||
Vec3<size_t> computeWorkgroupsNumber(
|
||||
const Vec3<size_t> gws,
|
||||
const Vec3<size_t> lws);
|
||||
|
||||
Vec3<size_t> generateWorkgroupsNumber(
|
||||
const Vec3<size_t> gws,
|
||||
const Vec3<size_t> lws);
|
||||
|
||||
Vec3<size_t> generateWorkgroupsNumber(
|
||||
const DispatchInfo &dispatchInfo);
|
||||
|
||||
// Returns the larger of the simplified dimensions of dispatchSize and
// dispatchOffset, clamped so the result is never below 1.
inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
    const auto sizeDim = dispatchSize.getSimplifiedDim();
    const auto offsetDim = dispatchOffset.getSimplifiedDim();
    const auto largerDim = std::max(sizeDim, offsetDim);
    return std::max(1U, largerDim);
}
|
||||
|
||||
Vec3<size_t> canonizeWorkgroup(
|
||||
Vec3<size_t> workgroup);
|
||||
|
||||
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
|
||||
|
||||
// Returns the number of dimensions in use: 3 when workItems[2] > 1,
// otherwise 2 when workItems[1] > 1, otherwise 1.
inline cl_uint computeDimensions(const size_t workItems[3]) {
    if (workItems[2] > 1) {
        return 3;
    }
    if (workItems[1] > 1) {
        return 2;
    }
    return 1;
}
|
||||
|
||||
template <typename SizeAndAllocCalcT, typename... CalcArgsT>
|
||||
IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
|
||||
size_t alignment = MemoryConstants::pageSize;
|
||||
size_t size = calc(std::forward<CalcArgsT>(args)...);
|
||||
return new IndirectHeap(alignedMalloc(size, alignment), size);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
class GpgpuWalkerHelper {
|
||||
public:
|
||||
static void addAluReadModifyWriteRegister(
|
||||
LinearStream *pCommandStream,
|
||||
uint32_t aluRegister,
|
||||
uint32_t operation,
|
||||
uint32_t mask);
|
||||
|
||||
static void applyWADisableLSQCROPERFforOCL(LinearStream *pCommandStream,
|
||||
const Kernel &kernel,
|
||||
bool disablePerfMode);
|
||||
|
||||
static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
|
||||
|
||||
static size_t setGpgpuWalkerThreadData(
|
||||
typename GfxFamily::GPGPU_WALKER *pCmd,
|
||||
const size_t globalOffsets[3],
|
||||
const size_t startWorkGroups[3],
|
||||
const size_t numWorkGroups[3],
|
||||
const size_t localWorkSizesIn[3],
|
||||
uint32_t simd);
|
||||
|
||||
static void dispatchProfilingCommandsStart(
|
||||
HwTimeStamps &hwTimeStamps,
|
||||
OCLRT::LinearStream *commandStream);
|
||||
|
||||
static void dispatchProfilingCommandsEnd(
|
||||
HwTimeStamps &hwTimeStamps,
|
||||
OCLRT::LinearStream *commandStream);
|
||||
|
||||
static void dispatchPerfCountersNoopidRegisterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream,
|
||||
bool start);
|
||||
|
||||
static void dispatchPerfCountersReadFreqRegisterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream,
|
||||
bool start);
|
||||
|
||||
static void dispatchPerfCountersGeneralPurposeCounterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream,
|
||||
bool start);
|
||||
|
||||
static void dispatchPerfCountersUserCounterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream,
|
||||
bool start);
|
||||
|
||||
static void dispatchPerfCountersOABufferStateCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream);
|
||||
|
||||
static void dispatchPerfCountersCommandsStart(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream);
|
||||
|
||||
static void dispatchPerfCountersCommandsEnd(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream);
|
||||
|
||||
static void dispatchWalker(
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
cl_uint numEventsInWaitList,
|
||||
const cl_event *eventWaitList,
|
||||
KernelOperation **blockedCommandsData,
|
||||
HwTimeStamps *hwTimeStamps,
|
||||
OCLRT::HwPerfCounter *hwPerfCounter,
|
||||
PreemptionMode preemptionMode,
|
||||
bool blockQueue,
|
||||
unsigned int commandType = 0);
|
||||
|
||||
static void dispatchWalker(
|
||||
CommandQueue &commandQueue,
|
||||
const Kernel &kernel,
|
||||
cl_uint workDim,
|
||||
const size_t globalOffsets[3],
|
||||
const size_t workItems[3],
|
||||
const size_t *localWorkSizesIn,
|
||||
cl_uint numEventsInWaitList,
|
||||
const cl_event *eventWaitList,
|
||||
KernelOperation **blockedCommandsData,
|
||||
HwTimeStamps *hwTimeStamps,
|
||||
HwPerfCounter *hwPerfCounter,
|
||||
PreemptionMode preemptionMode,
|
||||
bool blockQueue);
|
||||
|
||||
static void dispatchScheduler(
|
||||
CommandQueue &commandQueue,
|
||||
DeviceQueueHw<GfxFamily> &devQueueHw,
|
||||
PreemptionMode preemptionMode,
|
||||
SchedulerKernel &scheduler);
|
||||
};
|
||||
|
||||
template <typename GfxFamily, uint32_t eventType>
|
||||
struct EnqueueOperation {
|
||||
static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
|
||||
static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
|
||||
static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");
|
||||
static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
|
||||
size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
|
||||
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
|
||||
if (reserveProfilingCmdsSpace) {
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
if (reservePerfCounters) {
|
||||
//start cmds
|
||||
//P_C: flush CS & TimeStamp BEGIN
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//gp registers
|
||||
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
|
||||
//end cmds
|
||||
//P_C: flush CS & TimeStamp END;
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
|
||||
//OA buffer (status head, tail)
|
||||
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//gp registers
|
||||
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
Device &device = commandQueue.getDevice();
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
auto &kernel = *dispatchInfo.getKernel();
|
||||
size += sizeof(typename GfxFamily::GPGPU_WALKER);
|
||||
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(&kernel);
|
||||
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(device);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
|
||||
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
|
||||
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
|
||||
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
|
||||
if (reserveProfilingCmdsSpace) {
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
if (reservePerfCounters) {
|
||||
//start cmds
|
||||
//P_C: flush CS & TimeStamp BEGIN
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//gp registers
|
||||
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
|
||||
//end cmds
|
||||
//P_C: flush CS & TimeStamp END;
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
|
||||
//OA buffer (status head, tail)
|
||||
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//gp registers
|
||||
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
|
||||
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GfxFamily, uint32_t eventType>
|
||||
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
|
||||
auto expectedSizeCS = EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel);
|
||||
return commandQueue.getCS(expectedSizeCS);
|
||||
}
|
||||
|
||||
template <typename GfxFamily, uint32_t eventType>
|
||||
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
|
||||
size_t expectedSizeCS = 0;
|
||||
Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
|
||||
}
|
||||
if (parentKernel && parentKernel->isParentKernel) {
|
||||
SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(parentKernel->getContext());
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler);
|
||||
}
|
||||
return commandQueue.getCS(expectedSizeCS);
|
||||
}
|
||||
|
||||
template <typename GfxFamily, IndirectHeap::Type heapType>
|
||||
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
|
||||
size_t expectedSize = 0;
|
||||
IndirectHeap *ih = nullptr;
|
||||
|
||||
// clang-format off
|
||||
switch (heapType) {
|
||||
case IndirectHeap::DYNAMIC_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
|
||||
case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
|
||||
case IndirectHeap::SURFACE_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
if (multiDispatchInfo.begin()->getKernel()->isParentKernel) {
|
||||
if (heapType == IndirectHeap::SURFACE_STATE) {
|
||||
expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
|
||||
} else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
|
||||
{
|
||||
DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
|
||||
DEBUG_BREAK_IF(pDevQueue == nullptr);
|
||||
ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
|
||||
}
|
||||
}
|
||||
|
||||
if (ih == nullptr)
|
||||
ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
|
||||
|
||||
return *ih;
|
||||
}
|
||||
|
||||
} // namespace OCLRT
|
|
@ -21,24 +21,17 @@
|
|||
*/
|
||||
|
||||
#pragma once
|
||||
#include "runtime/context/context.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_queue/local_id_gen.h"
|
||||
#include "runtime/command_queue/command_queue.h"
|
||||
#include "runtime/command_queue/dispatch_walker_helper.h"
|
||||
#include "runtime/command_stream/command_stream_receiver.h"
|
||||
#include "runtime/command_stream/preemption.h"
|
||||
#include "runtime/device/device_info.h"
|
||||
#include "runtime/device_queue/device_queue_hw.h"
|
||||
#include "runtime/event/perf_counter.h"
|
||||
#include "runtime/event/user_event.h"
|
||||
#include "runtime/indirect_heap/indirect_heap.h"
|
||||
#include "runtime/helpers/aligned_memory.h"
|
||||
#include "runtime/helpers/debug_helpers.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
#include "runtime/helpers/task_information.h"
|
||||
#include "runtime/helpers/validators.h"
|
||||
#include "runtime/helpers/dispatch_info.h"
|
||||
#include "runtime/kernel/kernel.h"
|
||||
#include "runtime/mem_obj/mem_obj.h"
|
||||
#include "runtime/memory_manager/graphics_allocation.h"
|
||||
#include <algorithm>
|
||||
|
@ -46,57 +39,81 @@
|
|||
|
||||
namespace OCLRT {
|
||||
|
||||
void computeWorkgroupSize1D(
|
||||
uint32_t maxWorkGroupSize,
|
||||
size_t workGroupSize[3],
|
||||
const size_t workItems[3],
|
||||
size_t simdSize);
|
||||
// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
|
||||
OCLRT::LinearStream *pCommandStream,
|
||||
uint32_t aluRegister,
|
||||
uint32_t operation,
|
||||
uint32_t mask) {
|
||||
// Load "Register" value into CS_GPR_R0
|
||||
typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename GfxFamily::MI_MATH MI_MATH;
|
||||
typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
auto pCmd = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
|
||||
*pCmd = MI_LOAD_REGISTER_REG::sInit();
|
||||
pCmd->setSourceRegisterAddress(aluRegister);
|
||||
pCmd->setDestinationRegisterAddress(CS_GPR_R0);
|
||||
|
||||
void computeWorkgroupSizeND(
|
||||
WorkSizeInfo wsInfo,
|
||||
size_t workGroupSize[3],
|
||||
const size_t workItems[3],
|
||||
const uint32_t workDim);
|
||||
// Load "Mask" into CS_GPR_R1
|
||||
typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
auto pCmd2 = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM)));
|
||||
*pCmd2 = MI_LOAD_REGISTER_IMM::sInit();
|
||||
pCmd2->setRegisterOffset(CS_GPR_R1);
|
||||
pCmd2->setDataDword(mask);
|
||||
|
||||
void computeWorkgroupSize2D(
|
||||
uint32_t maxWorkGroupSize,
|
||||
size_t workGroupSize[3],
|
||||
const size_t workItems[3],
|
||||
size_t simdSize);
|
||||
// Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands
|
||||
auto pCmd3 = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.Value = 0x0;
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
|
||||
// 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
|
||||
pCmd3++;
|
||||
MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(pCmd3);
|
||||
|
||||
void computeWorkgroupSizeSquared(
|
||||
uint32_t maxWorkGroupSize,
|
||||
size_t workGroupSize[3],
|
||||
const size_t workItems[3],
|
||||
size_t simdSize,
|
||||
const uint32_t workDim);
|
||||
// Setup first operand of MI_MATH - load CS_GPR_R0 into register A
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0;
|
||||
pAluParam++;
|
||||
|
||||
Vec3<size_t> computeWorkgroupSize(
|
||||
const DispatchInfo &dispatchInfo);
|
||||
// Setup second operand of MI_MATH - load CS_GPR_R1 into register B
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1;
|
||||
pAluParam++;
|
||||
|
||||
Vec3<size_t> generateWorkgroupSize(
|
||||
const DispatchInfo &dispatchInfo);
|
||||
// Setup third operand of MI_MATH - "Operation" on registers A and B
|
||||
pAluParam->DW0.BitField.ALUOpcode = operation;
|
||||
pAluParam->DW0.BitField.Operand1 = 0;
|
||||
pAluParam->DW0.BitField.Operand2 = 0;
|
||||
pAluParam++;
|
||||
|
||||
Vec3<size_t> computeWorkgroupsNumber(
|
||||
const Vec3<size_t> gws,
|
||||
const Vec3<size_t> lws);
|
||||
// Setup fourth operand of MI_MATH - store result into CS_GPR_R0
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;
|
||||
|
||||
Vec3<size_t> generateWorkgroupsNumber(
|
||||
const Vec3<size_t> gws,
|
||||
const Vec3<size_t> lws);
|
||||
// LOAD value of CS_GPR_R0 into "Register"
|
||||
auto pCmd4 = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
|
||||
*pCmd4 = MI_LOAD_REGISTER_REG::sInit();
|
||||
pCmd4->setSourceRegisterAddress(CS_GPR_R0);
|
||||
pCmd4->setDestinationRegisterAddress(aluRegister);
|
||||
|
||||
Vec3<size_t> generateWorkgroupsNumber(
|
||||
const DispatchInfo &dispatchInfo);
|
||||
|
||||
Vec3<size_t> canonizeWorkgroup(
|
||||
Vec3<size_t> workgroup);
|
||||
|
||||
inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
|
||||
return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()));
|
||||
// Add PIPE_CONTROL to flush caches
|
||||
typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
|
||||
auto pCmd5 = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pCmd5 = PIPE_CONTROL::sInit();
|
||||
pCmd5->setCommandStreamerStallEnable(true);
|
||||
pCmd5->setDcFlushEnable(true);
|
||||
pCmd5->setTextureCacheInvalidationEnable(true);
|
||||
pCmd5->setPipeControlFlushEnable(true);
|
||||
pCmd5->setStateCacheInvalidationEnable(true);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline size_t setGpgpuWalkerThreadData(
|
||||
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
typename GfxFamily::GPGPU_WALKER *pCmd,
|
||||
const size_t globalOffsets[3],
|
||||
const size_t startWorkGroups[3],
|
||||
|
@ -132,21 +149,8 @@ inline size_t setGpgpuWalkerThreadData(
|
|||
return localWorkSize;
|
||||
}
|
||||
|
||||
// Returns how many dimensions a work-item range actually uses:
// 3 when the third extent is greater than one, otherwise 2 when the second
// extent is greater than one, otherwise 1. The first extent is never inspected.
inline cl_uint computeDimensions(const size_t workItems[3]) {
    if (workItems[2] > 1) {
        return 3;
    }
    if (workItems[1] > 1) {
        return 2;
    }
    return 1;
}
|
||||
|
||||
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
|
||||
|
||||
template <typename SizeAndAllocCalcT, typename... CalcArgsT>
|
||||
IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
|
||||
size_t alignment = MemoryConstants::pageSize;
|
||||
size_t size = calc(std::forward<CalcArgsT>(args)...);
|
||||
return new IndirectHeap(alignedMalloc(size, alignment), size);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchProfilingCommandsStart(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
|
||||
HwTimeStamps &hwTimeStamps,
|
||||
OCLRT::LinearStream *commandStream) {
|
||||
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
|
||||
|
@ -173,7 +177,7 @@ void dispatchProfilingCommandsStart(
|
|||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchProfilingCommandsEnd(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
|
||||
HwTimeStamps &hwTimeStamps,
|
||||
OCLRT::LinearStream *commandStream) {
|
||||
|
||||
|
@ -196,7 +200,7 @@ void dispatchProfilingCommandsEnd(
|
|||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchPerfCountersNoopidRegisterCommands(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream,
|
||||
|
@ -214,7 +218,7 @@ void dispatchPerfCountersNoopidRegisterCommands(
|
|||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchPerfCountersReadFreqRegisterCommands(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream,
|
||||
|
@ -232,7 +236,7 @@ void dispatchPerfCountersReadFreqRegisterCommands(
|
|||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchPerfCountersGeneralPurposeCounterCommands(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream,
|
||||
|
@ -256,7 +260,7 @@ void dispatchPerfCountersGeneralPurposeCounterCommands(
|
|||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchPerfCountersUserCounterCommands(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream,
|
||||
|
@ -297,7 +301,7 @@ void dispatchPerfCountersUserCounterCommands(
|
|||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchPerfCountersOABufferStateCommands(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream) {
|
||||
|
@ -328,7 +332,7 @@ void dispatchPerfCountersOABufferStateCommands(
|
|||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchPerfCountersCommandsStart(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream) {
|
||||
|
@ -347,12 +351,12 @@ void dispatchPerfCountersCommandsStart(
|
|||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
|
||||
//Store value of NOOPID register
|
||||
dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, true);
|
||||
|
||||
//Read Core Frequency
|
||||
dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, true);
|
||||
|
||||
dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, true);
|
||||
|
||||
auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT));
|
||||
*pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
|
||||
|
@ -369,13 +373,13 @@ void dispatchPerfCountersCommandsStart(
|
|||
pPipeControlCmd->setAddress(static_cast<uint32_t>(address & ((uint64_t)UINT32_MAX)));
|
||||
pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
|
||||
|
||||
dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, true);
|
||||
|
||||
commandQueue.sendPerfCountersConfig();
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchPerfCountersCommandsEnd(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
|
||||
CommandQueue &commandQueue,
|
||||
OCLRT::HwPerfCounter &hwPerfCounter,
|
||||
OCLRT::LinearStream *commandStream) {
|
||||
|
@ -394,7 +398,7 @@ void dispatchPerfCountersCommandsEnd(
|
|||
*pPipeControlCmd = PIPE_CONTROL::sInit();
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
|
||||
dispatchPerfCountersOABufferStateCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(commandQueue, hwPerfCounter, commandStream);
|
||||
|
||||
//Timestamp: Global End
|
||||
pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
|
||||
|
@ -411,21 +415,21 @@ void dispatchPerfCountersCommandsEnd(
|
|||
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Oa));
|
||||
pReportPerfCount->setMemoryAddress(address);
|
||||
|
||||
dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, false);
|
||||
|
||||
//Store value of NOOPID register
|
||||
dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, false);
|
||||
|
||||
//Read Core Frequency
|
||||
dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, false);
|
||||
|
||||
dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, false);
|
||||
|
||||
perfCounters->setCpuTimestamp();
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchWalker(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
cl_uint numEventsInWaitList,
|
||||
|
@ -435,7 +439,7 @@ void dispatchWalker(
|
|||
OCLRT::HwPerfCounter *hwPerfCounter,
|
||||
PreemptionMode preemptionMode,
|
||||
bool blockQueue,
|
||||
unsigned int commandType = 0) {
|
||||
unsigned int commandType) {
|
||||
|
||||
OCLRT::LinearStream *commandStream = nullptr;
|
||||
OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
|
||||
|
@ -586,17 +590,17 @@ void dispatchWalker(
|
|||
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
|
||||
// If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
|
||||
if (hwTimeStamps != nullptr) {
|
||||
dispatchProfilingCommandsStart<GfxFamily>(*hwTimeStamps, commandStream);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
|
||||
}
|
||||
if (hwPerfCounter != nullptr) {
|
||||
dispatchPerfCountersCommandsStart<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
|
||||
}
|
||||
}
|
||||
|
||||
PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
|
||||
|
||||
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
|
||||
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, true);
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, true);
|
||||
|
||||
// Program the walker. Invokes execution so all state should already be programmed
|
||||
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
|
||||
|
@ -606,7 +610,7 @@ void dispatchWalker(
|
|||
size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
|
||||
size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
|
||||
size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
|
||||
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
|
||||
auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
|
||||
|
||||
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
|
||||
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
|
||||
|
@ -627,22 +631,22 @@ void dispatchWalker(
|
|||
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
|
||||
|
||||
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
|
||||
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, false);
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, false);
|
||||
|
||||
PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
|
||||
}
|
||||
|
||||
// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
|
||||
if (hwTimeStamps != nullptr) {
|
||||
dispatchProfilingCommandsEnd<GfxFamily>(*hwTimeStamps, commandStream);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
|
||||
}
|
||||
if (hwPerfCounter != nullptr) {
|
||||
dispatchPerfCountersCommandsEnd<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchWalker(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
|
||||
CommandQueue &commandQueue,
|
||||
const Kernel &kernel,
|
||||
cl_uint workDim,
|
||||
|
@ -658,12 +662,12 @@ void dispatchWalker(
|
|||
bool blockQueue) {
|
||||
|
||||
DispatchInfo dispatchInfo(const_cast<Kernel *>(&kernel), workDim, workItems, localWorkSizesIn, globalOffsets);
|
||||
dispatchWalker<GfxFamily>(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList,
|
||||
blockedCommandsData, hwTimeStamps, hwPerfCounter, preemptionMode, blockQueue);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchWalker(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList,
|
||||
blockedCommandsData, hwTimeStamps, hwPerfCounter, preemptionMode, blockQueue);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void dispatchScheduler(
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
|
||||
CommandQueue &commandQueue,
|
||||
DeviceQueueHw<GfxFamily> &devQueueHw,
|
||||
PreemptionMode preemptionMode,
|
||||
|
@ -752,7 +756,7 @@ void dispatchScheduler(
|
|||
preemptionMode);
|
||||
|
||||
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
|
||||
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, true);
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true);
|
||||
|
||||
// Program the walker. Invokes execution so all state should already be programmed
|
||||
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
|
||||
|
@ -760,7 +764,7 @@ void dispatchScheduler(
|
|||
|
||||
size_t globalOffsets[3] = {0, 0, 0};
|
||||
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
|
||||
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
|
||||
auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
|
||||
|
||||
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
|
||||
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
|
||||
|
@ -781,7 +785,7 @@ void dispatchScheduler(
|
|||
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
|
||||
|
||||
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
|
||||
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, false);
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);
|
||||
|
||||
// Do not put BB_START only when returning in first Scheduler run
|
||||
if (devQueueHw.getSchedulerReturnInstance() != 1) {
|
||||
|
@ -797,141 +801,13 @@ void dispatchScheduler(
|
|||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily, unsigned int eventType>
|
||||
struct EnqueueOperation {
|
||||
static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
|
||||
static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
|
||||
static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");
|
||||
static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
|
||||
size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
|
||||
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
|
||||
if (reserveProfilingCmdsSpace) {
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
if (reservePerfCounters) {
|
||||
//start cmds
|
||||
//P_C: flush CS & TimeStamp BEGIN
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//gp registers
|
||||
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
|
||||
//end cmds
|
||||
//P_C: flush CS & TimeStamp END;
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
|
||||
//OA buffer (status head, tail)
|
||||
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//gp registers
|
||||
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
Device &device = commandQueue.getDevice();
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
auto &kernel = *dispatchInfo.getKernel();
|
||||
size += sizeof(typename GfxFamily::GPGPU_WALKER);
|
||||
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(&kernel);
|
||||
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(device);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
|
||||
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
|
||||
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
|
||||
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
|
||||
if (reserveProfilingCmdsSpace) {
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
if (reservePerfCounters) {
|
||||
//start cmds
|
||||
//P_C: flush CS & TimeStamp BEGIN
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//gp registers
|
||||
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
|
||||
//end cmds
|
||||
//P_C: flush CS & TimeStamp END;
|
||||
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
|
||||
//OA buffer (status head, tail)
|
||||
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//gp registers
|
||||
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
|
||||
|
||||
return size;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GfxFamily, unsigned int eventType>
|
||||
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
|
||||
auto expectedSizeCS = EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel);
|
||||
return commandQueue.getCS(expectedSizeCS);
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
}
|
||||
|
||||
template <typename GfxFamily, unsigned int eventType>
|
||||
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
|
||||
size_t expectedSizeCS = 0;
|
||||
Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
|
||||
}
|
||||
if (parentKernel && parentKernel->isParentKernel) {
|
||||
SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(parentKernel->getContext());
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler);
|
||||
}
|
||||
return commandQueue.getCS(expectedSizeCS);
|
||||
template <typename GfxFamily>
|
||||
size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
|
||||
return (size_t)0;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, IndirectHeap::Type heapType>
|
||||
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
|
||||
size_t expectedSize = 0;
|
||||
IndirectHeap *ih = nullptr;
|
||||
|
||||
// clang-format off
|
||||
switch(heapType) {
|
||||
case IndirectHeap::DYNAMIC_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
|
||||
case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
|
||||
case IndirectHeap::SURFACE_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
if (multiDispatchInfo.begin()->getKernel()->isParentKernel) {
|
||||
if (heapType == IndirectHeap::SURFACE_STATE) {
|
||||
expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
|
||||
} else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
|
||||
{
|
||||
DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
|
||||
DEBUG_BREAK_IF(pDevQueue == nullptr);
|
||||
ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
|
||||
}
|
||||
}
|
||||
|
||||
if (ih == nullptr)
|
||||
ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
|
||||
|
||||
return *ih;
|
||||
}
|
||||
} // namespace OCLRT
|
|
@ -32,7 +32,7 @@
|
|||
#include "runtime/memory_manager/memory_manager.h"
|
||||
#include "runtime/os_interface/debug_settings_manager.h"
|
||||
#include "runtime/command_stream/preemption.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "command_stream_receiver_hw.h"
|
||||
|
||||
namespace OCLRT {
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
#include "runtime/built_ins/sip.h"
|
||||
#include "runtime/command_stream/preemption.h"
|
||||
#include "runtime/device/device.h"
|
||||
#include "runtime/command_queue/dispatch_walker_helper.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/memory_manager/graphics_allocation.h"
|
||||
|
||||
namespace OCLRT {
|
||||
|
|
|
@ -22,8 +22,7 @@
|
|||
|
||||
#pragma once
|
||||
#include "runtime/device_queue/device_queue_hw.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/dispatch_walker_helper.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
#include "runtime/helpers/preamble.h"
|
||||
#include "runtime/helpers/string.h"
|
||||
|
@ -217,7 +216,7 @@ void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKer
|
|||
offset = slbCS.getUsed();
|
||||
|
||||
igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed());
|
||||
applyWADisableLSQCROPERFforOCL<GfxFamily>(&slbCS, *parentKernel, true);
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true);
|
||||
|
||||
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
|
||||
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
||||
|
@ -388,10 +387,10 @@ size_t DeviceQueueHw<GfxFamily>::setSchedulerCrossThreadData(SchedulerKernel &sc
|
|||
|
||||
template <typename GfxFamily>
|
||||
void DeviceQueueHw<GfxFamily>::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) {
|
||||
OCLRT::dispatchScheduler<GfxFamily>(cmdQ,
|
||||
*this,
|
||||
preemptionMode,
|
||||
scheduler);
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(cmdQ,
|
||||
*this,
|
||||
preemptionMode,
|
||||
scheduler);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -37,6 +37,7 @@ set(RUNTIME_SRCS_GENX_BASE
|
|||
device_enqueue.h
|
||||
device_queue.cpp
|
||||
command_stream_receiver_hw.cpp
|
||||
gpgpu_walker.cpp
|
||||
hw_cmds.h
|
||||
hw_cmds_generated.h
|
||||
hw_helper.cpp
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -23,8 +23,6 @@
|
|||
#include "runtime/memory_manager/svm_memory_manager.h"
|
||||
#include "runtime/command_queue/command_queue_hw.h"
|
||||
#include "runtime/command_queue/command_queue_hw.inl"
|
||||
#include "runtime/command_queue/dispatch_walker_helper.h"
|
||||
#include "runtime/command_queue/dispatch_walker_helper.inl"
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
|
@ -37,43 +35,4 @@ void populateFactoryTable<CommandQueueHw<Family>>() {
|
|||
commandQueueFactory[gfxCore] = CommandQueueHw<Family>::create;
|
||||
}
|
||||
|
||||
// Writes the WADisableLSQCROPERFforOCL workaround commands into pCommandStream.
// Nothing is written unless the kernel's execution environment sets
// UsesFencesForReadWriteImages.
//   disablePerfMode == true : ALU read-modify-write of L3SQC_REG4 using
//                             ALU_OPCODE_OR with L3SQC_BIT_LQSC_RO_PERF_DIS.
//   disablePerfMode == false: a PIPE_CONTROL with command-streamer stall enabled,
//                             followed by an ALU read-modify-write of L3SQC_REG4
//                             using ALU_OPCODE_AND with the complement of that bit.
template <>
|
||||
void applyWADisableLSQCROPERFforOCL<Family>(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
if (disablePerfMode) {
|
||||
if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
// Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
} else {
|
||||
if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
// Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
|
||||
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
// The pointer returned by getSpace() is filled in place as the PIPE_CONTROL command.
auto pCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pCmd = PIPE_CONTROL::sInit();
|
||||
pCmd->setCommandStreamerStallEnable(true);
|
||||
// Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the number of bytes to budget for the WADisableLSQCROPERFforOCL
// workaround commands for pKernel.
// The result is 0 when pKernel is null or its execution environment does not set
// UsesFencesForReadWriteImages. Otherwise it is one PIPE_CONTROL plus twice the
// size of: 2x MI_LOAD_REGISTER_REG, MI_LOAD_REGISTER_IMM, PIPE_CONTROL, MI_MATH
// and NUM_ALU_INST_FOR_READ_MODIFY_WRITE ALU instructions.
// NOTE(review): the doubled group is presumably the footprint of one
// addAluReadModifyWriteRegister call — confirm against that helper.
template <>
|
||||
size_t getSizeForWADisableLSQCROPERFforOCL<Family>(const Kernel *pKernel) {
|
||||
typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename Family::MI_MATH MI_MATH;
|
||||
typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
size_t n = 0;
|
||||
// The null check comes first so the kernel is only dereferenced when present.
if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
n += sizeof(PIPE_CONTROL) +
|
||||
(2 * sizeof(MI_LOAD_REGISTER_REG) +
|
||||
sizeof(MI_LOAD_REGISTER_IMM) +
|
||||
sizeof(PIPE_CONTROL) +
|
||||
sizeof(MI_MATH) +
|
||||
NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) *
|
||||
2; // For 2 WADisableLSQCROPERFforOCL WAs
|
||||
}
|
||||
return n;
|
||||
}
|
||||
} // namespace OCLRT
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Copyright (c) 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "runtime/gen8/hw_info.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.inl"
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
// BDWFamily specialization: writes the WADisableLSQCROPERFforOCL workaround
// commands into pCommandStream.
// Nothing is written unless the kernel's execution environment sets
// UsesFencesForReadWriteImages.
//   disablePerfMode == true : ALU read-modify-write of L3SQC_REG4 using
//                             ALU_OPCODE_OR with L3SQC_BIT_LQSC_RO_PERF_DIS.
//   disablePerfMode == false: a PIPE_CONTROL with command-streamer stall enabled,
//                             followed by an ALU read-modify-write of L3SQC_REG4
//                             using ALU_OPCODE_AND with the complement of that bit.
template <>
|
||||
void GpgpuWalkerHelper<BDWFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
if (disablePerfMode) {
|
||||
if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
// Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
GpgpuWalkerHelper<BDWFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
} else {
|
||||
if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
// Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
|
||||
typedef typename BDWFamily::PIPE_CONTROL PIPE_CONTROL;
|
||||
// The pointer returned by getSpace() is filled in place as the PIPE_CONTROL command.
auto pCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pCmd = PIPE_CONTROL::sInit();
|
||||
pCmd->setCommandStreamerStallEnable(true);
|
||||
// Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
GpgpuWalkerHelper<BDWFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BDWFamily specialization: returns the number of bytes to budget for the
// WADisableLSQCROPERFforOCL workaround commands for pKernel.
// The result is 0 when pKernel is null or its execution environment does not set
// UsesFencesForReadWriteImages. Otherwise it is one PIPE_CONTROL plus twice the
// size of: 2x MI_LOAD_REGISTER_REG, MI_LOAD_REGISTER_IMM, PIPE_CONTROL, MI_MATH
// and NUM_ALU_INST_FOR_READ_MODIFY_WRITE ALU instructions.
// NOTE(review): the doubled group is presumably the footprint of one
// addAluReadModifyWriteRegister call — confirm against that helper.
template <>
|
||||
size_t GpgpuWalkerHelper<BDWFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
|
||||
typedef typename BDWFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename BDWFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
typedef typename BDWFamily::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename BDWFamily::MI_MATH MI_MATH;
|
||||
typedef typename BDWFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
size_t n = 0;
|
||||
// The null check comes first so the kernel is only dereferenced when present.
if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
n += sizeof(PIPE_CONTROL) +
|
||||
(2 * sizeof(MI_LOAD_REGISTER_REG) +
|
||||
sizeof(MI_LOAD_REGISTER_IMM) +
|
||||
sizeof(PIPE_CONTROL) +
|
||||
sizeof(MI_MATH) +
|
||||
NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) *
|
||||
2; // For 2 WADisableLSQCROPERFforOCL WAs
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
template class GpgpuWalkerHelper<BDWFamily>;
|
||||
|
||||
} // namespace OCLRT
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -23,8 +23,6 @@
|
|||
#include "runtime/memory_manager/svm_memory_manager.h"
|
||||
#include "runtime/command_queue/command_queue_hw.h"
|
||||
#include "runtime/command_queue/command_queue_hw.inl"
|
||||
#include "runtime/command_queue/dispatch_walker_helper.h"
|
||||
#include "runtime/command_queue/dispatch_walker_helper.inl"
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
|
@ -37,43 +35,4 @@ void populateFactoryTable<CommandQueueHw<Family>>() {
|
|||
commandQueueFactory[gfxCore] = CommandQueueHw<Family>::create;
|
||||
}
|
||||
|
||||
// Writes the WADisableLSQCROPERFforOCL workaround commands into pCommandStream.
// Nothing is written unless the kernel's execution environment sets
// UsesFencesForReadWriteImages.
//   disablePerfMode == true : ALU read-modify-write of L3SQC_REG4 using
//                             ALU_OPCODE_OR with L3SQC_BIT_LQSC_RO_PERF_DIS.
//   disablePerfMode == false: a PIPE_CONTROL with command-streamer stall enabled,
//                             followed by an ALU read-modify-write of L3SQC_REG4
//                             using ALU_OPCODE_AND with the complement of that bit.
template <>
|
||||
void applyWADisableLSQCROPERFforOCL<Family>(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
if (disablePerfMode) {
|
||||
if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
// Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
} else {
|
||||
if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
// Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
|
||||
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
// The pointer returned by getSpace() is filled in place as the PIPE_CONTROL command.
auto pCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pCmd = PIPE_CONTROL::sInit();
|
||||
pCmd->setCommandStreamerStallEnable(true);
|
||||
// Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the number of bytes to budget for the WADisableLSQCROPERFforOCL
// workaround commands for pKernel.
// The result is 0 when pKernel is null or its execution environment does not set
// UsesFencesForReadWriteImages. Otherwise it is one PIPE_CONTROL plus twice the
// size of: 2x MI_LOAD_REGISTER_REG, MI_LOAD_REGISTER_IMM, PIPE_CONTROL, MI_MATH
// and NUM_ALU_INST_FOR_READ_MODIFY_WRITE ALU instructions.
// NOTE(review): the doubled group is presumably the footprint of one
// addAluReadModifyWriteRegister call — confirm against that helper.
template <>
|
||||
size_t getSizeForWADisableLSQCROPERFforOCL<Family>(const Kernel *pKernel) {
|
||||
typedef typename Family::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
typedef typename Family::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename Family::MI_MATH MI_MATH;
|
||||
typedef typename Family::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
size_t n = 0;
|
||||
// The null check comes first so the kernel is only dereferenced when present.
if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
n += sizeof(PIPE_CONTROL) +
|
||||
(2 * sizeof(MI_LOAD_REGISTER_REG) +
|
||||
sizeof(MI_LOAD_REGISTER_IMM) +
|
||||
sizeof(PIPE_CONTROL) +
|
||||
sizeof(MI_MATH) +
|
||||
NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) *
|
||||
2; // For 2 WADisableLSQCROPERFforOCL WAs
|
||||
}
|
||||
return n;
|
||||
}
|
||||
} // namespace OCLRT
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Copyright (c) 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "runtime/gen9/hw_cmds_base.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.inl"
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
// SKLFamily specialization: writes the WADisableLSQCROPERFforOCL workaround
// commands into pCommandStream.
// Nothing is written unless the kernel's execution environment sets
// UsesFencesForReadWriteImages.
//   disablePerfMode == true : ALU read-modify-write of L3SQC_REG4 using
//                             ALU_OPCODE_OR with L3SQC_BIT_LQSC_RO_PERF_DIS.
//   disablePerfMode == false: a PIPE_CONTROL with command-streamer stall enabled,
//                             followed by an ALU read-modify-write of L3SQC_REG4
//                             using ALU_OPCODE_AND with the complement of that bit.
template <>
|
||||
void GpgpuWalkerHelper<SKLFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
if (disablePerfMode) {
|
||||
if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
// Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
GpgpuWalkerHelper<SKLFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
} else {
|
||||
if (kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
// Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
|
||||
typedef typename SKLFamily::PIPE_CONTROL PIPE_CONTROL;
|
||||
// The pointer returned by getSpace() is filled in place as the PIPE_CONTROL command.
auto pCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pCmd = PIPE_CONTROL::sInit();
|
||||
pCmd->setCommandStreamerStallEnable(true);
|
||||
// Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
|
||||
GpgpuWalkerHelper<SKLFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SKLFamily specialization: returns the number of bytes to budget for the
// WADisableLSQCROPERFforOCL workaround commands for pKernel.
// The result is 0 when pKernel is null or its execution environment does not set
// UsesFencesForReadWriteImages. Otherwise it is one PIPE_CONTROL plus twice the
// size of: 2x MI_LOAD_REGISTER_REG, MI_LOAD_REGISTER_IMM, PIPE_CONTROL, MI_MATH
// and NUM_ALU_INST_FOR_READ_MODIFY_WRITE ALU instructions.
// NOTE(review): the doubled group is presumably the footprint of one
// addAluReadModifyWriteRegister call — confirm against that helper.
template <>
|
||||
size_t GpgpuWalkerHelper<SKLFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
|
||||
typedef typename SKLFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename SKLFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
typedef typename SKLFamily::PIPE_CONTROL PIPE_CONTROL;
|
||||
typedef typename SKLFamily::MI_MATH MI_MATH;
|
||||
typedef typename SKLFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
size_t n = 0;
|
||||
// The null check comes first so the kernel is only dereferenced when present.
if ((pKernel != nullptr) && pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
n += sizeof(PIPE_CONTROL) +
|
||||
(2 * sizeof(MI_LOAD_REGISTER_REG) +
|
||||
sizeof(MI_LOAD_REGISTER_IMM) +
|
||||
sizeof(PIPE_CONTROL) +
|
||||
sizeof(MI_MATH) +
|
||||
NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)) *
|
||||
2; // For 2 WADisableLSQCROPERFforOCL WAs
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
template class GpgpuWalkerHelper<SKLFamily>;
|
||||
|
||||
} // namespace OCLRT
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -24,7 +24,7 @@
|
|||
|
||||
#include "runtime/helpers/dispatch_info.h"
|
||||
#include "runtime/kernel/kernel.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
|
@ -67,7 +67,7 @@ enum class RegionCoordZ : uint32_t {
|
|||
Middle = 1,
|
||||
Back = 2
|
||||
};
|
||||
}
|
||||
} // namespace SplitDispatch
|
||||
|
||||
// Compute power in compile time
|
||||
static constexpr uint32_t powConst(uint32_t base, uint32_t currExp) {
|
||||
|
@ -453,4 +453,4 @@ class DispatchInfoBuilder {
|
|||
return x % y ? 1 : 0;
|
||||
}
|
||||
};
|
||||
}
|
||||
} // namespace OCLRT
|
||||
|
|
|
@ -21,13 +21,14 @@
|
|||
*/
|
||||
|
||||
#include "test.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/event/perf_counter.h"
|
||||
#include "runtime/helpers/aligned_memory.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
#include "runtime/helpers/task_information.h"
|
||||
#include "unit_tests/fixtures/device_fixture.h"
|
||||
#include "unit_tests/command_queue/command_queue_fixture.h"
|
||||
#include "unit_tests/libult/mock_gfx_family.h"
|
||||
#include "unit_tests/helpers/hw_parse.h"
|
||||
#include "unit_tests/helpers/debug_manager_state_restore.h"
|
||||
#include "unit_tests/mocks/mock_kernel.h"
|
||||
|
@ -137,7 +138,7 @@ HWTEST_F(DispatchWalkerTest, shouldntChangeCommandStreamMemory) {
|
|||
size_t globalOffsets[3] = {0, 0, 0};
|
||||
size_t workItems[3] = {1, 1, 1};
|
||||
cl_uint dimensions = 1;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -185,7 +186,7 @@ HWTEST_F(DispatchWalkerTest, noLocalIdsShouldntCrash) {
|
|||
size_t globalOffsets[3] = {0, 0, 0};
|
||||
size_t workItems[3] = {1, 1, 1};
|
||||
cl_uint dimensions = 1;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -214,7 +215,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithDefaultLwsAlgorithm)
|
|||
size_t workItems[3] = {1, 1, 1};
|
||||
for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
|
||||
workItems[dimension - 1] = 256;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimension,
|
||||
|
@ -244,7 +245,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithSquaredLwsAlgorithm)
|
|||
size_t workItems[3] = {1, 1, 1};
|
||||
for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
|
||||
workItems[dimension - 1] = 256;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimension,
|
||||
|
@ -273,7 +274,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithNDLwsAlgorithm) {
|
|||
size_t workItems[3] = {1, 1, 1};
|
||||
for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
|
||||
workItems[dimension - 1] = 256;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimension,
|
||||
|
@ -303,7 +304,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithOldLwsAlgorithm) {
|
|||
size_t workItems[3] = {1, 1, 1};
|
||||
for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
|
||||
workItems[dimension - 1] = 256;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimension,
|
||||
|
@ -332,7 +333,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNumWorkGroups) {
|
|||
size_t workItems[3] = {2, 5, 10};
|
||||
size_t workGroupSize[3] = {1, 1, 1};
|
||||
cl_uint dimensions = 3;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -363,7 +364,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeND) {
|
|||
size_t globalOffsets[3] = {0, 0, 0};
|
||||
size_t workItems[3] = {2, 5, 10};
|
||||
cl_uint dimensions = 3;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -394,7 +395,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeND) {
|
|||
size_t globalOffsets[3] = {0, 0, 0};
|
||||
size_t workItems[3] = {2, 5, 10};
|
||||
cl_uint dimensions = 3;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -426,7 +427,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeSquared) {
|
|||
size_t globalOffsets[3] = {0, 0, 0};
|
||||
size_t workItems[3] = {2, 5, 10};
|
||||
cl_uint dimensions = 3;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -458,7 +459,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeSquaredAn
|
|||
size_t globalOffsets[3] = {0, 0, 0};
|
||||
size_t workItems[3] = {2, 5, 10};
|
||||
cl_uint dimensions = 3;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -488,7 +489,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSize) {
|
|||
size_t workItems[3] = {2, 5, 10};
|
||||
size_t workGroupSize[3] = {1, 2, 3};
|
||||
cl_uint dimensions = 3;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -521,7 +522,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizes) {
|
|||
size_t workItems[3] = {2, 5, 10};
|
||||
size_t workGroupSize[3] = {1, 2, 3};
|
||||
cl_uint dimensions = 3;
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -561,7 +562,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizeForSplitKernel) {
|
|||
|
||||
MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
0,
|
||||
|
@ -604,7 +605,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizesForSplitWalker) {
|
|||
|
||||
MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
0,
|
||||
|
@ -646,7 +647,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerDoesntConsumeCommandStreamWhenQueueIs
|
|||
|
||||
KernelOperation *blockedCommandsData = nullptr;
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -686,7 +687,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromKernelW
|
|||
|
||||
KernelOperation *blockedCommandsData = nullptr;
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
kernel,
|
||||
dimensions,
|
||||
|
@ -727,7 +728,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromMdiWhen
|
|||
|
||||
KernelOperation *blockedCommandsData = nullptr;
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
0,
|
||||
|
@ -759,7 +760,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfo) {
|
|||
|
||||
MockMultiDispatchInfo multiDispatchInfo(std::vector<Kernel *>({&kernel1, &kernel2}));
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
0,
|
||||
|
@ -800,7 +801,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoCorrectlyProg
|
|||
indirectHeap.align(KernelCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
|
||||
auto dshBeforeMultiDisptach = indirectHeap.getUsed();
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
0,
|
||||
|
@ -884,7 +885,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoCorrectlyProg
|
|||
// create commandStream
|
||||
auto &cmdStream = pCmdQ->getCS(0);
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
0,
|
||||
|
@ -929,7 +930,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoAndDifferentK
|
|||
// create commandStream
|
||||
auto &cmdStream = pCmdQ->getCS(0);
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
0,
|
||||
|
@ -979,7 +980,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoButSameKernel
|
|||
// create commandStream
|
||||
auto &cmdStream = pCmdQ->getCS(0);
|
||||
|
||||
dispatchWalker<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
|
||||
*pCmdQ,
|
||||
multiDispatchInfo,
|
||||
0,
|
||||
|
@ -1030,7 +1031,7 @@ HWTEST_F(DispatchWalkerTest, givenMultiDispatchWhenWhitelistedRegisterForCoheren
|
|||
DispatchInfo di2(&kernel, 1, Vec3<size_t>(1, 1, 1), Vec3<size_t>(1, 1, 1), Vec3<size_t>(0, 0, 0));
|
||||
MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
|
||||
|
||||
dispatchWalker<FamilyType>(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false);
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false);
|
||||
|
||||
hwParser.parseCommands<FamilyType>(cmdStream, 0);
|
||||
|
||||
|
@ -1056,3 +1057,15 @@ TEST(DispatchWalker, calculateDispatchDim) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Exercises the WADisableLSQCROPERFforOCL helpers of GpgpuWalkerHelper<GENX>:
// the apply call is made with disablePerfMode == false on an initialized
// MockKernel, and the size query for that kernel is expected to return 0.
// NOTE(review): the test does not assert that cmdStream's used size is unchanged
// after applyWADisableLSQCROPERFforOCL — consider checking that as well.
HWTEST_F(DispatchWalkerTest, WhenCallingDefaultWaMethodsThenExpectNothing) {
|
||||
auto &cmdStream = pCmdQ->getCS(0);
|
||||
MockKernel kernel(&program, kernelInfo, *pDevice);
|
||||
EXPECT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
// Only checks that the call can be made; its effect on cmdStream is not inspected.
GpgpuWalkerHelper<GENX>::applyWADisableLSQCROPERFforOCL(&cmdStream, kernel, false);
|
||||
|
||||
size_t expectedSize = 0;
|
||||
size_t actualSize = GpgpuWalkerHelper<GENX>::getSizeForWADisableLSQCROPERFforOCL(&kernel);
|
||||
EXPECT_EQ(expectedSize, actualSize);
|
||||
}
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_queue/enqueue_fill_buffer.h"
|
||||
#include "runtime/command_queue/enqueue_kernel.h"
|
||||
#include "runtime/command_queue/enqueue_read_buffer.h"
|
||||
|
@ -43,8 +43,8 @@ struct GetSizeRequiredBufferTest : public CommandEnqueueFixture,
|
|||
public HelloWorldKernelFixture,
|
||||
public ::testing::Test {
|
||||
|
||||
using SimpleArgKernelFixture::SetUp;
|
||||
using HelloWorldKernelFixture::SetUp;
|
||||
using SimpleArgKernelFixture::SetUp;
|
||||
|
||||
GetSizeRequiredBufferTest() {
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@
|
|||
|
||||
#include "runtime/built_ins/built_ins.h"
|
||||
#include "runtime/command_queue/command_queue_hw.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_queue/enqueue_copy_image.h"
|
||||
#include "runtime/command_queue/enqueue_fill_image.h"
|
||||
#include "runtime/command_queue/enqueue_read_image.h"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -20,7 +20,7 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/helpers/options.h"
|
||||
#include "unit_tests/mocks/mock_kernel.h"
|
||||
#include "unit_tests/mocks/mock_device.h"
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
*/
|
||||
|
||||
#include "hw_cmds.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "unit_tests/fixtures/device_fixture.h"
|
||||
#include "unit_tests/helpers/debug_manager_state_restore.h"
|
||||
#include "test.h"
|
||||
|
@ -109,7 +109,7 @@ struct WorkGroupSizeBase : public DeviceFixture {
|
|||
(workItems[0] + workGroupSize[0] - 1) / workGroupSize[0],
|
||||
(workItems[1] + workGroupSize[1] - 1) / workGroupSize[1],
|
||||
(workItems[2] + workGroupSize[2] - 1) / workGroupSize[2]};
|
||||
setGpgpuWalkerThreadData<FamilyType>(&pCmd, globalOffsets, workGroupsStart, workGroupsNum, workGroupSize, simdSize);
|
||||
GpgpuWalkerHelper<FamilyType>::setGpgpuWalkerThreadData(&pCmd, globalOffsets, workGroupsStart, workGroupsNum, workGroupSize, simdSize);
|
||||
|
||||
//And check if it is programmed correctly
|
||||
auto numWorkItems = computeWalkerWorkItems<FamilyType>(pCmd);
|
||||
|
|
|
@ -52,7 +52,7 @@
|
|||
#include "gtest/gtest.h"
|
||||
#include "runtime/utilities/linux/debug_env_reader.h"
|
||||
#include "runtime/gmm_helper/gmm_helper.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
|
||||
using namespace OCLRT;
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
*/
|
||||
|
||||
#pragma once
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/context/context.h"
|
||||
#include "runtime/helpers/aligned_memory.h"
|
||||
#include "runtime/helpers/options.h"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -31,7 +31,7 @@
|
|||
#include "unit_tests/mocks/mock_kernel.h"
|
||||
#include "unit_tests/helpers/debug_manager_state_restore.h"
|
||||
|
||||
#include "runtime/command_queue/dispatch_walker_helper.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
|
||||
#include <memory>
|
||||
|
@ -330,7 +330,7 @@ HWTEST_F(DeviceQueueSlb, cleanupSection) {
|
|||
|
||||
if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
|
||||
|
||||
cleanupSectionOffsetToParse += getSizeForWADisableLSQCROPERFforOCL<FamilyType>(mockParentKernel) / 2;
|
||||
cleanupSectionOffsetToParse += GpgpuWalkerHelper<FamilyType>::getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) / 2;
|
||||
}
|
||||
|
||||
hwParser.parseCommands<FamilyType>(*slbCS, cleanupSectionOffsetToParse);
|
||||
|
@ -394,7 +394,7 @@ HWTEST_F(DeviceQueueSlb, AddEMCleanupSectionWithProfiling) {
|
|||
|
||||
auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
|
||||
|
||||
if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages && getSizeForWADisableLSQCROPERFforOCL<FamilyType>(mockParentKernel) > 0) {
|
||||
if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages && GpgpuWalkerHelper<FamilyType>::getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) > 0) {
|
||||
auto loadRegImmItor = find<MI_LOAD_REGISTER_IMM *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
|
||||
EXPECT_NE(hwParser.cmdList.end(), loadRegImmItor);
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -20,7 +20,7 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_queue/local_id_gen.h"
|
||||
#include "runtime/device_queue/device_queue_hw.h"
|
||||
#include "runtime/helpers/per_thread_data.h"
|
||||
|
|
|
@ -53,19 +53,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDev
|
|||
|
||||
size_t executionModelDSHUsedBefore = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE)->getUsed();
|
||||
|
||||
dispatchWalker<FamilyType>(*pCmdQ,
|
||||
*pKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false);
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
|
||||
*pKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false);
|
||||
|
||||
size_t dshUsedAfter = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE).getUsed();
|
||||
EXPECT_EQ(0u, dshUsedAfter);
|
||||
|
@ -109,19 +109,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDef
|
|||
|
||||
auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT);
|
||||
|
||||
dispatchWalker<FamilyType>(*pCmdQ,
|
||||
*pKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false);
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
|
||||
*pKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false);
|
||||
|
||||
auto iohUsed = ioh.getUsed();
|
||||
EXPECT_EQ(0u, iohUsed);
|
||||
|
@ -136,19 +136,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenSSH
|
|||
|
||||
MockMultiDispatchInfo multiDispatchInfo(pKernel);
|
||||
|
||||
dispatchWalker<FamilyType>(*pCmdQ,
|
||||
*pKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false);
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
|
||||
*pKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false);
|
||||
|
||||
auto &ssh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE);
|
||||
|
||||
|
@ -172,19 +172,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsBlockedThenSSHSiz
|
|||
|
||||
MockMultiDispatchInfo multiDispatchInfo(pKernel);
|
||||
|
||||
dispatchWalker<FamilyType>(*pCmdQ,
|
||||
*pKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
true); // blockQueue
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
|
||||
*pKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
true); // blockQueue
|
||||
ASSERT_NE(nullptr, blockedCommandsData);
|
||||
|
||||
size_t minRequiredSize = KernelCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
|
||||
|
@ -269,19 +269,19 @@ HWTEST_F(MockParentKernelDispatch, GivenBlockedQueueWhenParentKernelIsDispatched
|
|||
const size_t globalOffsets[3] = {0, 0, 0};
|
||||
const size_t workItems[3] = {1, 1, 1};
|
||||
|
||||
dispatchWalker<FamilyType>(*pCmdQ,
|
||||
*mockParentKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
true); // blockQueue
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
|
||||
*mockParentKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
true); // blockQueue
|
||||
|
||||
ASSERT_NE(nullptr, blockedCommandsData);
|
||||
|
||||
|
@ -302,19 +302,19 @@ HWTEST_F(MockParentKernelDispatch, GivenParentKernelWhenDispatchedThenMediaInter
|
|||
const size_t globalOffsets[3] = {0, 0, 0};
|
||||
const size_t workItems[3] = {1, 1, 1};
|
||||
|
||||
dispatchWalker<FamilyType>(*pCmdQ,
|
||||
*mockParentKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false); // blockQueue
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
|
||||
*mockParentKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false); // blockQueue
|
||||
|
||||
LinearStream *commandStream = &pCmdQ->getCS(0);
|
||||
|
||||
|
@ -358,19 +358,19 @@ HWTEST_F(MockParentKernelDispatch, GivenUsedSSHHeapWhenParentKernelIsDispatchedT
|
|||
// If parent is not using SSH, then heap obtained has zero usage and the same buffer
|
||||
ASSERT_EQ(0u, mockParentKernel->getKernelInfo().heapInfo.pKernelHeader->SurfaceStateHeapSize);
|
||||
|
||||
dispatchWalker<FamilyType>(*pCmdQ,
|
||||
*mockParentKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false); // blockQueue
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
|
||||
*mockParentKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false); // blockQueue
|
||||
|
||||
EXPECT_EQ(0u, ssh.getUsed());
|
||||
|
||||
|
@ -393,19 +393,19 @@ HWTEST_F(MockParentKernelDispatch, GivenNotUsedSSHHeapWhenParentKernelIsDispatch
|
|||
|
||||
auto *bufferMemory = ssh.getCpuBase();
|
||||
|
||||
dispatchWalker<FamilyType>(*pCmdQ,
|
||||
*mockParentKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false); // blockQueue
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
|
||||
*mockParentKernel,
|
||||
1,
|
||||
globalOffsets,
|
||||
workItems,
|
||||
nullptr,
|
||||
0,
|
||||
nullptr,
|
||||
&blockedCommandsData,
|
||||
nullptr,
|
||||
nullptr,
|
||||
pDevice->getPreemptionMode(),
|
||||
false); // blockQueue
|
||||
|
||||
EXPECT_EQ(bufferMemory, ssh.getCpuBase());
|
||||
|
||||
|
|
|
@ -72,7 +72,7 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchScheduler) {
|
|||
LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
|
||||
pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
|
||||
|
||||
dispatchScheduler<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
|
||||
*pCmdQ,
|
||||
*pDevQueueHw,
|
||||
pDevice->getPreemptionMode(),
|
||||
|
@ -188,7 +188,7 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchSchedulerDoesNotUseStandardCmdQ
|
|||
getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
|
||||
pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
|
||||
|
||||
dispatchScheduler<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
|
||||
*pCmdQ,
|
||||
*pDevQueueHw,
|
||||
pDevice->getPreemptionMode(),
|
||||
|
@ -219,7 +219,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, dispatchSchedulerWithEarlyReturnSetToF
|
|||
LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
|
||||
pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
|
||||
|
||||
dispatchScheduler<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
|
||||
*pCmdQ,
|
||||
mockDevQueue,
|
||||
device->getPreemptionMode(),
|
||||
|
|
|
@ -22,7 +22,7 @@
|
|||
|
||||
#include "runtime/built_ins/built_ins.h"
|
||||
#include "runtime/command_queue/enqueue_kernel.h"
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/device_queue/device_queue.h"
|
||||
#include "runtime/device_queue/device_queue_hw.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
|
@ -51,7 +51,7 @@ BDWTEST_F(BdwSchedulerTest, givenCallToDispatchSchedulerWhenPipeControlWithCSSta
|
|||
LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
|
||||
pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
|
||||
|
||||
dispatchScheduler<FamilyType>(
|
||||
GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
|
||||
*pCmdQ,
|
||||
*pDevQueueHw,
|
||||
pDevice->getPreemptionMode(),
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -21,7 +21,7 @@
|
|||
*/
|
||||
|
||||
#include "runtime/context/context.h"
|
||||
#include "runtime/command_queue/dispatch_walker_helper.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "unit_tests/fixtures/device_host_queue_fixture.h"
|
||||
#include "unit_tests/helpers/hw_parse.h"
|
||||
#include "unit_tests/mocks/mock_device_queue.h"
|
||||
|
|
|
@ -21,12 +21,23 @@
|
|||
*/
|
||||
|
||||
#include "unit_tests/libult/mock_gfx_family.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.inl"
|
||||
#include "runtime/command_stream/preemption.inl"
|
||||
#include "runtime/device_queue/device_queue_hw.h"
|
||||
#include "runtime/device_queue/device_queue_hw.inl"
|
||||
#include "runtime/helpers/hw_helper.inl"
|
||||
#include "runtime/helpers/kernel_commands.inl"
|
||||
#include "runtime/helpers/preamble.inl"
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
// Out-of-class definition of GENX's static function pointer; it starts out null.
bool (*GENX::isSimulationFcn)(unsigned short) = nullptr;
|
||||
|
||||
// Out-of-class definitions of GENX's static command objects, each initialised
// from the sInit() factory of its own command type.
GENX::GPGPU_WALKER GENX::cmdInitGpgpuWalker = GENX::GPGPU_WALKER::sInit();
|
||||
GENX::INTERFACE_DESCRIPTOR_DATA GENX::cmdInitInterfaceDescriptorData = GENX::INTERFACE_DESCRIPTOR_DATA::sInit();
|
||||
GENX::MEDIA_STATE_FLUSH GENX::cmdInitMediaStateFlush = GENX::MEDIA_STATE_FLUSH::sInit();
|
||||
GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD GENX::cmdInitMediaInterfaceDescriptorLoad = GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD::sInit();
|
||||
|
||||
template <>
|
||||
size_t HwHelperHw<GENX>::getMaxBarrierRegisterPerSlice() const {
|
||||
return 32;
|
||||
|
@ -57,4 +68,89 @@ struct hw_helper_static_init {
|
|||
template class HwHelperHw<GENX>;
|
||||
|
||||
hw_helper_static_init si;
|
||||
|
||||
template class GpgpuWalkerHelper<GENX>;
|
||||
|
||||
template <>
|
||||
bool KernelCommandsHelper<GENX>::isPipeControlWArequired() {
|
||||
return false;
|
||||
}
|
||||
|
||||
template struct KernelCommandsHelper<GENX>;
|
||||
|
||||
template <>
|
||||
size_t PreemptionHelper::getRequiredCmdStreamSize<GENX>(PreemptionMode newPreemptionMode, PreemptionMode oldPreemptionMode) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <>
|
||||
void PreemptionHelper::programCmdStream<GENX>(LinearStream &cmdStream, PreemptionMode newPreemptionMode, PreemptionMode oldPreemptionMode,
|
||||
GraphicsAllocation *preemptionCsr, Device &device) {
|
||||
}
|
||||
|
||||
template <>
|
||||
size_t PreemptionHelper::getRequiredPreambleSize<GENX>(const Device &device) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <>
|
||||
void PreemptionHelper::programPreamble<GENX>(LinearStream &preambleCmdStream, Device &device,
|
||||
const GraphicsAllocation *preemptionCsr) {
|
||||
}
|
||||
|
||||
template <>
|
||||
size_t PreemptionHelper::getPreemptionWaCsSize<GENX>(const Device &device) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
template void PreemptionHelper::programInterfaceDescriptorDataPreemption<GENX>(INTERFACE_DESCRIPTOR_DATA<GENX> *idd, PreemptionMode preemptionMode);
|
||||
|
||||
template <>
|
||||
size_t DeviceQueueHw<GENX>::getWaCommandsSize() {
|
||||
return (size_t)0;
|
||||
}
|
||||
|
||||
template <>
|
||||
void DeviceQueueHw<GENX>::addArbCheckCmdWa() {
|
||||
}
|
||||
|
||||
template <>
|
||||
void DeviceQueueHw<GENX>::addMiAtomicCmdWa(uint64_t atomicOpPlaceholder) {
|
||||
}
|
||||
|
||||
template <>
|
||||
void DeviceQueueHw<GENX>::addLriCmdWa(bool setArbCheck) {
|
||||
}
|
||||
|
||||
template <>
|
||||
void DeviceQueueHw<GENX>::addPipeControlCmdWa(bool isNoopCmd) {
|
||||
}
|
||||
|
||||
template <>
|
||||
void DeviceQueueHw<GENX>::addProfilingEndCmds(uint64_t timestampAddress) {
|
||||
}
|
||||
|
||||
template class DeviceQueueHw<GENX>;
|
||||
|
||||
template <>
|
||||
void PreambleHelper<GENX>::addPipeControlBeforeVfeCmd(LinearStream *pCommandStream, const HardwareInfo *hwInfo) {
|
||||
}
|
||||
|
||||
template <>
|
||||
uint32_t PreambleHelper<GENX>::getL3Config(const HardwareInfo &hwInfo, bool useSLM) {
|
||||
uint32_t l3Config = 0;
|
||||
return l3Config;
|
||||
}
|
||||
|
||||
template <>
|
||||
void PreambleHelper<GENX>::programPipelineSelect(LinearStream *pCommandStream, bool mediaSamplerRequired) {
|
||||
}
|
||||
|
||||
template <>
|
||||
struct L3CNTLRegisterOffset<GENX> {
|
||||
static const uint32_t registerOffset = 0x7034;
|
||||
};
|
||||
|
||||
template struct PreambleHelper<GENX>;
|
||||
|
||||
} // namespace OCLRT
|
||||
|
|
|
@ -31,9 +31,71 @@ extern HwHelper *hwHelperFactory[IGFX_MAX_CORE];
|
|||
struct GENX {
|
||||
static bool (*isSimulationFcn)(unsigned short);
|
||||
// INTERFACE_DESCRIPTOR_DATA stand-in nested in GENX. It declares the enums and
// setters of the command but has no data members; every setter is a no-op.
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
    typedef enum tagDENORM_MODE {
        DENORM_MODE_FTZ = 0x0,
        DENORM_MODE_SETBYKERNEL = 0x1,
    } DENORM_MODE;

    typedef enum tagSAMPLERSTATEPOINTER {
        SAMPLERSTATEPOINTER_BIT_SHIFT = 0x5,
        SAMPLERSTATEPOINTER_ALIGN_SIZE = 0x20,
    } SAMPLERSTATEPOINTER;

    typedef enum tagSAMPLER_COUNT {
        SAMPLER_COUNT_NO_SAMPLERS_USED = 0x0,
        SAMPLER_COUNT_BETWEEN_1_AND_4_SAMPLERS_USED = 0x1,
        SAMPLER_COUNT_BETWEEN_5_AND_8_SAMPLERS_USED = 0x2,
        SAMPLER_COUNT_BETWEEN_9_AND_12_SAMPLERS_USED = 0x3,
        SAMPLER_COUNT_BETWEEN_13_AND_16_SAMPLERS_USED = 0x4,
    } SAMPLER_COUNT;

    typedef enum tagSHARED_LOCAL_MEMORY_SIZE {
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K = 0x0,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K = 0x1,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K = 0x2,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K = 0x3,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K = 0x4,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K = 0x5,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K = 0x6,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K = 0x7,
    } SHARED_LOCAL_MEMORY_SIZE;

    typedef enum tagBINDINGTABLEPOINTER {
        BINDINGTABLEPOINTER_BIT_SHIFT = 0x5,
        BINDINGTABLEPOINTER_ALIGN_SIZE = 0x20,
    } BINDINGTABLEPOINTER;

    // Factory returning a fresh (empty) descriptor object.
    static tagINTERFACE_DESCRIPTOR_DATA sInit() { return tagINTERFACE_DESCRIPTOR_DATA{}; }

    // No-op setters: the arguments are accepted and discarded.
    void setKernelStartPointerHigh(uint32_t value) {}
    void setKernelStartPointer(uint64_t value) {}
    void setNumberOfThreadsInGpgpuThreadGroup(uint32_t value) {}
    void setCrossThreadConstantDataReadLength(uint32_t value) {}
    void setDenormMode(DENORM_MODE value) {}
    void setConstantIndirectUrbEntryReadLength(uint32_t value) {}
    void setBindingTablePointer(uint64_t value) {}
    void setSamplerStatePointer(uint64_t value) {}
    void setSamplerCount(SAMPLER_COUNT value) {}
    void setSharedLocalMemorySize(SHARED_LOCAL_MEMORY_SIZE value) {}
    void setBarrierEnable(bool value) {}
} INTERFACE_DESCRIPTOR_DATA;
|
||||
|
||||
typedef struct tagBINDING_TABLE_STATE {
|
||||
inline void init(void) {
|
||||
}
|
||||
inline uint32_t getSurfaceStatePointer(void) const {
|
||||
return 0u;
|
||||
}
|
||||
inline void setSurfaceStatePointer(const uint64_t value) {
|
||||
}
|
||||
inline uint32_t getRawData(const uint32_t index) {
|
||||
return 0;
|
||||
}
|
||||
|
@ -42,6 +104,247 @@ struct GENX {
|
|||
SURFACESTATEPOINTER_ALIGN_SIZE = 0x40,
|
||||
} SURFACESTATEPOINTER;
|
||||
} BINDING_TABLE_STATE;
|
||||
|
||||
// GPGPU_WALKER stand-in nested in GENX. It declares the enums and setters of
// the command but has no data members; every setter is a no-op.
typedef struct tagGPGPU_WALKER {
    typedef enum tagSIMD_SIZE {
        SIMD_SIZE_SIMD8 = 0x0,
        SIMD_SIZE_SIMD16 = 0x1,
        SIMD_SIZE_SIMD32 = 0x2,
    } SIMD_SIZE;

    typedef enum tagINDIRECTDATASTARTADDRESS {
        INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
        INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
    } INDIRECTDATASTARTADDRESS;

    // Factory returning a fresh (empty) walker object.
    static tagGPGPU_WALKER sInit() { return tagGPGPU_WALKER{}; }

    // No-op setters: the arguments are accepted and discarded.
    void setThreadWidthCounterMaximum(uint32_t value) {}
    void setThreadGroupIdXDimension(uint32_t value) {}
    void setThreadGroupIdYDimension(uint32_t value) {}
    void setThreadGroupIdZDimension(uint32_t value) {}
    void setRightExecutionMask(uint32_t value) {}
    void setBottomExecutionMask(uint32_t value) {}
    void setSimdSize(SIMD_SIZE value) {}
    void setThreadGroupIdStartingX(uint32_t value) {}
    void setThreadGroupIdStartingY(uint32_t value) {}
    void setThreadGroupIdStartingResumeZ(uint32_t value) {}
    void setIndirectDataStartAddress(uint32_t value) {}
    void setInterfaceDescriptorOffset(uint32_t value) {}
    void setIndirectDataLength(uint32_t value) {}
} GPGPU_WALKER;
|
||||
|
||||
// PIPE_CONTROL stand-in nested in GENX. It declares the post-sync enum and the
// setters of the command but has no data members; every setter is a no-op.
typedef struct tagPIPE_CONTROL {
    typedef enum tagPOST_SYNC_OPERATION {
        POST_SYNC_OPERATION_NO_WRITE = 0x0,
        POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA = 0x1,
        POST_SYNC_OPERATION_WRITE_PS_DEPTH_COUNT = 0x2,
        POST_SYNC_OPERATION_WRITE_TIMESTAMP = 0x3,
    } POST_SYNC_OPERATION;

    // Factory returning a fresh (empty) command object.
    static tagPIPE_CONTROL sInit() { return tagPIPE_CONTROL{}; }

    // No-op setters: the arguments are accepted and discarded.
    void setCommandStreamerStallEnable(uint32_t value) {}
    void setDcFlushEnable(bool value) {}
    void setStateCacheInvalidationEnable(bool value) {}
    void setPipeControlFlushEnable(bool value) {}
    void setTextureCacheInvalidationEnable(bool value) {}
    void setPostSyncOperation(POST_SYNC_OPERATION value) {}
    void setAddress(uint32_t value) {}
    void setAddressHigh(uint32_t value) {}
    void setImmediateData(uint64_t value) {}
    void setGenericMediaStateClear(bool value) {}
} PIPE_CONTROL;
|
||||
|
||||
// MI_LOAD_REGISTER_IMM stand-in nested in GENX: no data members, no-op setters.
typedef struct tagMI_LOAD_REGISTER_IMM {
    // Factory returning a fresh (empty) command object.
    static tagMI_LOAD_REGISTER_IMM sInit() { return tagMI_LOAD_REGISTER_IMM{}; }

    void setRegisterOffset(uint32_t value) {}
    void setDataDword(uint32_t value) {}
} MI_LOAD_REGISTER_IMM;
|
||||
|
||||
// MI_LOAD_REGISTER_REG stand-in nested in GENX: no data members, no-op setters.
typedef struct tagMI_LOAD_REGISTER_REG {
    // Factory returning a fresh (empty) command object.
    static tagMI_LOAD_REGISTER_REG sInit() { return tagMI_LOAD_REGISTER_REG{}; }

    void setSourceRegisterAddress(uint32_t value) {}
    void setDestinationRegisterAddress(uint32_t value) {}
} MI_LOAD_REGISTER_REG;
|
||||
|
||||
typedef struct tagMI_MATH {
|
||||
union _DW0 {
|
||||
struct _BitField {
|
||||
uint32_t DwordLength : BITFIELD_RANGE(0, 5);
|
||||
uint32_t Reserved : BITFIELD_RANGE(6, 22);
|
||||
uint32_t InstructionOpcode : BITFIELD_RANGE(23, 28);
|
||||
uint32_t InstructionType : BITFIELD_RANGE(29, 31);
|
||||
} BitField;
|
||||
uint32_t Value;
|
||||
} DW0;
|
||||
typedef enum tagMI_COMMAND_OPCODE {
|
||||
MI_COMMAND_OPCODE_MI_MATH = 0x0,
|
||||
} MI_COMMAND_OPCODE;
|
||||
typedef enum tagCOMMAND_TYPE {
|
||||
COMMAND_TYPE_MI_COMMAND = 0x0,
|
||||
} COMMAND_TYPE;
|
||||
} MI_MATH;
|
||||
|
||||
typedef struct tagMI_MATH_ALU_INST_INLINE {
|
||||
union _DW0 {
|
||||
struct _BitField {
|
||||
uint32_t Operand2 : BITFIELD_RANGE(0, 9);
|
||||
uint32_t Operand1 : BITFIELD_RANGE(10, 19);
|
||||
uint32_t ALUOpcode : BITFIELD_RANGE(20, 31);
|
||||
} BitField;
|
||||
uint32_t Value;
|
||||
} DW0;
|
||||
} MI_MATH_ALU_INST_INLINE;
|
||||
|
||||
// Empty placeholder type; it declares no members.
typedef struct tagMI_COMMAND_OPCODE_MI_MATH {} MI_COMMAND_OPCODE_MI_MATH;
|
||||
|
||||
// MI_STORE_REGISTER_MEM stand-in nested in GENX: no data members, no-op setters.
typedef struct tagMI_STORE_REGISTER_MEM {
    // Factory returning a fresh (empty) command object.
    static tagMI_STORE_REGISTER_MEM sInit() { return tagMI_STORE_REGISTER_MEM{}; }

    void setRegisterAddress(uint32_t value) {}
    void setMemoryAddress(uint64_t value) {}
} MI_STORE_REGISTER_MEM;
|
||||
|
||||
// MI_REPORT_PERF_COUNT stand-in nested in GENX: no data members, no-op setters.
typedef struct tagMI_REPORT_PERF_COUNT {
    // Factory returning a fresh (empty) command object.
    static tagMI_REPORT_PERF_COUNT sInit() { return tagMI_REPORT_PERF_COUNT{}; }

    void setReportId(uint32_t value) {}
    void setMemoryAddress(uint64_t value) {}
} MI_REPORT_PERF_COUNT;
|
||||
|
||||
// MI_BATCH_BUFFER_START stand-in nested in GENX: declares the batch-level enum
// and the setters, has no data members, and every setter is a no-op.
typedef struct tagMI_BATCH_BUFFER_START {
    typedef enum tagSECOND_LEVEL_BATCH_BUFFER {
        SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH = 0x0,
        SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH = 0x1,
    } SECOND_LEVEL_BATCH_BUFFER;

    // Factory returning a fresh (empty) command object.
    static tagMI_BATCH_BUFFER_START sInit() { return tagMI_BATCH_BUFFER_START{}; }

    void setSecondLevelBatchBuffer(SECOND_LEVEL_BATCH_BUFFER value) {}
    void setBatchBufferStartAddressGraphicsaddress472(uint64_t value) {}
} MI_BATCH_BUFFER_START;
|
||||
|
||||
// MEDIA_STATE_FLUSH stand-in nested in GENX: no data members, no-op setter.
typedef struct tagMEDIA_STATE_FLUSH {
    // Factory returning a fresh (empty) command object.
    static tagMEDIA_STATE_FLUSH sInit() { return tagMEDIA_STATE_FLUSH{}; }

    void setInterfaceDescriptorOffset(uint32_t value) {}
} MEDIA_STATE_FLUSH;
|
||||
|
||||
// MEDIA_INTERFACE_DESCRIPTOR_LOAD stand-in nested in GENX: no data members,
// no-op setters.
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
    // Factory returning a fresh (empty) command object.
    static tagMEDIA_INTERFACE_DESCRIPTOR_LOAD sInit() { return tagMEDIA_INTERFACE_DESCRIPTOR_LOAD{}; }

    void setInterfaceDescriptorDataStartAddress(uint32_t value) {}
    void setInterfaceDescriptorTotalLength(uint32_t value) {}
} MEDIA_INTERFACE_DESCRIPTOR_LOAD;
|
||||
|
||||
// MI_BATCH_BUFFER_END stand-in nested in GENX: only the sInit() factory, no data.
typedef struct tagMI_BATCH_BUFFER_END {
    // Factory returning a fresh (empty) command object.
    static tagMI_BATCH_BUFFER_END sInit() { return tagMI_BATCH_BUFFER_END{}; }
} MI_BATCH_BUFFER_END;
|
||||
|
||||
// Empty RENDER_SURFACE_STATE placeholder; it declares no members.
typedef struct tagRENDER_SURFACE_STATE {} RENDER_SURFACE_STATE;
|
||||
|
||||
// MEDIA_VFE_STATE stand-in nested in GENX: no data members, no-op setters.
typedef struct tagMEDIA_VFE_STATE {
    // Factory returning a fresh (empty) command object.
    static tagMEDIA_VFE_STATE sInit() { return tagMEDIA_VFE_STATE{}; }

    // No-op setters: the arguments are accepted and discarded.
    void setMaximumNumberOfThreads(uint32_t value) {}
    void setNumberOfUrbEntries(uint32_t value) {}
    void setUrbEntryAllocationSize(uint32_t value) {}
    void setPerThreadScratchSpace(uint32_t value) {}
    void setStackSize(uint32_t value) {}
    void setScratchSpaceBasePointer(uint32_t value) {}
    void setScratchSpaceBasePointerHigh(uint32_t value) {}
} MEDIA_VFE_STATE;
|
||||
|
||||
// SAMPLER_STATE stand-in: no data members; the single setter is a no-op.
typedef struct tagSAMPLER_STATE {
    void setIndirectStatePointer(uint32_t indirectStatePointerValue) {}
} SAMPLER_STATE;
|
||||
|
||||
// GPGPU_CSR_BASE_ADDRESS stand-in: no data members; init() and the setter are no-ops.
typedef struct tagGPGPU_CSR_BASE_ADDRESS {
    void init() {}
    void setGpgpuCsrBaseAddress(uint64_t value) {}
} GPGPU_CSR_BASE_ADDRESS;
|
||||
|
||||
// STATE_SIP stand-in: no data members; init() and the setter are no-ops.
typedef struct tagSTATE_SIP {
    void init() {}
    void setSystemInstructionPointer(uint64_t value) {}
} STATE_SIP;
|
||||
|
||||
// Static command objects, one per command type. These are declarations only;
// the values they are initialised with are not visible in this block.
static GPGPU_WALKER cmdInitGpgpuWalker;
|
||||
static INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;
|
||||
static MEDIA_STATE_FLUSH cmdInitMediaStateFlush;
|
||||
static MEDIA_INTERFACE_DESCRIPTOR_LOAD cmdInitMediaInterfaceDescriptorLoad;
|
||||
};
|
||||
|
||||
} // namespace OCLRT
|
||||
|
|
Loading…
Reference in New Issue