Create GpgpuWalkerHelper class

Change-Id: Ia9aa7b816356aff57234b46ea3509b6bd9b7f14b
This commit is contained in:
Zdanowicz, Zbigniew 2018-03-30 17:57:51 +02:00 committed by sys_ocldev
parent d51f2cd1ec
commit b6b92ae808
34 changed files with 1209 additions and 646 deletions

View File

@ -25,9 +25,6 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.h
${CMAKE_CURRENT_SOURCE_DIR}/command_queue_hw.inl
${CMAKE_CURRENT_SOURCE_DIR}/cpu_data_transfer_handler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker.h
${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_helper.h
${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_helper.inl
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_barrier.h
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_common.h
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_copy_buffer.h
@ -49,6 +46,8 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_image.h
${CMAKE_CURRENT_SOURCE_DIR}/finish.h
${CMAKE_CURRENT_SOURCE_DIR}/flush.h
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.h
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -20,7 +20,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_queue/enqueue_barrier.h"
#include "runtime/command_queue/enqueue_copy_buffer.h"
#include "runtime/command_queue/enqueue_copy_buffer_rect.h"

View File

@ -1,56 +0,0 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
// Constants and forward declarations used when programming register
// read-modify-write sequences and walker-related workarounds.
namespace OCLRT {
// Number of MI_MATH_ALU_INST_INLINE entries that follow the MI_MATH header in
// addAluReadModifyWriteRegister (load A, load B, operation, store).
constexpr int NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4;
// NOTE(review): presumably the bit mask and register offset used by the
// "disable LSQC RO perf" workaround declared below - confirm against its definition.
constexpr int L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000;
constexpr int L3SQC_REG4 = 0xB118;
// NOTE(review): 0xFFFFFFFF does not fit in a signed int; the stored value is -1 on
// two's-complement targets. The users of these cookie values are not visible here.
constexpr int GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = 0xFFFFFFFF;
constexpr int GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000;
// Register addresses of the two scratch registers used by
// addAluReadModifyWriteRegister: R0 receives the register value, R1 the mask.
constexpr int CS_GPR_R0 = 0x2600;
constexpr int CS_GPR_R1 = 0x2608;
// Values written to the ALUOpcode field of MI_MATH_ALU_INST_INLINE.
constexpr int ALU_OPCODE_LOAD = 0x080;
constexpr int ALU_OPCODE_STORE = 0x180;
constexpr int ALU_OPCODE_OR = 0x103;
constexpr int ALU_OPCODE_AND = 0x102;
// Values written to the Operand1/Operand2 fields of MI_MATH_ALU_INST_INLINE.
constexpr int ALU_REGISTER_R_0 = 0x0;
constexpr int ALU_REGISTER_R_1 = 0x1;
constexpr int ALU_REGISTER_R_SRCA = 0x20;
constexpr int ALU_REGISTER_R_SRCB = 0x21;
constexpr int ALU_REGISTER_R_ACCU = 0x31;
// NOTE(review): looks like a register offset used for timestamp reads - usage is
// not visible in this header; confirm.
constexpr unsigned int GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8;
// Declaration only; the per-family definition is not part of this header.
template <typename GfxFamily>
void applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode);
// Declaration only. Callers add the returned value to their command-stream
// size estimate.
template <typename GfxFamily>
size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
} // namespace OCLRT

View File

@ -1,99 +0,0 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_queue/dispatch_walker_helper.h"
namespace OCLRT {
// Emits a command sequence into pCommandStream that updates a register in place:
//   Register = Register <operation> mask
// pCommandStream - stream the commands are appended to (space is taken with getSpace)
// aluRegister    - address of the register to read, modify and write back
// operation      - value placed in the ALUOpcode field of the third ALU instruction
// mask           - immediate value combined with the register contents
template <typename GfxFamily>
void addAluReadModifyWriteRegister(
    OCLRT::LinearStream *pCommandStream,
    uint32_t aluRegister,
    uint32_t operation,
    uint32_t mask) {
    using MI_LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
    using MI_MATH = typename GfxFamily::MI_MATH;
    using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    // 1) Copy the target register into CS_GPR_R0.
    auto regToGpr = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
    *regToGpr = MI_LOAD_REGISTER_REG::sInit();
    regToGpr->setSourceRegisterAddress(aluRegister);
    regToGpr->setDestinationRegisterAddress(CS_GPR_R0);

    // 2) Put the mask into CS_GPR_R1.
    auto maskToGpr = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM)));
    *maskToGpr = MI_LOAD_REGISTER_IMM::sInit();
    maskToGpr->setRegisterOffset(CS_GPR_R1);
    maskToGpr->setDataDword(mask);

    // 3) MI_MATH header followed by NUM_ALU_INST_FOR_READ_MODIFY_WRITE inline ALU instructions.
    auto mathDwords = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
    auto mathHeader = reinterpret_cast<MI_MATH *>(mathDwords);
    mathHeader->DW0.Value = 0x0;
    mathHeader->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
    mathHeader->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
    // DwordLength is encoded as total dwords minus two: 1 header dword + 4 ALU dwords -> 0x3
    mathHeader->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;

    // The ALU instructions start one dword past the beginning of the command.
    auto aluInst = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(mathDwords + 1);

    // ALU[0]: SRCA <- R0
    aluInst[0].DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
    aluInst[0].DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
    aluInst[0].DW0.BitField.Operand2 = ALU_REGISTER_R_0;

    // ALU[1]: SRCB <- R1
    aluInst[1].DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
    aluInst[1].DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
    aluInst[1].DW0.BitField.Operand2 = ALU_REGISTER_R_1;

    // ALU[2]: requested operation on SRCA and SRCB (operand fields are zero)
    aluInst[2].DW0.BitField.ALUOpcode = operation;
    aluInst[2].DW0.BitField.Operand1 = 0;
    aluInst[2].DW0.BitField.Operand2 = 0;

    // ALU[3]: R0 <- ACCU
    aluInst[3].DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
    aluInst[3].DW0.BitField.Operand1 = ALU_REGISTER_R_0;
    aluInst[3].DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;

    // 4) Write the result held in CS_GPR_R0 back to the target register.
    auto gprToReg = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
    *gprToReg = MI_LOAD_REGISTER_REG::sInit();
    gprToReg->setSourceRegisterAddress(CS_GPR_R0);
    gprToReg->setDestinationRegisterAddress(aluRegister);

    // 5) Trailing PIPE_CONTROL with stall, flush and cache-invalidation bits enabled.
    auto flushCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
    *flushCmd = PIPE_CONTROL::sInit();
    flushCmd->setCommandStreamerStallEnable(true);
    flushCmd->setDcFlushEnable(true);
    flushCmd->setTextureCacheInvalidationEnable(true);
    flushCmd->setPipeControlFlushEnable(true);
    flushCmd->setStateCacheInvalidationEnable(true);
}
} // namespace OCLRT

View File

@ -24,7 +24,7 @@
#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/event/event_builder.h"
#include "runtime/gtpin/gtpin_notify.h"
@ -243,7 +243,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
}
}
dispatchWalker<GfxFamily>(
GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
*this,
multiDispatchInfo,
numEventsInWaitList,
@ -293,7 +293,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation(),
devQueueHw->getDebugQueue());
dispatchScheduler<GfxFamily>(
GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
*this,
*devQueueHw,
preemption,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -24,7 +24,7 @@
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "runtime/mem_obj/buffer.h"
@ -69,7 +69,7 @@ struct EnqueueOperation<GfxFamily, CL_COMMAND_NDRANGE_KERNEL> {
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
return size;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -24,7 +24,7 @@
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/device/device.h"
#include "runtime/event/event.h"
#include "runtime/memory_manager/surface.h"

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -24,7 +24,7 @@
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/device/device.h"
#include "runtime/event/event.h"
#include "runtime/memory_manager/surface.h"

View File

@ -0,0 +1,371 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/built_ins/built_ins.h"
#include "runtime/context/context.h"
#include "runtime/command_queue/command_queue.h"
#include "runtime/command_stream/linear_stream.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/event/hw_timestamps.h"
#include "runtime/event/perf_counter.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/kernel/kernel.h"
#include "runtime/program/kernel_info.h"
#include "runtime/utilities/vec.h"
namespace OCLRT {
// Number of MI_MATH_ALU_INST_INLINE entries that follow the MI_MATH header in
// addAluReadModifyWriteRegister (load A, load B, operation, store).
constexpr int32_t NUM_ALU_INST_FOR_READ_MODIFY_WRITE = 4;
// NOTE(review): presumably the bit mask and register offset used by the
// "disable LSQC RO perf" workaround - confirm against applyWADisableLSQCROPERFforOCL.
constexpr int32_t L3SQC_BIT_LQSC_RO_PERF_DIS = 0x08000000;
constexpr int32_t L3SQC_REG4 = 0xB118;
// NOTE(review): 0xFFFFFFFF does not fit in int32_t; the stored value is -1 on
// two's-complement targets. The users of these cookie values are not visible here.
constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER = 0xFFFFFFFF;
constexpr int32_t GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER = 0x00000000;
// Register addresses of the two scratch registers used by
// addAluReadModifyWriteRegister: R0 receives the register value, R1 the mask.
constexpr int32_t CS_GPR_R0 = 0x2600;
constexpr int32_t CS_GPR_R1 = 0x2608;
// Values written to the ALUOpcode field of MI_MATH_ALU_INST_INLINE.
constexpr int32_t ALU_OPCODE_LOAD = 0x080;
constexpr int32_t ALU_OPCODE_STORE = 0x180;
constexpr int32_t ALU_OPCODE_OR = 0x103;
constexpr int32_t ALU_OPCODE_AND = 0x102;
// Values written to the Operand1/Operand2 fields of MI_MATH_ALU_INST_INLINE.
constexpr int32_t ALU_REGISTER_R_0 = 0x0;
constexpr int32_t ALU_REGISTER_R_1 = 0x1;
constexpr int32_t ALU_REGISTER_R_SRCA = 0x20;
constexpr int32_t ALU_REGISTER_R_SRCB = 0x21;
constexpr int32_t ALU_REGISTER_R_ACCU = 0x31;
// NOTE(review): looks like a register offset used for timestamp reads - usage is
// not visible in this header; confirm.
constexpr uint32_t GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW = 0x23A8;
// Work-group sizing helpers. Only the declarations live in this header; the
// definitions are not visible here.
//
// The computeWorkgroupSize* overloads below take a writable workGroupSize[3]
// and a read-only workItems[3].
// NOTE(review): presumably workGroupSize is the output (chosen local size) and
// workItems the global size per dimension - confirm against the definitions.
void computeWorkgroupSize1D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
void computeWorkgroupSizeND(
WorkSizeInfo wsInfo,
size_t workGroupSize[3],
const size_t workItems[3],
const uint32_t workDim);
void computeWorkgroupSize2D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
void computeWorkgroupSizeSquared(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize,
const uint32_t workDim);
// Variants that derive a three-component size from a DispatchInfo.
Vec3<size_t> computeWorkgroupSize(
const DispatchInfo &dispatchInfo);
Vec3<size_t> generateWorkgroupSize(
const DispatchInfo &dispatchInfo);
// Variants that derive a three-component work-group count from global (gws)
// and local (lws) sizes, or from a DispatchInfo.
Vec3<size_t> computeWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
Vec3<size_t> generateWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
Vec3<size_t> generateWorkgroupsNumber(
const DispatchInfo &dispatchInfo);
// Returns the number of dimensions to dispatch with: the larger of the
// simplified dimensions of the dispatch size and the dispatch offset,
// clamped so that the result is never below 1.
inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
    const auto sizeDim = dispatchSize.getSimplifiedDim();
    const auto offsetDim = dispatchOffset.getSimplifiedDim();
    const auto widestDim = std::max(sizeDim, offsetDim);
    return std::max(1U, widestDim);
}
// Declaration only; takes a work-group size by value and returns a
// (possibly adjusted) copy. The definition is not visible in this header.
Vec3<size_t> canonizeWorkgroup(
Vec3<size_t> workgroup);
// Declaration only; the definition is not visible in this header.
// NOTE(review): presumably reports work-group size hints through the context -
// confirm against the definition.
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
// Returns the highest dimension (1..3) whose work-item count is greater
// than one; returns 1 when neither the second nor the third dimension is.
inline cl_uint computeDimensions(const size_t workItems[3]) {
    if (workItems[2] > 1) {
        return 3;
    }
    if (workItems[1] > 1) {
        return 2;
    }
    return 1;
}
// Creates a heap-allocated IndirectHeap whose size is computed by
// calc(args...) and whose backing memory comes from alignedMalloc with
// MemoryConstants::pageSize alignment.
// The caller receives a raw pointer created with `new`.
template <typename SizeAndAllocCalcT, typename... CalcArgsT>
IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
    const size_t pageAlignment = MemoryConstants::pageSize;
    const size_t heapSize = calc(std::forward<CalcArgsT>(args)...);
    auto heapMemory = alignedMalloc(heapSize, pageAlignment);
    return new IndirectHeap(heapMemory, heapSize);
}
// Static helper that groups the walker-related command programming routines
// for one hardware family (GfxFamily). Every member is static; the class only
// declares them here and the definitions are provided elsewhere.
template <typename GfxFamily>
class GpgpuWalkerHelper {
public:
// Performs a read-modify-write on a register:
// Register = Register <operation> mask.
static void addAluReadModifyWriteRegister(
LinearStream *pCommandStream,
uint32_t aluRegister,
uint32_t operation,
uint32_t mask);
// Workaround programming for the given kernel; definition not visible here.
static void applyWADisableLSQCROPERFforOCL(LinearStream *pCommandStream,
const Kernel &kernel,
bool disablePerfMode);
// Returns a size that callers add to their command-stream size estimate
// (see EnqueueOperation).
static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
// Fills thread-related fields of a GPGPU_WALKER command.
// NOTE(review): presumably returns the total local work size - confirm
// against the definition.
static size_t setGpgpuWalkerThreadData(
typename GfxFamily::GPGPU_WALKER *pCmd,
const size_t globalOffsets[3],
const size_t startWorkGroups[3],
const size_t numWorkGroups[3],
const size_t localWorkSizesIn[3],
uint32_t simd);
// Profiling command emission around a dispatch (start / end).
static void dispatchProfilingCommandsStart(
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream);
static void dispatchProfilingCommandsEnd(
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream);
// Performance-counter command emission. The `start` flag selects between the
// begin-of-dispatch and end-of-dispatch variant of each command group.
static void dispatchPerfCountersNoopidRegisterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start);
static void dispatchPerfCountersReadFreqRegisterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start);
static void dispatchPerfCountersGeneralPurposeCounterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start);
static void dispatchPerfCountersUserCounterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start);
static void dispatchPerfCountersOABufferStateCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream);
static void dispatchPerfCountersCommandsStart(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream);
static void dispatchPerfCountersCommandsEnd(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream);
// Dispatches every entry of a MultiDispatchInfo. commandType defaults to 0
// when the caller does not pass one.
static void dispatchWalker(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
KernelOperation **blockedCommandsData,
HwTimeStamps *hwTimeStamps,
OCLRT::HwPerfCounter *hwPerfCounter,
PreemptionMode preemptionMode,
bool blockQueue,
unsigned int commandType = 0);
// Single-kernel overload taking explicit work dimensions, offsets and sizes.
static void dispatchWalker(
CommandQueue &commandQueue,
const Kernel &kernel,
cl_uint workDim,
const size_t globalOffsets[3],
const size_t workItems[3],
const size_t *localWorkSizesIn,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
KernelOperation **blockedCommandsData,
HwTimeStamps *hwTimeStamps,
HwPerfCounter *hwPerfCounter,
PreemptionMode preemptionMode,
bool blockQueue);
// Dispatches the scheduler kernel for a device queue.
static void dispatchScheduler(
CommandQueue &commandQueue,
DeviceQueueHw<GfxFamily> &devQueueHw,
PreemptionMode preemptionMode,
SchedulerKernel &scheduler);
};
// Primary EnqueueOperation template: estimates how many bytes of command
// stream an enqueue of the given event type needs. NDRange kernel, marker and
// migrate-mem-objects event types must use their dedicated specializations.
template <typename GfxFamily, uint32_t eventType>
struct EnqueueOperation {
    static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
    static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
    static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");

    // Size for a whole MultiDispatchInfo: shared commands once, plus a walker,
    // the LSQC workaround and the preemption workaround for each dispatch.
    static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
        using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
        using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
        using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;

        size_t totalSize = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
                           sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
        if (reserveProfilingCmdsSpace) {
            totalSize += 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM);
        }
        if (reservePerfCounters) {
            totalSize += getPerfCountersSizeCS(commandQueue);
        }

        Device &device = commandQueue.getDevice();
        for (auto &dispatchInfo : multiDispatchInfo) {
            auto &kernel = *dispatchInfo.getKernel();
            totalSize += sizeof(GPGPU_WALKER);
            totalSize += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(&kernel);
            totalSize += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(device);
        }
        return totalSize;
    }

    // Size for a single kernel dispatch.
    static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
        using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
        using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
        using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;

        size_t totalSize = sizeof(GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
                           sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
        totalSize += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
        if (reserveProfilingCmdsSpace) {
            totalSize += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(MI_STORE_REGISTER_MEM);
        }
        if (reservePerfCounters) {
            totalSize += getPerfCountersSizeCS(commandQueue);
        }
        totalSize += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
        return totalSize;
    }

  private:
    // Bytes reserved for the performance-counter commands emitted before and
    // after a dispatch. Shared by both size estimators above.
    static size_t getPerfCountersSizeCS(CommandQueue &commandQueue) {
        using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
        using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
        using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;

        size_t perfSize = 0;

        // --- commands emitted at the start ---
        // P_C: flush CS & TimeStamp BEGIN
        perfSize += 2 * sizeof(PIPE_CONTROL);
        // SRM NOOPID & Frequency
        perfSize += 2 * sizeof(MI_STORE_REGISTER_MEM);
        // gp registers
        perfSize += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM);
        // report perf count
        perfSize += sizeof(MI_REPORT_PERF_COUNT);
        // user registers
        perfSize += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);

        // --- commands emitted at the end ---
        // P_C: flush CS & TimeStamp END
        perfSize += 2 * sizeof(PIPE_CONTROL);
        // OA buffer (status head, tail)
        perfSize += 3 * sizeof(MI_STORE_REGISTER_MEM);
        // report perf count
        perfSize += sizeof(MI_REPORT_PERF_COUNT);
        // gp registers
        perfSize += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM);
        // SRM NOOPID & Frequency
        perfSize += 2 * sizeof(MI_STORE_REGISTER_MEM);
        // user registers
        perfSize += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);

        return perfSize;
    }
};
// Returns the command queue's command stream after asking the queue for
// enough room for a single-kernel enqueue of the given event type.
template <typename GfxFamily, uint32_t eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
    using Operation = EnqueueOperation<GfxFamily, eventType>;

    auto requiredSize = Operation::getSizeRequiredCS(
        reserveProfilingCmdsSpace,
        reservePerfCounterCmdsSpace,
        commandQueue,
        pKernel);
    return commandQueue.getCS(requiredSize);
}
// Returns the command queue's command stream after asking the queue for
// enough room for every dispatch in multiDispatchInfo. When the first
// dispatch's kernel is a parent kernel, space for the scheduler kernel is
// reserved as well.
template <typename GfxFamily, uint32_t eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
    using Operation = EnqueueOperation<GfxFamily, eventType>;

    // The first kernel (if any) decides whether scheduler space is needed.
    Kernel *firstKernel = nullptr;
    if (multiDispatchInfo.size() > 0) {
        firstKernel = multiDispatchInfo.begin()->getKernel();
    }

    size_t requiredSize = 0;
    for (auto &dispatchInfo : multiDispatchInfo) {
        requiredSize += Operation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
    }

    const bool needsScheduler = (firstKernel != nullptr) && firstKernel->isParentKernel;
    if (needsScheduler) {
        SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(firstKernel->getContext());
        requiredSize += Operation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler);
    }

    return commandQueue.getCS(requiredSize);
}
// Returns the indirect heap of type heapType to use for the given dispatches.
//
// The expected size is taken from KernelCommandsHelper for the requested heap
// type. When the first dispatch's kernel is a parent kernel:
//   - for SURFACE_STATE the size needed by the execution model is added;
//   - for the other heap types the default device queue's DYNAMIC_STATE heap
//     is returned instead of a command-queue heap.
// In every other case the heap comes from commandQueue.getIndirectHeap().
//
// An empty multiDispatchInfo (or a first dispatch without a kernel) is treated
// as "no parent kernel", matching the guard used by getCommandStream. If the
// default device queue cannot be resolved, the function breaks in debug builds
// and falls back to the command-queue heap rather than dereferencing null.
template <typename GfxFamily, IndirectHeap::Type heapType>
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
    size_t expectedSize = 0;
    IndirectHeap *ih = nullptr;

    // clang-format off
    switch (heapType) {
    case IndirectHeap::DYNAMIC_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
    case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
    case IndirectHeap::SURFACE_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
    }
    // clang-format on

    // Never dereference begin() of an empty dispatch list.
    Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;

    if (parentKernel && parentKernel->isParentKernel) {
        if (heapType == IndirectHeap::SURFACE_STATE) {
            expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*parentKernel));
        } else {
            // heapType is DYNAMIC_STATE or INDIRECT_OBJECT: use the device queue's DSH.
            DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
            DEBUG_BREAK_IF(pDevQueue == nullptr);
            if (pDevQueue != nullptr) {
                ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
            }
        }
    }

    if (ih == nullptr) {
        ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
    }
    return *ih;
}
} // namespace OCLRT

View File

@ -21,24 +21,17 @@
*/
#pragma once
#include "runtime/context/context.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_queue/local_id_gen.h"
#include "runtime/command_queue/command_queue.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/device/device_info.h"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/event/perf_counter.h"
#include "runtime/event/user_event.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/debug_helpers.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "runtime/helpers/validators.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/kernel/kernel.h"
#include "runtime/mem_obj/mem_obj.h"
#include "runtime/memory_manager/graphics_allocation.h"
#include <algorithm>
@ -46,57 +39,81 @@
namespace OCLRT {
void computeWorkgroupSize1D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask
template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
OCLRT::LinearStream *pCommandStream,
uint32_t aluRegister,
uint32_t operation,
uint32_t mask) {
// Load "Register" value into CS_GPR_R0
typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
typedef typename GfxFamily::MI_MATH MI_MATH;
typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
auto pCmd = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
*pCmd = MI_LOAD_REGISTER_REG::sInit();
pCmd->setSourceRegisterAddress(aluRegister);
pCmd->setDestinationRegisterAddress(CS_GPR_R0);
void computeWorkgroupSizeND(
WorkSizeInfo wsInfo,
size_t workGroupSize[3],
const size_t workItems[3],
const uint32_t workDim);
// Load "Mask" into CS_GPR_R1
typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
auto pCmd2 = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM)));
*pCmd2 = MI_LOAD_REGISTER_IMM::sInit();
pCmd2->setRegisterOffset(CS_GPR_R1);
pCmd2->setDataDword(mask);
void computeWorkgroupSize2D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
// Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands
auto pCmd3 = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.Value = 0x0;
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
// 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
pCmd3++;
MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(pCmd3);
void computeWorkgroupSizeSquared(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize,
const uint32_t workDim);
// Setup first operand of MI_MATH - load CS_GPR_R0 into register A
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0;
pAluParam++;
Vec3<size_t> computeWorkgroupSize(
const DispatchInfo &dispatchInfo);
// Setup second operand of MI_MATH - load CS_GPR_R1 into register B
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1;
pAluParam++;
Vec3<size_t> generateWorkgroupSize(
const DispatchInfo &dispatchInfo);
// Setup third operand of MI_MATH - "Operation" on registers A and B
pAluParam->DW0.BitField.ALUOpcode = operation;
pAluParam->DW0.BitField.Operand1 = 0;
pAluParam->DW0.BitField.Operand2 = 0;
pAluParam++;
Vec3<size_t> computeWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
// Setup fourth operand of MI_MATH - store result into CS_GPR_R0
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0;
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;
Vec3<size_t> generateWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
// LOAD value of CS_GPR_R0 into "Register"
auto pCmd4 = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
*pCmd4 = MI_LOAD_REGISTER_REG::sInit();
pCmd4->setSourceRegisterAddress(CS_GPR_R0);
pCmd4->setDestinationRegisterAddress(aluRegister);
Vec3<size_t> generateWorkgroupsNumber(
const DispatchInfo &dispatchInfo);
Vec3<size_t> canonizeWorkgroup(
Vec3<size_t> workgroup);
inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()));
// Add PIPE_CONTROL to flush caches
typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
auto pCmd5 = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
*pCmd5 = PIPE_CONTROL::sInit();
pCmd5->setCommandStreamerStallEnable(true);
pCmd5->setDcFlushEnable(true);
pCmd5->setTextureCacheInvalidationEnable(true);
pCmd5->setPipeControlFlushEnable(true);
pCmd5->setStateCacheInvalidationEnable(true);
}
template <typename GfxFamily>
inline size_t setGpgpuWalkerThreadData(
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
typename GfxFamily::GPGPU_WALKER *pCmd,
const size_t globalOffsets[3],
const size_t startWorkGroups[3],
@ -132,21 +149,8 @@ inline size_t setGpgpuWalkerThreadData(
return localWorkSize;
}
// Returns how many dimensions a 3-component work size actually spans:
// 3 when the third extent is greater than 1, otherwise 2 when the second
// extent is greater than 1, otherwise 1.
inline cl_uint computeDimensions(const size_t workItems[3]) {
    if (workItems[2] > 1) {
        return 3;
    }
    if (workItems[1] > 1) {
        return 2;
    }
    return 1;
}
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
// Creates an IndirectHeap backed by freshly allocated, page-aligned memory.
// The heap size is obtained by invoking `calc` with the forwarded `args`.
// Both the IndirectHeap (operator new) and its backing store (alignedMalloc)
// are allocated here; releasing them is up to the caller — the release path
// is not visible from this function.
template <typename SizeAndAllocCalcT, typename... CalcArgsT>
IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
    const size_t pageAlignment = MemoryConstants::pageSize;
    const size_t heapSize = calc(std::forward<CalcArgsT>(args)...);
    auto heapMemory = alignedMalloc(heapSize, pageAlignment);
    return new IndirectHeap(heapMemory, heapSize);
}
template <typename GfxFamily>
void dispatchProfilingCommandsStart(
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
@ -173,7 +177,7 @@ void dispatchProfilingCommandsStart(
}
template <typename GfxFamily>
void dispatchProfilingCommandsEnd(
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream) {
@ -196,7 +200,7 @@ void dispatchProfilingCommandsEnd(
}
template <typename GfxFamily>
void dispatchPerfCountersNoopidRegisterCommands(
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
@ -214,7 +218,7 @@ void dispatchPerfCountersNoopidRegisterCommands(
}
template <typename GfxFamily>
void dispatchPerfCountersReadFreqRegisterCommands(
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
@ -232,7 +236,7 @@ void dispatchPerfCountersReadFreqRegisterCommands(
}
template <typename GfxFamily>
void dispatchPerfCountersGeneralPurposeCounterCommands(
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
@ -256,7 +260,7 @@ void dispatchPerfCountersGeneralPurposeCounterCommands(
}
template <typename GfxFamily>
void dispatchPerfCountersUserCounterCommands(
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
@ -297,7 +301,7 @@ void dispatchPerfCountersUserCounterCommands(
}
template <typename GfxFamily>
void dispatchPerfCountersOABufferStateCommands(
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream) {
@ -328,7 +332,7 @@ void dispatchPerfCountersOABufferStateCommands(
}
template <typename GfxFamily>
void dispatchPerfCountersCommandsStart(
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream) {
@ -347,12 +351,12 @@ void dispatchPerfCountersCommandsStart(
pPipeControlCmd->setCommandStreamerStallEnable(true);
//Store value of NOOPID register
dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, true);
//Read Core Frequency
dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, true);
dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, true);
auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT));
*pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
@ -369,13 +373,13 @@ void dispatchPerfCountersCommandsStart(
pPipeControlCmd->setAddress(static_cast<uint32_t>(address & ((uint64_t)UINT32_MAX)));
pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, true);
commandQueue.sendPerfCountersConfig();
}
template <typename GfxFamily>
void dispatchPerfCountersCommandsEnd(
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream) {
@ -394,7 +398,7 @@ void dispatchPerfCountersCommandsEnd(
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
dispatchPerfCountersOABufferStateCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(commandQueue, hwPerfCounter, commandStream);
//Timestamp: Global End
pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
@ -411,21 +415,21 @@ void dispatchPerfCountersCommandsEnd(
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Oa));
pReportPerfCount->setMemoryAddress(address);
dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, false);
//Store value of NOOPID register
dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, false);
//Read Core Frequency
dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, false);
dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, false);
perfCounters->setCpuTimestamp();
}
template <typename GfxFamily>
void dispatchWalker(
void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
cl_uint numEventsInWaitList,
@ -435,7 +439,7 @@ void dispatchWalker(
OCLRT::HwPerfCounter *hwPerfCounter,
PreemptionMode preemptionMode,
bool blockQueue,
unsigned int commandType = 0) {
unsigned int commandType) {
OCLRT::LinearStream *commandStream = nullptr;
OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
@ -586,17 +590,17 @@ void dispatchWalker(
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
// If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
if (hwTimeStamps != nullptr) {
dispatchProfilingCommandsStart<GfxFamily>(*hwTimeStamps, commandStream);
GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
}
if (hwPerfCounter != nullptr) {
dispatchPerfCountersCommandsStart<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
}
}
PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, true);
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, true);
// Program the walker. Invokes execution so all state should already be programmed
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
@ -606,7 +610,7 @@ void dispatchWalker(
size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
@ -627,22 +631,22 @@ void dispatchWalker(
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, false);
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, false);
PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
}
// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
if (hwTimeStamps != nullptr) {
dispatchProfilingCommandsEnd<GfxFamily>(*hwTimeStamps, commandStream);
GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
}
if (hwPerfCounter != nullptr) {
dispatchPerfCountersCommandsEnd<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
}
}
template <typename GfxFamily>
void dispatchWalker(
void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
CommandQueue &commandQueue,
const Kernel &kernel,
cl_uint workDim,
@ -658,12 +662,12 @@ void dispatchWalker(
bool blockQueue) {
DispatchInfo dispatchInfo(const_cast<Kernel *>(&kernel), workDim, workItems, localWorkSizesIn, globalOffsets);
dispatchWalker<GfxFamily>(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList,
blockedCommandsData, hwTimeStamps, hwPerfCounter, preemptionMode, blockQueue);
GpgpuWalkerHelper<GfxFamily>::dispatchWalker(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList,
blockedCommandsData, hwTimeStamps, hwPerfCounter, preemptionMode, blockQueue);
}
template <typename GfxFamily>
void dispatchScheduler(
void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
CommandQueue &commandQueue,
DeviceQueueHw<GfxFamily> &devQueueHw,
PreemptionMode preemptionMode,
@ -752,7 +756,7 @@ void dispatchScheduler(
preemptionMode);
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, true);
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true);
// Program the walker. Invokes execution so all state should already be programmed
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
@ -760,7 +764,7 @@ void dispatchScheduler(
size_t globalOffsets[3] = {0, 0, 0};
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
@ -781,7 +785,7 @@ void dispatchScheduler(
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, false);
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);
// Do not put BB_START only when returning in first Scheduler run
if (devQueueHw.getSchedulerReturnInstance() != 1) {
@ -797,141 +801,13 @@ void dispatchScheduler(
}
}
template <typename GfxFamily, unsigned int eventType>
struct EnqueueOperation {
static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");
static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
if (reserveProfilingCmdsSpace) {
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
if (reservePerfCounters) {
//start cmds
//P_C: flush CS & TimeStamp BEGIN
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//end cmds
//P_C: flush CS & TimeStamp END;
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//OA buffer (status head, tail)
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
Device &device = commandQueue.getDevice();
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
size += sizeof(typename GfxFamily::GPGPU_WALKER);
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(&kernel);
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(device);
}
return size;
}
static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
if (reserveProfilingCmdsSpace) {
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
if (reservePerfCounters) {
//start cmds
//P_C: flush CS & TimeStamp BEGIN
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//end cmds
//P_C: flush CS & TimeStamp END;
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//OA buffer (status head, tail)
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
return size;
}
};
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
auto expectedSizeCS = EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel);
return commandQueue.getCS(expectedSizeCS);
template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
}
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
size_t expectedSizeCS = 0;
Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;
for (auto &dispatchInfo : multiDispatchInfo) {
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
}
if (parentKernel && parentKernel->isParentKernel) {
SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(parentKernel->getContext());
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler);
}
return commandQueue.getCS(expectedSizeCS);
template <typename GfxFamily>
size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
return (size_t)0;
}
// Returns the indirect heap of type heapType that the dispatches in
// multiDispatchInfo should use.
//
// The expected size is taken from the matching
// KernelCommandsHelper::getTotalSizeRequired* query. When the first dispatch
// carries a parent kernel:
//  - for SURFACE_STATE the execution-model size is added on top;
//  - for the other heap types the default device queue's DYNAMIC_STATE heap
//    is returned instead of one of the command queue's heaps.
// In every other case the heap comes from commandQueue.getIndirectHeap().
template <typename GfxFamily, IndirectHeap::Type heapType>
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
    size_t expectedSize = 0;
    IndirectHeap *ih = nullptr;

    // clang-format off
    switch(heapType) {
    case IndirectHeap::DYNAMIC_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
    case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
    case IndirectHeap::SURFACE_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
    }
    // clang-format on

    // Only look at the first dispatch when there is one and it has a kernel;
    // an empty MultiDispatchInfo must not be dereferenced.
    const Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;

    if (parentKernel && parentKernel->isParentKernel) {
        if (heapType == IndirectHeap::SURFACE_STATE) {
            expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(*parentKernel);
        } else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
        {
            DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
            DEBUG_BREAK_IF(pDevQueue == nullptr);
            ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        }
    }

    if (ih == nullptr) {
        ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
    }

    return *ih;
}
} // namespace OCLRT

View File

@ -32,7 +32,7 @@
#include "runtime/memory_manager/memory_manager.h"
#include "runtime/os_interface/debug_settings_manager.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "command_stream_receiver_hw.h"
namespace OCLRT {

View File

@ -24,7 +24,7 @@
#include "runtime/built_ins/sip.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/device/device.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/memory_manager/graphics_allocation.h"
namespace OCLRT {

View File

@ -22,8 +22,7 @@
#pragma once
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/preamble.h"
#include "runtime/helpers/string.h"
@ -217,7 +216,7 @@ void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKer
offset = slbCS.getUsed();
igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed());
applyWADisableLSQCROPERFforOCL<GfxFamily>(&slbCS, *parentKernel, true);
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true);
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
@ -388,10 +387,10 @@ size_t DeviceQueueHw<GfxFamily>::setSchedulerCrossThreadData(SchedulerKernel &sc
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) {
OCLRT::dispatchScheduler<GfxFamily>(cmdQ,
*this,
preemptionMode,
scheduler);
GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(cmdQ,
*this,
preemptionMode,
scheduler);
return;
}

View File

@ -37,6 +37,7 @@ set(RUNTIME_SRCS_GENX_BASE
device_enqueue.h
device_queue.cpp
command_stream_receiver_hw.cpp
gpgpu_walker.cpp
hw_cmds.h
hw_cmds_generated.h
hw_helper.cpp

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -23,8 +23,6 @@
#include "runtime/memory_manager/svm_memory_manager.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_queue/command_queue_hw.inl"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_queue/dispatch_walker_helper.inl"
namespace OCLRT {
@ -37,43 +35,4 @@ void populateFactoryTable<CommandQueueHw<Family>>() {
commandQueueFactory[gfxCore] = CommandQueueHw<Family>::create;
}
// Family specialization of the DisableLSQCROPERFforOCL workaround.
// Does nothing unless the kernel uses fences for read/write images.
// disablePerfMode == true  : ORs L3SQC_BIT_LQSC_RO_PERF_DIS into L3SQC_REG4.
// disablePerfMode == false : emits a stalling PIPE_CONTROL, then ANDs the bit
//                            back out of L3SQC_REG4.
template <>
void applyWADisableLSQCROPERFforOCL<Family>(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
    if (!kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        return;
    }

    if (disablePerfMode) {
        // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
        addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
        return;
    }

    // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
    using PIPE_CONTROL = Family::PIPE_CONTROL;
    auto stallCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
    *stallCmd = PIPE_CONTROL::sInit();
    stallCmd->setCommandStreamerStallEnable(true);

    // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
    addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
}
// Family specialization: command-stream bytes to reserve for the
// DisableLSQCROPERFforOCL workaround. Returns 0 when pKernel is null or the
// kernel does not use fences for read/write images.
template <>
size_t getSizeForWADisableLSQCROPERFforOCL<Family>(const Kernel *pKernel) {
    using MI_LOAD_REGISTER_REG = Family::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = Family::MI_LOAD_REGISTER_IMM;
    using PIPE_CONTROL = Family::PIPE_CONTROL;
    using MI_MATH = Family::MI_MATH;
    using MI_MATH_ALU_INST_INLINE = Family::MI_MATH_ALU_INST_INLINE;

    if (pKernel == nullptr) {
        return 0;
    }
    if (!pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        return 0;
    }

    // Commands counted for a single register read-modify-write sequence.
    const size_t singleWaSize = 2 * sizeof(MI_LOAD_REGISTER_REG) +
                                sizeof(MI_LOAD_REGISTER_IMM) +
                                sizeof(PIPE_CONTROL) +
                                sizeof(MI_MATH) +
                                NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);

    // One extra PIPE_CONTROL plus the sequence twice
    // (for 2 WADisableLSQCROPERFforOCL WAs).
    return sizeof(PIPE_CONTROL) + singleWaSize * 2;
}
} // namespace OCLRT

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/gen8/hw_info.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_queue/gpgpu_walker.inl"
namespace OCLRT {
// BDW specialization of the DisableLSQCROPERFforOCL workaround.
// Does nothing unless the kernel uses fences for read/write images.
// disablePerfMode == true  : ORs L3SQC_BIT_LQSC_RO_PERF_DIS into L3SQC_REG4.
// disablePerfMode == false : emits a stalling PIPE_CONTROL, then ANDs the bit
//                            back out of L3SQC_REG4.
template <>
void GpgpuWalkerHelper<BDWFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
    if (!kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        return;
    }

    if (disablePerfMode) {
        // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
        GpgpuWalkerHelper<BDWFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
        return;
    }

    // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
    using PIPE_CONTROL = BDWFamily::PIPE_CONTROL;
    auto stallCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
    *stallCmd = PIPE_CONTROL::sInit();
    stallCmd->setCommandStreamerStallEnable(true);

    // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
    GpgpuWalkerHelper<BDWFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
}
// BDW specialization: command-stream bytes to reserve for the
// DisableLSQCROPERFforOCL workaround. Returns 0 when pKernel is null or the
// kernel does not use fences for read/write images.
template <>
size_t GpgpuWalkerHelper<BDWFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
    using MI_LOAD_REGISTER_REG = BDWFamily::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = BDWFamily::MI_LOAD_REGISTER_IMM;
    using PIPE_CONTROL = BDWFamily::PIPE_CONTROL;
    using MI_MATH = BDWFamily::MI_MATH;
    using MI_MATH_ALU_INST_INLINE = BDWFamily::MI_MATH_ALU_INST_INLINE;

    if (pKernel == nullptr) {
        return 0;
    }
    if (!pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        return 0;
    }

    // Commands counted for a single register read-modify-write sequence.
    const size_t singleWaSize = 2 * sizeof(MI_LOAD_REGISTER_REG) +
                                sizeof(MI_LOAD_REGISTER_IMM) +
                                sizeof(PIPE_CONTROL) +
                                sizeof(MI_MATH) +
                                NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);

    // One extra PIPE_CONTROL plus the sequence twice
    // (for 2 WADisableLSQCROPERFforOCL WAs).
    return sizeof(PIPE_CONTROL) + singleWaSize * 2;
}
template class GpgpuWalkerHelper<BDWFamily>;
} // namespace OCLRT

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -23,8 +23,6 @@
#include "runtime/memory_manager/svm_memory_manager.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_queue/command_queue_hw.inl"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_queue/dispatch_walker_helper.inl"
namespace OCLRT {
@ -37,43 +35,4 @@ void populateFactoryTable<CommandQueueHw<Family>>() {
commandQueueFactory[gfxCore] = CommandQueueHw<Family>::create;
}
// Family specialization of the DisableLSQCROPERFforOCL workaround.
// Does nothing unless the kernel uses fences for read/write images.
// disablePerfMode == true  : ORs L3SQC_BIT_LQSC_RO_PERF_DIS into L3SQC_REG4.
// disablePerfMode == false : emits a stalling PIPE_CONTROL, then ANDs the bit
//                            back out of L3SQC_REG4.
template <>
void applyWADisableLSQCROPERFforOCL<Family>(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
    if (!kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        return;
    }

    if (disablePerfMode) {
        // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
        addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
        return;
    }

    // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
    using PIPE_CONTROL = Family::PIPE_CONTROL;
    auto stallCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
    *stallCmd = PIPE_CONTROL::sInit();
    stallCmd->setCommandStreamerStallEnable(true);

    // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
    addAluReadModifyWriteRegister<Family>(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
}
// Family specialization: command-stream bytes to reserve for the
// DisableLSQCROPERFforOCL workaround. Returns 0 when pKernel is null or the
// kernel does not use fences for read/write images.
template <>
size_t getSizeForWADisableLSQCROPERFforOCL<Family>(const Kernel *pKernel) {
    using MI_LOAD_REGISTER_REG = Family::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = Family::MI_LOAD_REGISTER_IMM;
    using PIPE_CONTROL = Family::PIPE_CONTROL;
    using MI_MATH = Family::MI_MATH;
    using MI_MATH_ALU_INST_INLINE = Family::MI_MATH_ALU_INST_INLINE;

    if (pKernel == nullptr) {
        return 0;
    }
    if (!pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        return 0;
    }

    // Commands counted for a single register read-modify-write sequence.
    const size_t singleWaSize = 2 * sizeof(MI_LOAD_REGISTER_REG) +
                                sizeof(MI_LOAD_REGISTER_IMM) +
                                sizeof(PIPE_CONTROL) +
                                sizeof(MI_MATH) +
                                NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);

    // One extra PIPE_CONTROL plus the sequence twice
    // (for 2 WADisableLSQCROPERFforOCL WAs).
    return sizeof(PIPE_CONTROL) + singleWaSize * 2;
}
} // namespace OCLRT

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/gen9/hw_cmds_base.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_queue/gpgpu_walker.inl"
namespace OCLRT {
// SKL specialization of the DisableLSQCROPERFforOCL workaround.
// Does nothing unless the kernel uses fences for read/write images.
// disablePerfMode == true  : ORs L3SQC_BIT_LQSC_RO_PERF_DIS into L3SQC_REG4.
// disablePerfMode == false : emits a stalling PIPE_CONTROL, then ANDs the bit
//                            back out of L3SQC_REG4.
template <>
void GpgpuWalkerHelper<SKLFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
    if (!kernel.getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        return;
    }

    if (disablePerfMode) {
        // Set bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
        GpgpuWalkerHelper<SKLFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_OR, L3SQC_BIT_LQSC_RO_PERF_DIS);
        return;
    }

    // Add PIPE_CONTROL with CS_Stall to wait till GPU finishes its work
    using PIPE_CONTROL = SKLFamily::PIPE_CONTROL;
    auto stallCmd = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
    *stallCmd = PIPE_CONTROL::sInit();
    stallCmd->setCommandStreamerStallEnable(true);

    // Clear bit L3SQC_BIT_LQSC_RO_PERF_DIS in L3SQC_REG4
    GpgpuWalkerHelper<SKLFamily>::addAluReadModifyWriteRegister(pCommandStream, L3SQC_REG4, ALU_OPCODE_AND, ~L3SQC_BIT_LQSC_RO_PERF_DIS);
}
// Returns the number of command-stream bytes accounted for the WADisableLSQCROPERFforOCL
// workaround on SKL. The result is 0 when pKernel is null or the kernel is not flagged as
// using fences for read/write images; otherwise it is one PIPE_CONTROL plus two copies of
// the per-workaround command sequence.
template <>
size_t GpgpuWalkerHelper<SKLFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
    using MI_LOAD_REGISTER_REG = typename SKLFamily::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = typename SKLFamily::MI_LOAD_REGISTER_IMM;
    using PIPE_CONTROL = typename SKLFamily::PIPE_CONTROL;
    using MI_MATH = typename SKLFamily::MI_MATH;
    using MI_MATH_ALU_INST_INLINE = typename SKLFamily::MI_MATH_ALU_INST_INLINE;

    if (pKernel == nullptr) {
        return 0;
    }
    if (!pKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
        return 0;
    }

    // Commands counted for a single application of the workaround.
    const size_t singleWaSize = 2 * sizeof(MI_LOAD_REGISTER_REG) +
                                sizeof(MI_LOAD_REGISTER_IMM) +
                                sizeof(PIPE_CONTROL) +
                                sizeof(MI_MATH) +
                                NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);

    // For 2 WADisableLSQCROPERFforOCL WAs, plus the leading PIPE_CONTROL
    return sizeof(PIPE_CONTROL) + singleWaSize * 2;
}
template class GpgpuWalkerHelper<SKLFamily>;
} // namespace OCLRT

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -24,7 +24,7 @@
#include "runtime/helpers/dispatch_info.h"
#include "runtime/kernel/kernel.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
namespace OCLRT {
@ -67,7 +67,7 @@ enum class RegionCoordZ : uint32_t {
Middle = 1,
Back = 2
};
}
} // namespace SplitDispatch
// Compute power in compile time
static constexpr uint32_t powConst(uint32_t base, uint32_t currExp) {
@ -453,4 +453,4 @@ class DispatchInfoBuilder {
return x % y ? 1 : 0;
}
};
}
} // namespace OCLRT

View File

@ -21,13 +21,14 @@
*/
#include "test.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/event/perf_counter.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "unit_tests/fixtures/device_fixture.h"
#include "unit_tests/command_queue/command_queue_fixture.h"
#include "unit_tests/libult/mock_gfx_family.h"
#include "unit_tests/helpers/hw_parse.h"
#include "unit_tests/helpers/debug_manager_state_restore.h"
#include "unit_tests/mocks/mock_kernel.h"
@ -137,7 +138,7 @@ HWTEST_F(DispatchWalkerTest, shouldntChangeCommandStreamMemory) {
size_t globalOffsets[3] = {0, 0, 0};
size_t workItems[3] = {1, 1, 1};
cl_uint dimensions = 1;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -185,7 +186,7 @@ HWTEST_F(DispatchWalkerTest, noLocalIdsShouldntCrash) {
size_t globalOffsets[3] = {0, 0, 0};
size_t workItems[3] = {1, 1, 1};
cl_uint dimensions = 1;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -214,7 +215,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithDefaultLwsAlgorithm)
size_t workItems[3] = {1, 1, 1};
for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
workItems[dimension - 1] = 256;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimension,
@ -244,7 +245,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithSquaredLwsAlgorithm)
size_t workItems[3] = {1, 1, 1};
for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
workItems[dimension - 1] = 256;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimension,
@ -273,7 +274,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithNDLwsAlgorithm) {
size_t workItems[3] = {1, 1, 1};
for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
workItems[dimension - 1] = 256;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimension,
@ -303,7 +304,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithOldLwsAlgorithm) {
size_t workItems[3] = {1, 1, 1};
for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
workItems[dimension - 1] = 256;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimension,
@ -332,7 +333,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNumWorkGroups) {
size_t workItems[3] = {2, 5, 10};
size_t workGroupSize[3] = {1, 1, 1};
cl_uint dimensions = 3;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -363,7 +364,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeND) {
size_t globalOffsets[3] = {0, 0, 0};
size_t workItems[3] = {2, 5, 10};
cl_uint dimensions = 3;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -394,7 +395,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeND) {
size_t globalOffsets[3] = {0, 0, 0};
size_t workItems[3] = {2, 5, 10};
cl_uint dimensions = 3;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -426,7 +427,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeSquared) {
size_t globalOffsets[3] = {0, 0, 0};
size_t workItems[3] = {2, 5, 10};
cl_uint dimensions = 3;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -458,7 +459,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeSquaredAn
size_t globalOffsets[3] = {0, 0, 0};
size_t workItems[3] = {2, 5, 10};
cl_uint dimensions = 3;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -488,7 +489,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSize) {
size_t workItems[3] = {2, 5, 10};
size_t workGroupSize[3] = {1, 2, 3};
cl_uint dimensions = 3;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -521,7 +522,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizes) {
size_t workItems[3] = {2, 5, 10};
size_t workGroupSize[3] = {1, 2, 3};
cl_uint dimensions = 3;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -561,7 +562,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizeForSplitKernel) {
MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
multiDispatchInfo,
0,
@ -604,7 +605,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterLocalWorkSizesForSplitWalker) {
MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
multiDispatchInfo,
0,
@ -646,7 +647,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerDoesntConsumeCommandStreamWhenQueueIs
KernelOperation *blockedCommandsData = nullptr;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -686,7 +687,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromKernelW
KernelOperation *blockedCommandsData = nullptr;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
kernel,
dimensions,
@ -727,7 +728,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromMdiWhen
KernelOperation *blockedCommandsData = nullptr;
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
multiDispatchInfo,
0,
@ -759,7 +760,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfo) {
MockMultiDispatchInfo multiDispatchInfo(std::vector<Kernel *>({&kernel1, &kernel2}));
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
multiDispatchInfo,
0,
@ -800,7 +801,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoCorrectlyProg
indirectHeap.align(KernelCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
auto dshBeforeMultiDisptach = indirectHeap.getUsed();
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
multiDispatchInfo,
0,
@ -884,7 +885,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoCorrectlyProg
// create commandStream
auto &cmdStream = pCmdQ->getCS(0);
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
multiDispatchInfo,
0,
@ -929,7 +930,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoAndDifferentK
// create commandStream
auto &cmdStream = pCmdQ->getCS(0);
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
multiDispatchInfo,
0,
@ -979,7 +980,7 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerWithMultipleDispatchInfoButSameKernel
// create commandStream
auto &cmdStream = pCmdQ->getCS(0);
dispatchWalker<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchWalker(
*pCmdQ,
multiDispatchInfo,
0,
@ -1030,7 +1031,7 @@ HWTEST_F(DispatchWalkerTest, givenMultiDispatchWhenWhitelistedRegisterForCoheren
DispatchInfo di2(&kernel, 1, Vec3<size_t>(1, 1, 1), Vec3<size_t>(1, 1, 1), Vec3<size_t>(0, 0, 0));
MockMultiDispatchInfo multiDispatchInfo(std::vector<DispatchInfo *>({&di1, &di2}));
dispatchWalker<FamilyType>(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false);
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ, multiDispatchInfo, 0, nullptr, nullptr, nullptr, nullptr, pDevice->getPreemptionMode(), false);
hwParser.parseCommands<FamilyType>(cmdStream, 0);
@ -1056,3 +1057,15 @@ TEST(DispatchWalker, calculateDispatchDim) {
}
}
}
// Exercises the WADisableLSQCROPERFforOCL helpers for the GENX family and checks that they
// have no visible effect: applying the workaround must not consume command-stream space and
// the reported workaround size must be zero.
// NOTE(review): relies on GENX using the default (non-specialized) helper implementations,
// which the test name and the zero expected size imply — confirm against gpgpu_walker.inl.
HWTEST_F(DispatchWalkerTest, WhenCallingDefaultWaMethodsThenExpectNothing) {
    auto &cmdStream = pCmdQ->getCS(0);
    MockKernel kernel(&program, kernelInfo, *pDevice);
    EXPECT_EQ(CL_SUCCESS, kernel.initialize());

    // Snapshot the stream usage so that any command emitted by the WA is detected.
    const size_t usedBeforeWa = cmdStream.getUsed();
    GpgpuWalkerHelper<GENX>::applyWADisableLSQCROPERFforOCL(&cmdStream, kernel, false);
    EXPECT_EQ(usedBeforeWa, cmdStream.getUsed());

    size_t expectedSize = 0;
    size_t actualSize = GpgpuWalkerHelper<GENX>::getSizeForWADisableLSQCROPERFforOCL(&kernel);
    EXPECT_EQ(expectedSize, actualSize);
}

View File

@ -20,7 +20,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_queue/enqueue_fill_buffer.h"
#include "runtime/command_queue/enqueue_kernel.h"
#include "runtime/command_queue/enqueue_read_buffer.h"
@ -43,8 +43,8 @@ struct GetSizeRequiredBufferTest : public CommandEnqueueFixture,
public HelloWorldKernelFixture,
public ::testing::Test {
using SimpleArgKernelFixture::SetUp;
using HelloWorldKernelFixture::SetUp;
using SimpleArgKernelFixture::SetUp;
GetSizeRequiredBufferTest() {
}

View File

@ -22,7 +22,7 @@
#include "runtime/built_ins/built_ins.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_queue/enqueue_copy_image.h"
#include "runtime/command_queue/enqueue_fill_image.h"
#include "runtime/command_queue/enqueue_read_image.h"

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -20,7 +20,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/helpers/options.h"
#include "unit_tests/mocks/mock_kernel.h"
#include "unit_tests/mocks/mock_device.h"

View File

@ -21,7 +21,7 @@
*/
#include "hw_cmds.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "unit_tests/fixtures/device_fixture.h"
#include "unit_tests/helpers/debug_manager_state_restore.h"
#include "test.h"
@ -109,7 +109,7 @@ struct WorkGroupSizeBase : public DeviceFixture {
(workItems[0] + workGroupSize[0] - 1) / workGroupSize[0],
(workItems[1] + workGroupSize[1] - 1) / workGroupSize[1],
(workItems[2] + workGroupSize[2] - 1) / workGroupSize[2]};
setGpgpuWalkerThreadData<FamilyType>(&pCmd, globalOffsets, workGroupsStart, workGroupsNum, workGroupSize, simdSize);
GpgpuWalkerHelper<FamilyType>::setGpgpuWalkerThreadData(&pCmd, globalOffsets, workGroupsStart, workGroupsNum, workGroupSize, simdSize);
//And check if it is programmed correctly
auto numWorkItems = computeWalkerWorkItems<FamilyType>(pCmd);

View File

@ -52,7 +52,7 @@
#include "gtest/gtest.h"
#include "runtime/utilities/linux/debug_env_reader.h"
#include "runtime/gmm_helper/gmm_helper.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
using namespace OCLRT;

View File

@ -21,7 +21,7 @@
*/
#pragma once
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/context/context.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/options.h"

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -31,7 +31,7 @@
#include "unit_tests/mocks/mock_kernel.h"
#include "unit_tests/helpers/debug_manager_state_restore.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/helpers/kernel_commands.h"
#include <memory>
@ -330,7 +330,7 @@ HWTEST_F(DeviceQueueSlb, cleanupSection) {
if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages) {
cleanupSectionOffsetToParse += getSizeForWADisableLSQCROPERFforOCL<FamilyType>(mockParentKernel) / 2;
cleanupSectionOffsetToParse += GpgpuWalkerHelper<FamilyType>::getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) / 2;
}
hwParser.parseCommands<FamilyType>(*slbCS, cleanupSectionOffsetToParse);
@ -394,7 +394,7 @@ HWTEST_F(DeviceQueueSlb, AddEMCleanupSectionWithProfiling) {
auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages && getSizeForWADisableLSQCROPERFforOCL<FamilyType>(mockParentKernel) > 0) {
if (mockParentKernel->getKernelInfo().patchInfo.executionEnvironment->UsesFencesForReadWriteImages && GpgpuWalkerHelper<FamilyType>::getSizeForWADisableLSQCROPERFforOCL(mockParentKernel) > 0) {
auto loadRegImmItor = find<MI_LOAD_REGISTER_IMM *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
EXPECT_NE(hwParser.cmdList.end(), loadRegImmItor);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -20,7 +20,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_queue/local_id_gen.h"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/helpers/per_thread_data.h"

View File

@ -53,19 +53,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDev
size_t executionModelDSHUsedBefore = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE)->getUsed();
dispatchWalker<FamilyType>(*pCmdQ,
*pKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false);
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
*pKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false);
size_t dshUsedAfter = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE).getUsed();
EXPECT_EQ(0u, dshUsedAfter);
@ -109,19 +109,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDef
auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT);
dispatchWalker<FamilyType>(*pCmdQ,
*pKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false);
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
*pKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false);
auto iohUsed = ioh.getUsed();
EXPECT_EQ(0u, iohUsed);
@ -136,19 +136,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenSSH
MockMultiDispatchInfo multiDispatchInfo(pKernel);
dispatchWalker<FamilyType>(*pCmdQ,
*pKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false);
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
*pKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false);
auto &ssh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE);
@ -172,19 +172,19 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsBlockedThenSSHSiz
MockMultiDispatchInfo multiDispatchInfo(pKernel);
dispatchWalker<FamilyType>(*pCmdQ,
*pKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
true); // blockQueue
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
*pKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
true); // blockQueue
ASSERT_NE(nullptr, blockedCommandsData);
size_t minRequiredSize = KernelCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
@ -269,19 +269,19 @@ HWTEST_F(MockParentKernelDispatch, GivenBlockedQueueWhenParentKernelIsDispatched
const size_t globalOffsets[3] = {0, 0, 0};
const size_t workItems[3] = {1, 1, 1};
dispatchWalker<FamilyType>(*pCmdQ,
*mockParentKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
true); // blockQueue
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
*mockParentKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
true); // blockQueue
ASSERT_NE(nullptr, blockedCommandsData);
@ -302,19 +302,19 @@ HWTEST_F(MockParentKernelDispatch, GivenParentKernelWhenDispatchedThenMediaInter
const size_t globalOffsets[3] = {0, 0, 0};
const size_t workItems[3] = {1, 1, 1};
dispatchWalker<FamilyType>(*pCmdQ,
*mockParentKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false); // blockQueue
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
*mockParentKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false); // blockQueue
LinearStream *commandStream = &pCmdQ->getCS(0);
@ -358,19 +358,19 @@ HWTEST_F(MockParentKernelDispatch, GivenUsedSSHHeapWhenParentKernelIsDispatchedT
// If parent is not using SSH, then heap obtained has zero usage and the same buffer
ASSERT_EQ(0u, mockParentKernel->getKernelInfo().heapInfo.pKernelHeader->SurfaceStateHeapSize);
dispatchWalker<FamilyType>(*pCmdQ,
*mockParentKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false); // blockQueue
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
*mockParentKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false); // blockQueue
EXPECT_EQ(0u, ssh.getUsed());
@ -393,19 +393,19 @@ HWTEST_F(MockParentKernelDispatch, GivenNotUsedSSHHeapWhenParentKernelIsDispatch
auto *bufferMemory = ssh.getCpuBase();
dispatchWalker<FamilyType>(*pCmdQ,
*mockParentKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false); // blockQueue
GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
*mockParentKernel,
1,
globalOffsets,
workItems,
nullptr,
0,
nullptr,
&blockedCommandsData,
nullptr,
nullptr,
pDevice->getPreemptionMode(),
false); // blockQueue
EXPECT_EQ(bufferMemory, ssh.getCpuBase());

View File

@ -72,7 +72,7 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchScheduler) {
LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
dispatchScheduler<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
*pCmdQ,
*pDevQueueHw,
pDevice->getPreemptionMode(),
@ -188,7 +188,7 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchSchedulerDoesNotUseStandardCmdQ
getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
dispatchScheduler<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
*pCmdQ,
*pDevQueueHw,
pDevice->getPreemptionMode(),
@ -219,7 +219,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, dispatchSchedulerWithEarlyReturnSetToF
LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
dispatchScheduler<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
*pCmdQ,
mockDevQueue,
device->getPreemptionMode(),

View File

@ -22,7 +22,7 @@
#include "runtime/built_ins/built_ins.h"
#include "runtime/command_queue/enqueue_kernel.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/device_queue/device_queue.h"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/helpers/kernel_commands.h"
@ -51,7 +51,7 @@ BDWTEST_F(BdwSchedulerTest, givenCallToDispatchSchedulerWhenPipeControlWithCSSta
LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);
dispatchScheduler<FamilyType>(
GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
*pCmdQ,
*pDevQueueHw,
pDevice->getPreemptionMode(),

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -21,7 +21,7 @@
*/
#include "runtime/context/context.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_queue/gpgpu_walker.h"
#include "unit_tests/fixtures/device_host_queue_fixture.h"
#include "unit_tests/helpers/hw_parse.h"
#include "unit_tests/mocks/mock_device_queue.h"

View File

@ -21,12 +21,23 @@
*/
#include "unit_tests/libult/mock_gfx_family.h"
#include "runtime/command_queue/gpgpu_walker.inl"
#include "runtime/command_stream/preemption.inl"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/device_queue/device_queue_hw.inl"
#include "runtime/helpers/hw_helper.inl"
#include "runtime/helpers/kernel_commands.inl"
#include "runtime/helpers/preamble.inl"
namespace OCLRT {
// Out-of-class definitions of GENX's static data members.
// isSimulationFcn starts out as a null function pointer.
bool (*GENX::isSimulationFcn)(unsigned short) = nullptr;
// Each cmdInit* object is initialized from the sInit() of its command type.
GENX::GPGPU_WALKER GENX::cmdInitGpgpuWalker = GENX::GPGPU_WALKER::sInit();
GENX::INTERFACE_DESCRIPTOR_DATA GENX::cmdInitInterfaceDescriptorData = GENX::INTERFACE_DESCRIPTOR_DATA::sInit();
GENX::MEDIA_STATE_FLUSH GENX::cmdInitMediaStateFlush = GENX::MEDIA_STATE_FLUSH::sInit();
GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD GENX::cmdInitMediaInterfaceDescriptorLoad = GENX::MEDIA_INTERFACE_DESCRIPTOR_LOAD::sInit();
template <>
size_t HwHelperHw<GENX>::getMaxBarrierRegisterPerSlice() const {
return 32;
@ -57,4 +68,89 @@ struct hw_helper_static_init {
template class HwHelperHw<GENX>;
hw_helper_static_init si;
template class GpgpuWalkerHelper<GENX>;
// GENX specialization: always reports that the pipe-control workaround is not required.
template <>
bool KernelCommandsHelper<GENX>::isPipeControlWArequired() {
return false;
}
template struct KernelCommandsHelper<GENX>;
// GENX specializations of the PreemptionHelper entry points.
// Every size query returns 0 and every programming routine has an empty body.

// Reports zero command-stream bytes for any preemption mode transition.
template <>
size_t PreemptionHelper::getRequiredCmdStreamSize<GENX>(PreemptionMode newPreemptionMode, PreemptionMode oldPreemptionMode) {
return 0;
}
// Empty body: nothing is written to cmdStream.
template <>
void PreemptionHelper::programCmdStream<GENX>(LinearStream &cmdStream, PreemptionMode newPreemptionMode, PreemptionMode oldPreemptionMode,
GraphicsAllocation *preemptionCsr, Device &device) {
}
// Reports zero preamble bytes.
template <>
size_t PreemptionHelper::getRequiredPreambleSize<GENX>(const Device &device) {
return 0;
}
// Empty body: nothing is written to preambleCmdStream.
template <>
void PreemptionHelper::programPreamble<GENX>(LinearStream &preambleCmdStream, Device &device,
const GraphicsAllocation *preemptionCsr) {
}
// Reports zero bytes for the preemption workaround command stream.
template <>
size_t PreemptionHelper::getPreemptionWaCsSize<GENX>(const Device &device) {
return 0;
}
template void PreemptionHelper::programInterfaceDescriptorDataPreemption<GENX>(INTERFACE_DESCRIPTOR_DATA<GENX> *idd, PreemptionMode preemptionMode);
// GENX specialization: reports a workaround-command size of zero.
template <>
size_t DeviceQueueHw<GENX>::getWaCommandsSize() {
    return static_cast<size_t>(0);
}
// GENX specializations of the DeviceQueueHw workaround/profiling command emitters.
// All bodies are empty, so none of them adds any command.

// Empty body: no arbitration-check workaround command is added.
template <>
void DeviceQueueHw<GENX>::addArbCheckCmdWa() {
}
// Empty body: atomicOpPlaceholder is ignored.
template <>
void DeviceQueueHw<GENX>::addMiAtomicCmdWa(uint64_t atomicOpPlaceholder) {
}
// Empty body: setArbCheck is ignored.
template <>
void DeviceQueueHw<GENX>::addLriCmdWa(bool setArbCheck) {
}
// Empty body: isNoopCmd is ignored.
template <>
void DeviceQueueHw<GENX>::addPipeControlCmdWa(bool isNoopCmd) {
}
// Empty body: timestampAddress is ignored.
template <>
void DeviceQueueHw<GENX>::addProfilingEndCmds(uint64_t timestampAddress) {
}
template class DeviceQueueHw<GENX>;
// GENX specialization with an empty body: nothing is written to pCommandStream.
template <>
void PreambleHelper<GENX>::addPipeControlBeforeVfeCmd(LinearStream *pCommandStream, const HardwareInfo *hwInfo) {
}
// GENX specialization: the L3 configuration value is always 0, regardless of hwInfo or useSLM.
template <>
uint32_t PreambleHelper<GENX>::getL3Config(const HardwareInfo &hwInfo, bool useSLM) {
    return 0u;
}
// GENX specialization with an empty body: nothing is written to pCommandStream and
// mediaSamplerRequired is ignored.
template <>
void PreambleHelper<GENX>::programPipelineSelect(LinearStream *pCommandStream, bool mediaSamplerRequired) {
}
// GENX specialization of the L3CNTLRegisterOffset trait; registerOffset is fixed at 0x7034.
template <>
struct L3CNTLRegisterOffset<GENX> {
static const uint32_t registerOffset = 0x7034;
};
template struct PreambleHelper<GENX>;
} // namespace OCLRT

View File

@ -31,9 +31,71 @@ extern HwHelper *hwHelperFactory[IGFX_MAX_CORE];
struct GENX {
static bool (*isSimulationFcn)(unsigned short);
// INTERFACE_DESCRIPTOR_DATA stand-in for the GENX family. It declares the nested enums and
// the setters, but holds no data: sInit() returns a default object and every setter is a no-op.
typedef struct tagINTERFACE_DESCRIPTOR_DATA {
    typedef enum tagDENORM_MODE {
        DENORM_MODE_FTZ = 0x0,
        DENORM_MODE_SETBYKERNEL = 0x1,
    } DENORM_MODE;
    typedef enum tagSAMPLERSTATEPOINTER {
        SAMPLERSTATEPOINTER_BIT_SHIFT = 0x5,
        SAMPLERSTATEPOINTER_ALIGN_SIZE = 0x20,
    } SAMPLERSTATEPOINTER;
    typedef enum tagSAMPLER_COUNT {
        SAMPLER_COUNT_NO_SAMPLERS_USED = 0x0,
        SAMPLER_COUNT_BETWEEN_1_AND_4_SAMPLERS_USED = 0x1,
        SAMPLER_COUNT_BETWEEN_5_AND_8_SAMPLERS_USED = 0x2,
        SAMPLER_COUNT_BETWEEN_9_AND_12_SAMPLERS_USED = 0x3,
        SAMPLER_COUNT_BETWEEN_13_AND_16_SAMPLERS_USED = 0x4,
    } SAMPLER_COUNT;
    typedef enum tagSHARED_LOCAL_MEMORY_SIZE {
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K = 0x0,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K = 0x1,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K = 0x2,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K = 0x3,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K = 0x4,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K = 0x5,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K = 0x6,
        SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K = 0x7,
    } SHARED_LOCAL_MEMORY_SIZE;
    typedef enum tagBINDINGTABLEPOINTER {
        BINDINGTABLEPOINTER_BIT_SHIFT = 0x5,
        BINDINGTABLEPOINTER_ALIGN_SIZE = 0x20,
    } BINDINGTABLEPOINTER;

    // Returns a default-constructed descriptor.
    static tagINTERFACE_DESCRIPTOR_DATA sInit() {
        tagINTERFACE_DESCRIPTOR_DATA initial;
        return initial;
    }

    // Setters: accepted and ignored.
    void setKernelStartPointerHigh(const uint32_t /*value*/) {}
    void setKernelStartPointer(const uint64_t /*value*/) {}
    void setNumberOfThreadsInGpgpuThreadGroup(const uint32_t /*value*/) {}
    void setCrossThreadConstantDataReadLength(const uint32_t /*value*/) {}
    void setDenormMode(const DENORM_MODE /*value*/) {}
    void setConstantIndirectUrbEntryReadLength(const uint32_t /*value*/) {}
    void setBindingTablePointer(const uint64_t /*value*/) {}
    void setSamplerStatePointer(const uint64_t /*value*/) {}
    void setSamplerCount(const SAMPLER_COUNT /*value*/) {}
    void setSharedLocalMemorySize(const SHARED_LOCAL_MEMORY_SIZE /*value*/) {}
    void setBarrierEnable(const bool /*value*/) {}
} INTERFACE_DESCRIPTOR_DATA;
typedef struct tagBINDING_TABLE_STATE {
inline void init(void) {
}
inline uint32_t getSurfaceStatePointer(void) const {
return 0u;
}
inline void setSurfaceStatePointer(const uint64_t value) {
}
inline uint32_t getRawData(const uint32_t index) {
return 0;
}
@ -42,6 +104,247 @@ struct GENX {
SURFACESTATEPOINTER_ALIGN_SIZE = 0x40,
} SURFACESTATEPOINTER;
} BINDING_TABLE_STATE;
// GPGPU_WALKER stand-in for the GENX family. It declares the nested enums and the setters,
// but holds no data: sInit() returns a default object and every setter is a no-op.
typedef struct tagGPGPU_WALKER {
    typedef enum tagSIMD_SIZE {
        SIMD_SIZE_SIMD8 = 0x0,
        SIMD_SIZE_SIMD16 = 0x1,
        SIMD_SIZE_SIMD32 = 0x2,
    } SIMD_SIZE;
    typedef enum tagINDIRECTDATASTARTADDRESS {
        INDIRECTDATASTARTADDRESS_BIT_SHIFT = 0x6,
        INDIRECTDATASTARTADDRESS_ALIGN_SIZE = 0x40,
    } INDIRECTDATASTARTADDRESS;

    // Returns a default-constructed walker command.
    static tagGPGPU_WALKER sInit() {
        tagGPGPU_WALKER initial;
        return initial;
    }

    // Setters: accepted and ignored.
    void setThreadWidthCounterMaximum(const uint32_t /*value*/) {}
    void setThreadGroupIdXDimension(const uint32_t /*value*/) {}
    void setThreadGroupIdYDimension(const uint32_t /*value*/) {}
    void setThreadGroupIdZDimension(const uint32_t /*value*/) {}
    void setRightExecutionMask(const uint32_t /*value*/) {}
    void setBottomExecutionMask(const uint32_t /*value*/) {}
    void setSimdSize(const SIMD_SIZE /*value*/) {}
    void setThreadGroupIdStartingX(const uint32_t /*value*/) {}
    void setThreadGroupIdStartingY(const uint32_t /*value*/) {}
    void setThreadGroupIdStartingResumeZ(const uint32_t /*value*/) {}
    void setIndirectDataStartAddress(const uint32_t /*value*/) {}
    void setInterfaceDescriptorOffset(const uint32_t /*value*/) {}
    void setIndirectDataLength(const uint32_t /*value*/) {}
} GPGPU_WALKER;
// PIPE_CONTROL stand-in for the GENX family. It declares the POST_SYNC_OPERATION enum and
// the setters, but holds no data: sInit() returns a default object and every setter is a no-op.
typedef struct tagPIPE_CONTROL {
    typedef enum tagPOST_SYNC_OPERATION {
        POST_SYNC_OPERATION_NO_WRITE = 0x0,
        POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA = 0x1,
        POST_SYNC_OPERATION_WRITE_PS_DEPTH_COUNT = 0x2,
        POST_SYNC_OPERATION_WRITE_TIMESTAMP = 0x3,
    } POST_SYNC_OPERATION;

    // Returns a default-constructed command.
    static tagPIPE_CONTROL sInit() {
        tagPIPE_CONTROL initial;
        return initial;
    }

    // Setters: accepted and ignored.
    void setCommandStreamerStallEnable(const uint32_t /*value*/) {}
    void setDcFlushEnable(const bool /*value*/) {}
    void setStateCacheInvalidationEnable(const bool /*value*/) {}
    void setPipeControlFlushEnable(const bool /*value*/) {}
    void setTextureCacheInvalidationEnable(const bool /*value*/) {}
    void setPostSyncOperation(const POST_SYNC_OPERATION /*value*/) {}
    void setAddress(const uint32_t /*value*/) {}
    void setAddressHigh(const uint32_t /*value*/) {}
    void setImmediateData(const uint64_t /*value*/) {}
    void setGenericMediaStateClear(const bool /*value*/) {}
} PIPE_CONTROL;
// MI_LOAD_REGISTER_IMM stand-in for the GENX family: no data, no-op setters.
typedef struct tagMI_LOAD_REGISTER_IMM {
    // Returns a default-constructed command.
    static tagMI_LOAD_REGISTER_IMM sInit() {
        tagMI_LOAD_REGISTER_IMM initial;
        return initial;
    }

    // Setters: accepted and ignored.
    void setRegisterOffset(const uint32_t /*value*/) {}
    void setDataDword(const uint32_t /*value*/) {}
} MI_LOAD_REGISTER_IMM;
// MI_LOAD_REGISTER_REG stand-in for the GENX family: no data, no-op setters.
typedef struct tagMI_LOAD_REGISTER_REG {
    // Returns a default-constructed command.
    static tagMI_LOAD_REGISTER_REG sInit() {
        tagMI_LOAD_REGISTER_REG initial;
        return initial;
    }

    // Setters: accepted and ignored.
    void setSourceRegisterAddress(const uint32_t /*value*/) {}
    void setDestinationRegisterAddress(const uint32_t /*value*/) {}
} MI_LOAD_REGISTER_REG;
// MI_MATH command header for the GENX family. Unlike the other commands in this struct it
// carries data: a single 32-bit dword (DW0) that can be accessed either as named bit-fields
// or as the raw Value.
typedef struct tagMI_MATH {
union _DW0 {
struct _BitField {
// NOTE(review): BITFIELD_RANGE(lo, hi) presumably yields the field width for bits lo..hi — confirm the macro.
uint32_t DwordLength : BITFIELD_RANGE(0, 5);
uint32_t Reserved : BITFIELD_RANGE(6, 22);
uint32_t InstructionOpcode : BITFIELD_RANGE(23, 28);
uint32_t InstructionType : BITFIELD_RANGE(29, 31);
} BitField;
// Raw view of the same dword.
uint32_t Value;
} DW0;
typedef enum tagMI_COMMAND_OPCODE {
MI_COMMAND_OPCODE_MI_MATH = 0x0,
} MI_COMMAND_OPCODE;
typedef enum tagCOMMAND_TYPE {
COMMAND_TYPE_MI_COMMAND = 0x0,
} COMMAND_TYPE;
} MI_MATH;
// MI_MATH_ALU_INST_INLINE: a single dword (DW0) split into two operand fields
// and an opcode field. It has no member functions.
typedef struct tagMI_MATH_ALU_INST_INLINE {
// DW0 can be accessed field-by-field through BitField or as one raw 32-bit
// word through Value.
union _DW0 {
struct _BitField {
// NOTE(review): BITFIELD_RANGE is a macro defined elsewhere; presumably
// BITFIELD_RANGE(lo, hi) yields the width of bits lo..hi inclusive - confirm.
// Under that reading the three ranges below cover bits 0..31.
uint32_t Operand2 : BITFIELD_RANGE(0, 9);
uint32_t Operand1 : BITFIELD_RANGE(10, 19);
uint32_t ALUOpcode : BITFIELD_RANGE(20, 31);
} BitField;
uint32_t Value;
} DW0;
} MI_MATH_ALU_INST_INLINE;
// MI_COMMAND_OPCODE_MI_MATH placeholder: declares the type name only; it has
// no members.
typedef struct tagMI_COMMAND_OPCODE_MI_MATH {} MI_COMMAND_OPCODE_MI_MATH;
// MI_STORE_REGISTER_MEM placeholder: no data members; both setters have empty
// bodies, so their arguments are ignored.
typedef struct tagMI_STORE_REGISTER_MEM {
    // Returns a value-initialized instance. The tag name is used (matching the
    // declared return type) because the MI_STORE_REGISTER_MEM alias is only
    // introduced after the closing brace of this definition.
    static tagMI_STORE_REGISTER_MEM sInit() {
        return tagMI_STORE_REGISTER_MEM{};
    }

    // Empty setters: each argument is accepted and ignored.
    void setRegisterAddress(const uint32_t value) {}
    void setMemoryAddress(const uint64_t value) {}
} MI_STORE_REGISTER_MEM;
// MI_REPORT_PERF_COUNT placeholder: no data members; both setters have empty
// bodies, so their arguments are ignored.
typedef struct tagMI_REPORT_PERF_COUNT {
    // Returns a value-initialized instance. The tag name is used (matching the
    // declared return type) because the MI_REPORT_PERF_COUNT alias is only
    // introduced after the closing brace of this definition.
    static tagMI_REPORT_PERF_COUNT sInit() {
        return tagMI_REPORT_PERF_COUNT{};
    }

    // Empty setters: each argument is accepted and ignored.
    void setReportId(const uint32_t value) {}
    void setMemoryAddress(const uint64_t value) {}
} MI_REPORT_PERF_COUNT;
// MI_BATCH_BUFFER_START placeholder: exposes the SECOND_LEVEL_BATCH_BUFFER
// encodings and two setters, but holds no data members. Both setters have
// empty bodies, so their arguments are ignored.
typedef struct tagMI_BATCH_BUFFER_START {
    // Encodings accepted by setSecondLevelBatchBuffer().
    typedef enum tagSECOND_LEVEL_BATCH_BUFFER {
        SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH = 0x0,
        SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH = 0x1,
    } SECOND_LEVEL_BATCH_BUFFER;

    // Returns a value-initialized instance. The tag name is used (matching the
    // declared return type) because the MI_BATCH_BUFFER_START alias is only
    // introduced after the closing brace of this definition.
    static tagMI_BATCH_BUFFER_START sInit() {
        return tagMI_BATCH_BUFFER_START{};
    }

    // Empty setters: each argument is accepted and ignored.
    void setSecondLevelBatchBuffer(const SECOND_LEVEL_BATCH_BUFFER value) {}
    void setBatchBufferStartAddressGraphicsaddress472(const uint64_t value) {}
} MI_BATCH_BUFFER_START;
// MEDIA_STATE_FLUSH placeholder: no data members; its single setter has an
// empty body, so the argument is ignored.
typedef struct tagMEDIA_STATE_FLUSH {
    // Returns a value-initialized instance. The tag name is used (matching the
    // declared return type) because the MEDIA_STATE_FLUSH alias is only
    // introduced after the closing brace of this definition.
    static tagMEDIA_STATE_FLUSH sInit() {
        return tagMEDIA_STATE_FLUSH{};
    }

    // Empty setter: the argument is accepted and ignored.
    void setInterfaceDescriptorOffset(const uint32_t value) {}
} MEDIA_STATE_FLUSH;
// MEDIA_INTERFACE_DESCRIPTOR_LOAD placeholder: no data members; both setters
// have empty bodies, so their arguments are ignored.
typedef struct tagMEDIA_INTERFACE_DESCRIPTOR_LOAD {
    // Returns a value-initialized instance. The tag name is used (matching the
    // declared return type) because the MEDIA_INTERFACE_DESCRIPTOR_LOAD alias
    // is only introduced after the closing brace of this definition.
    static tagMEDIA_INTERFACE_DESCRIPTOR_LOAD sInit() {
        return tagMEDIA_INTERFACE_DESCRIPTOR_LOAD{};
    }

    // Empty setters: each argument is accepted and ignored.
    void setInterfaceDescriptorDataStartAddress(const uint32_t value) {}
    void setInterfaceDescriptorTotalLength(const uint32_t value) {}
} MEDIA_INTERFACE_DESCRIPTOR_LOAD;
// MI_BATCH_BUFFER_END placeholder: no data members and no setters; only the
// sInit() factory is provided.
typedef struct tagMI_BATCH_BUFFER_END {
    // Returns a value-initialized instance. The tag name is used (matching the
    // declared return type) because the MI_BATCH_BUFFER_END alias is only
    // introduced after the closing brace of this definition.
    static tagMI_BATCH_BUFFER_END sInit() {
        return tagMI_BATCH_BUFFER_END{};
    }
} MI_BATCH_BUFFER_END;
// RENDER_SURFACE_STATE placeholder: declares the type name only; it has no
// members.
typedef struct tagRENDER_SURFACE_STATE {} RENDER_SURFACE_STATE;
// MEDIA_VFE_STATE placeholder: no data members; every setter has an empty
// body, so the value passed in is ignored and nothing is stored.
typedef struct tagMEDIA_VFE_STATE {
    // Returns a value-initialized instance. The tag name is used (matching the
    // declared return type) because the MEDIA_VFE_STATE alias is only
    // introduced after the closing brace of this definition.
    static tagMEDIA_VFE_STATE sInit() {
        return tagMEDIA_VFE_STATE{};
    }

    // Empty setters: each argument is accepted and ignored.
    void setMaximumNumberOfThreads(const uint32_t value) {}
    void setNumberOfUrbEntries(const uint32_t value) {}
    void setUrbEntryAllocationSize(const uint32_t value) {}
    void setPerThreadScratchSpace(const uint32_t value) {}
    void setStackSize(const uint32_t value) {}
    void setScratchSpaceBasePointer(const uint32_t value) {}
    void setScratchSpaceBasePointerHigh(const uint32_t value) {}
} MEDIA_VFE_STATE;
// SAMPLER_STATE placeholder: no data members; its single setter has an empty
// body.
typedef struct tagSAMPLER_STATE {
    // Empty setter: the argument is accepted and ignored.
    void setIndirectStatePointer(const uint32_t indirectStatePointerValue) {}
} SAMPLER_STATE;
// GPGPU_CSR_BASE_ADDRESS placeholder: no data members; init() and the setter
// both have empty bodies.
typedef struct tagGPGPU_CSR_BASE_ADDRESS {
    // Empty body: calling it changes nothing.
    void init() {}

    // Empty setter: the argument is accepted and ignored.
    void setGpgpuCsrBaseAddress(uint64_t value) {}
} GPGPU_CSR_BASE_ADDRESS;
// STATE_SIP placeholder: no data members; init() and the setter both have
// empty bodies.
typedef struct tagSTATE_SIP {
    // Empty body: calling it changes nothing.
    void init() {}

    // Empty setter: the argument is accepted and ignored.
    void setSystemInstructionPointer(uint64_t value) {}
} STATE_SIP;
// One static instance each of GPGPU_WALKER, INTERFACE_DESCRIPTOR_DATA,
// MEDIA_STATE_FLUSH and MEDIA_INTERFACE_DESCRIPTOR_LOAD.
// NOTE(review): these appear to be static data members of the enclosing type
// that closes on the line after them; if so, they are declarations only and
// need matching out-of-class definitions elsewhere - confirm.
static GPGPU_WALKER cmdInitGpgpuWalker;
static INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;
static MEDIA_STATE_FLUSH cmdInitMediaStateFlush;
static MEDIA_INTERFACE_DESCRIPTOR_LOAD cmdInitMediaInterfaceDescriptorLoad;
};
} // namespace OCLRT