Use GPU pointers for HwPerfCounter.

Related-To: NEO-2872

Change-Id: Ia30f2ee0d96a3da05b8e5ecf55e9b7fb5a34ace7
Signed-off-by: Piotr Fusik <piotr.fusik@intel.com>
This commit is contained in:
Piotr Fusik
2019-04-18 12:25:29 +02:00
committed by sys_ocldev
parent 086ef7c461
commit 603eee76e5
11 changed files with 209 additions and 190 deletions

View File

@@ -362,15 +362,15 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
return commandQueueProperties; return commandQueueProperties;
} }
bool isProfilingEnabled() { bool isProfilingEnabled() const {
return !!(this->getCommandQueueProperties() & CL_QUEUE_PROFILING_ENABLE); return !!(this->getCommandQueueProperties() & CL_QUEUE_PROFILING_ENABLE);
} }
bool isOOQEnabled() { bool isOOQEnabled() const {
return !!(this->getCommandQueueProperties() & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); return !!(this->getCommandQueueProperties() & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
} }
bool isPerfCountersEnabled() { bool isPerfCountersEnabled() const {
return perfCountersEnabled; return perfCountersEnabled;
} }
@@ -388,7 +388,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
this->isSpecialCommandQueue = newValue; this->isSpecialCommandQueue = newValue;
} }
uint16_t getPerfCountersUserRegistersNumber() { uint16_t getPerfCountersUserRegistersNumber() const {
return perfCountersUserRegistersNumber; return perfCountersUserRegistersNumber;
} }

View File

@@ -163,8 +163,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
} }
bool profilingRequired = (this->isProfilingEnabled() && event != nullptr); bool profilingRequired = (this->isProfilingEnabled() && event != nullptr);
bool perfCountersRequired = false; bool perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr);
perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr);
KernelOperation *blockedCommandsData = nullptr; KernelOperation *blockedCommandsData = nullptr;
std::unique_ptr<PrintfHandler> printfHandler; std::unique_ptr<PrintfHandler> printfHandler;
bool slmUsed = multiDispatchInfo.usesSlm() || parentKernel; bool slmUsed = multiDispatchInfo.usesSlm() || parentKernel;
@@ -381,7 +380,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
KernelOperation *&blockedCommandsData, KernelOperation *&blockedCommandsData,
TimestampPacketContainer &previousTimestampPacketNodes, TimestampPacketContainer &previousTimestampPacketNodes,
PreemptionMode preemption) { PreemptionMode preemption) {
HwPerfCounter *hwPerfCounter = nullptr; TagNode<HwPerfCounter> *hwPerfCounter = nullptr;
DebugManager.dumpKernelArgs(&multiDispatchInfo); DebugManager.dumpKernelArgs(&multiDispatchInfo);
printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device)); printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
@@ -399,7 +398,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
// Get allocation for timestamps // Get allocation for timestamps
hwTimeStamps = event->getHwTimeStampNode(); hwTimeStamps = event->getHwTimeStampNode();
if (this->isPerfCountersEnabled()) { if (this->isPerfCountersEnabled()) {
hwPerfCounter = event->getHwPerfCounterNode()->tagForCpuAccess; hwPerfCounter = event->getHwPerfCounterNode();
// PERF COUNTER: copy current configuration from queue to event // PERF COUNTER: copy current configuration from queue to event
event->copyPerfCounters(this->getPerfCountersConfigData()); event->copyPerfCounters(this->getPerfCountersConfigData());
} }

View File

@@ -114,15 +114,6 @@ inline cl_uint computeDimensions(const size_t workItems[3]) {
template <typename GfxFamily> template <typename GfxFamily>
class GpgpuWalkerHelper { class GpgpuWalkerHelper {
public: public:
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
static void addAluReadModifyWriteRegister(
LinearStream *pCommandStream,
uint32_t aluRegister,
uint32_t operation,
uint32_t mask);
static void applyWADisableLSQCROPERFforOCL(LinearStream *pCommandStream, static void applyWADisableLSQCROPERFforOCL(LinearStream *pCommandStream,
const Kernel &kernel, const Kernel &kernel,
bool disablePerfMode); bool disablePerfMode);
@@ -143,50 +134,21 @@ class GpgpuWalkerHelper {
static void dispatchProfilingCommandsStart( static void dispatchProfilingCommandsStart(
TagNode<HwTimeStamps> &hwTimeStamps, TagNode<HwTimeStamps> &hwTimeStamps,
NEO::LinearStream *commandStream); LinearStream *commandStream);
static void dispatchProfilingCommandsEnd( static void dispatchProfilingCommandsEnd(
TagNode<HwTimeStamps> &hwTimeStamps, TagNode<HwTimeStamps> &hwTimeStamps,
NEO::LinearStream *commandStream); LinearStream *commandStream);
static void dispatchPerfCountersNoopidRegisterCommands(
CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter,
NEO::LinearStream *commandStream,
bool start);
static void dispatchPerfCountersReadFreqRegisterCommands(
CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter,
NEO::LinearStream *commandStream,
bool start);
static void dispatchPerfCountersGeneralPurposeCounterCommands(
CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter,
NEO::LinearStream *commandStream,
bool start);
static void dispatchPerfCountersUserCounterCommands(
CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter,
NEO::LinearStream *commandStream,
bool start);
static void dispatchPerfCountersOABufferStateCommands(
CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter,
NEO::LinearStream *commandStream);
static void dispatchPerfCountersCommandsStart( static void dispatchPerfCountersCommandsStart(
CommandQueue &commandQueue, CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter, TagNode<HwPerfCounter> &hwPerfCounter,
NEO::LinearStream *commandStream); LinearStream *commandStream);
static void dispatchPerfCountersCommandsEnd( static void dispatchPerfCountersCommandsEnd(
CommandQueue &commandQueue, CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter, TagNode<HwPerfCounter> &hwPerfCounter,
NEO::LinearStream *commandStream); LinearStream *commandStream);
static void setupTimestampPacket( static void setupTimestampPacket(
LinearStream *cmdStream, LinearStream *cmdStream,
@@ -203,6 +165,36 @@ class GpgpuWalkerHelper {
IndirectHeap *dsh); IndirectHeap *dsh);
static void adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxFamily> *storeCmd); static void adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxFamily> *storeCmd);
private:
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
static void addAluReadModifyWriteRegister(
LinearStream *pCommandStream,
uint32_t aluRegister,
uint32_t operation,
uint32_t mask);
static void dispatchStoreRegisterCommand(
LinearStream *commandStream,
uint64_t memoryAddress,
uint32_t registerAddress);
static void dispatchPerfCountersGeneralPurposeCounterCommands(
CommandQueue &commandQueue,
TagNode<HwPerfCounter> &hwPerfCounter,
LinearStream *commandStream,
bool start);
static void dispatchPerfCountersUserCounterCommands(
CommandQueue &commandQueue,
TagNode<HwPerfCounter> &hwPerfCounter,
LinearStream *commandStream,
bool start);
static void dispatchPerfCountersOABufferStateCommands(
TagNode<HwPerfCounter> &hwPerfCounter,
LinearStream *commandStream);
}; };
template <typename GfxFamily> template <typename GfxFamily>

View File

@@ -42,14 +42,14 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
typedef typename GfxFamily::MI_MATH MI_MATH; typedef typename GfxFamily::MI_MATH MI_MATH;
typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
auto pCmd = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG))); auto pCmd = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
*pCmd = GfxFamily::cmdInitLoadRegisterReg; *pCmd = GfxFamily::cmdInitLoadRegisterReg;
pCmd->setSourceRegisterAddress(aluRegister); pCmd->setSourceRegisterAddress(aluRegister);
pCmd->setDestinationRegisterAddress(CS_GPR_R0); pCmd->setDestinationRegisterAddress(CS_GPR_R0);
// Load "Mask" into CS_GPR_R1 // Load "Mask" into CS_GPR_R1
typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
auto pCmd2 = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM))); auto pCmd2 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
*pCmd2 = GfxFamily::cmdInitLoadRegisterImm; *pCmd2 = GfxFamily::cmdInitLoadRegisterImm;
pCmd2->setRegisterOffset(CS_GPR_R1); pCmd2->setRegisterOffset(CS_GPR_R1);
pCmd2->setDataDword(mask); pCmd2->setDataDword(mask);
@@ -88,13 +88,13 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU; pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;
// LOAD value of CS_GPR_R0 into "Register" // LOAD value of CS_GPR_R0 into "Register"
auto pCmd4 = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG))); auto pCmd4 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
*pCmd4 = GfxFamily::cmdInitLoadRegisterReg; *pCmd4 = GfxFamily::cmdInitLoadRegisterReg;
pCmd4->setSourceRegisterAddress(CS_GPR_R0); pCmd4->setSourceRegisterAddress(CS_GPR_R0);
pCmd4->setDestinationRegisterAddress(aluRegister); pCmd4->setDestinationRegisterAddress(aluRegister);
// Add PIPE_CONTROL to flush caches // Add PIPE_CONTROL to flush caches
auto pCmd5 = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL))); auto pCmd5 = pCommandStream->getSpaceForCmd<PIPE_CONTROL>();
*pCmd5 = GfxFamily::cmdInitPipeControl; *pCmd5 = GfxFamily::cmdInitPipeControl;
pCmd5->setCommandStreamerStallEnable(true); pCmd5->setCommandStreamerStallEnable(true);
pCmd5->setDcFlushEnable(true); pCmd5->setDcFlushEnable(true);
@@ -106,7 +106,8 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
template <typename GfxFamily> template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart( void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
TagNode<HwTimeStamps> &hwTimeStamps, TagNode<HwTimeStamps> &hwTimeStamps,
NEO::LinearStream *commandStream) { LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
// PIPE_CONTROL for global timestamp // PIPE_CONTROL for global timestamp
@@ -118,7 +119,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextStartTS); timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextStartTS);
//low part //low part
auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
*pMICmdLow = GfxFamily::cmdInitStoreRegisterMem; *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
adjustMiStoreRegMemMode(pMICmdLow); adjustMiStoreRegMemMode(pMICmdLow);
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
@@ -128,12 +129,12 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
template <typename GfxFamily> template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd( void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
TagNode<HwTimeStamps> &hwTimeStamps, TagNode<HwTimeStamps> &hwTimeStamps,
NEO::LinearStream *commandStream) { LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
// PIPE_CONTROL for global timestamp // PIPE_CONTROL for global timestamp
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL)); auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
*pPipeControlCmd = GfxFamily::cmdInitPipeControl; *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
pPipeControlCmd->setCommandStreamerStallEnable(true); pPipeControlCmd->setCommandStreamerStallEnable(true);
@@ -141,7 +142,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextEndTS); uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextEndTS);
//low part //low part
auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
*pMICmdLow = GfxFamily::cmdInitStoreRegisterMem; *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
adjustMiStoreRegMemMode(pMICmdLow); adjustMiStoreRegMemMode(pMICmdLow);
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
@@ -149,144 +150,79 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
} }
template <typename GfxFamily> template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands( void GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(
CommandQueue &commandQueue, LinearStream *commandStream,
NEO::HwPerfCounter &hwPerfCounter, uint64_t memoryAddress,
NEO::LinearStream *commandStream, uint32_t registerAddress) {
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.DMAFenceIdBegin)) auto pCmd = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.DMAFenceIdEnd)); *pCmd = GfxFamily::cmdInitStoreRegisterMem;
pCmd->setRegisterAddress(registerAddress);
auto pNoopIdRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); pCmd->setMemoryAddress(memoryAddress);
*pNoopIdRegister = GfxFamily::cmdInitStoreRegisterMem;
pNoopIdRegister->setRegisterAddress(NEO::INSTR_MMIO_NOOPID);
pNoopIdRegister->setMemoryAddress(address);
}
template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(
CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter,
NEO::LinearStream *commandStream,
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.CoreFreqBegin))
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.CoreFreqEnd));
auto pCoreFreqRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pCoreFreqRegister = GfxFamily::cmdInitStoreRegisterMem;
pCoreFreqRegister->setRegisterAddress(NEO::INSTR_MMIO_RPSTAT1);
pCoreFreqRegister->setMemoryAddress(address);
} }
template <typename GfxFamily> template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands( void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(
CommandQueue &commandQueue, CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter, TagNode<HwPerfCounter> &hwPerfCounter,
NEO::LinearStream *commandStream, LinearStream *commandStream,
bool start) { bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; uint64_t baseAddress = hwPerfCounter.getGpuAddress();
uint64_t address = 0; baseAddress += start ? offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Gp)
const uint64_t baseAddress = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportBegin.Gp)) : offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Gp);
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Gp));
// Read General Purpose counters // Read General Purpose counters
for (uint16_t i = 0; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) { for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
auto pGeneralPurposeRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pGeneralPurposeRegister = GfxFamily::cmdInitStoreRegisterMem;
uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint); uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint);
pGeneralPurposeRegister->setRegisterAddress(regAddr);
//Gp field is 2*uint64 wide so it can hold 4 uint32 //Gp field is 2*uint64 wide so it can hold 4 uint32
address = baseAddress + i * sizeof(cl_uint); uint64_t address = baseAddress + i * sizeof(cl_uint);
pGeneralPurposeRegister->setMemoryAddress(address); dispatchStoreRegisterCommand(commandStream, address, regAddr);
} }
} }
template <typename GfxFamily> template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands( void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(
CommandQueue &commandQueue, CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter, TagNode<HwPerfCounter> &hwPerfCounter,
NEO::LinearStream *commandStream, LinearStream *commandStream,
bool start) { bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; uint64_t baseAddr = hwPerfCounter.getGpuAddress();
baseAddr += start ? offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User)
uint64_t address = 0; : offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User);
const uint64_t baseAddr = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportBegin.User)) auto userRegs = &commandQueue.getPerfCountersConfigData()->ReadRegs;
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.User));
uint32_t cmdNum = 0;
uint32_t regAddr = 0;
auto configData = commandQueue.getPerfCountersConfigData();
auto userRegs = &configData->ReadRegs;
for (uint32_t i = 0; i < userRegs->RegsCount; i++) { for (uint32_t i = 0; i < userRegs->RegsCount; i++) {
auto pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); uint32_t regAddr = userRegs->Reg[i].Offset;
*pRegister = GfxFamily::cmdInitStoreRegisterMem;
regAddr = userRegs->Reg[i].Offset;
pRegister->setRegisterAddress(regAddr);
//offset between base (low) registers is cl_ulong wide //offset between base (low) registers is cl_ulong wide
address = baseAddr + i * sizeof(cl_ulong); uint64_t address = baseAddr + i * sizeof(cl_ulong);
pRegister->setMemoryAddress(address); dispatchStoreRegisterCommand(commandStream, address, regAddr);
cmdNum++;
if (userRegs->Reg[i].BitSize > 32) { if (userRegs->Reg[i].BitSize > 32) {
pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); dispatchStoreRegisterCommand(commandStream, address + sizeof(cl_uint), regAddr + sizeof(cl_uint));
*pRegister = GfxFamily::cmdInitStoreRegisterMem;
regAddr += sizeof(cl_uint);
pRegister->setRegisterAddress(regAddr);
address += sizeof(cl_uint);
pRegister->setMemoryAddress(address);
cmdNum++;
} }
} }
} }
template <typename GfxFamily> template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands( void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(
CommandQueue &commandQueue, TagNode<HwPerfCounter> &hwPerfCounter,
NEO::HwPerfCounter &hwPerfCounter, LinearStream *commandStream) {
NEO::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaStatus), INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaHead), INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
uint64_t address = 0; dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaTail), INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
//OA Status
auto pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pOaRegister = GfxFamily::cmdInitStoreRegisterMem;
pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.OaStatus));
pOaRegister->setMemoryAddress(address);
//OA Head
pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pOaRegister = GfxFamily::cmdInitStoreRegisterMem;
pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.OaHead));
pOaRegister->setMemoryAddress(address);
//OA Tail
pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pOaRegister = GfxFamily::cmdInitStoreRegisterMem;
pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.OaTail));
pOaRegister->setMemoryAddress(address);
} }
template <typename GfxFamily> template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart( void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
CommandQueue &commandQueue, CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter, TagNode<HwPerfCounter> &hwPerfCounter,
NEO::LinearStream *commandStream) { LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT; using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
auto perfCounters = commandQueue.getPerfCounters(); auto perfCounters = commandQueue.getPerfCounters();
@@ -294,25 +230,25 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
uint32_t currentReportId = perfCounters->getCurrentReportId(); uint32_t currentReportId = perfCounters->getCurrentReportId();
uint64_t address = 0; uint64_t address = 0;
//flush command streamer //flush command streamer
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL)); auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
*pPipeControlCmd = GfxFamily::cmdInitPipeControl; *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
pPipeControlCmd->setCommandStreamerStallEnable(true); pPipeControlCmd->setCommandStreamerStallEnable(true);
//Store value of NOOPID register //Store value of NOOPID register
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, true); GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdBegin), INSTR_MMIO_NOOPID);
//Read Core Frequency //Read Core Frequency
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, true); GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqBegin), INSTR_MMIO_RPSTAT1);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, true); GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, true);
auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT)); auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>();
*pReportPerfCount = GfxFamily::cmdInitReportPerfCount; *pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
pReportPerfCount->setReportId(currentReportId); pReportPerfCount->setReportId(currentReportId);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportBegin.Oa)); address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Oa);
pReportPerfCount->setMemoryAddress(address); pReportPerfCount->setMemoryAddress(address);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalStartTS)); address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalStartTS);
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false); PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
@@ -324,41 +260,39 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
template <typename GfxFamily> template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd( void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
CommandQueue &commandQueue, CommandQueue &commandQueue,
NEO::HwPerfCounter &hwPerfCounter, TagNode<HwPerfCounter> &hwPerfCounter,
NEO::LinearStream *commandStream) { LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT; using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
auto perfCounters = commandQueue.getPerfCounters(); auto perfCounters = commandQueue.getPerfCounters();
uint32_t currentReportId = perfCounters->getCurrentReportId(); uint32_t currentReportId = perfCounters->getCurrentReportId();
uint64_t address = 0;
//flush command streamer //flush command streamer
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL)); auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
*pPipeControlCmd = GfxFamily::cmdInitPipeControl; *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
pPipeControlCmd->setCommandStreamerStallEnable(true); pPipeControlCmd->setCommandStreamerStallEnable(true);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(commandQueue, hwPerfCounter, commandStream); GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(hwPerfCounter, commandStream);
//Timestamp: Global End //Timestamp: Global End
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalEndTS)); uint64_t address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalEndTS);
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false); PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT)); auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>();
*pReportPerfCount = GfxFamily::cmdInitReportPerfCount; *pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
pReportPerfCount->setReportId(currentReportId); pReportPerfCount->setReportId(currentReportId);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Oa)); address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Oa);
pReportPerfCount->setMemoryAddress(address); pReportPerfCount->setMemoryAddress(address);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, false); GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandQueue, hwPerfCounter, commandStream, false);
//Store value of NOOPID register //Store value of NOOPID register
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersNoopidRegisterCommands(commandQueue, hwPerfCounter, commandStream, false); GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdEnd), INSTR_MMIO_NOOPID);
//Read Core Frequency //Read Core Frequency
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersReadFreqRegisterCommands(commandQueue, hwPerfCounter, commandStream, false); GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqEnd), INSTR_MMIO_RPSTAT1);
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, false); GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, hwPerfCounter, commandStream, false);

View File

@@ -41,7 +41,7 @@ class HardwareInterface {
const CsrDependencies &csrDependencies, const CsrDependencies &csrDependencies,
KernelOperation **blockedCommandsData, KernelOperation **blockedCommandsData,
TagNode<HwTimeStamps> *hwTimeStamps, TagNode<HwTimeStamps> *hwTimeStamps,
HwPerfCounter *hwPerfCounter, TagNode<HwPerfCounter> *hwPerfCounter,
TimestampPacketContainer *previousTimestampPacketNodes, TimestampPacketContainer *previousTimestampPacketNodes,
TimestampPacketContainer *currentTimestampPacketNodes, TimestampPacketContainer *currentTimestampPacketNodes,
PreemptionMode preemptionMode, PreemptionMode preemptionMode,
@@ -67,13 +67,13 @@ class HardwareInterface {
const DispatchInfo &dispatchInfo, const DispatchInfo &dispatchInfo,
const MultiDispatchInfo &multiDispatchInfo, const MultiDispatchInfo &multiDispatchInfo,
TagNode<HwTimeStamps> *hwTimeStamps, TagNode<HwTimeStamps> *hwTimeStamps,
HwPerfCounter *hwPerfCounter, TagNode<HwPerfCounter> *hwPerfCounter,
LinearStream *commandStream, LinearStream *commandStream,
CommandQueue &commandQueue); CommandQueue &commandQueue);
static void dispatchProfilingPerfEndCommands( static void dispatchProfilingPerfEndCommands(
TagNode<HwTimeStamps> *hwTimeStamps, TagNode<HwTimeStamps> *hwTimeStamps,
HwPerfCounter *hwPerfCounter, TagNode<HwPerfCounter> *hwPerfCounter,
LinearStream *commandStream, LinearStream *commandStream,
CommandQueue &commandQueue); CommandQueue &commandQueue);

View File

@@ -28,7 +28,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
const CsrDependencies &csrDependencies, const CsrDependencies &csrDependencies,
KernelOperation **blockedCommandsData, KernelOperation **blockedCommandsData,
TagNode<HwTimeStamps> *hwTimeStamps, TagNode<HwTimeStamps> *hwTimeStamps,
HwPerfCounter *hwPerfCounter, TagNode<HwPerfCounter> *hwPerfCounter,
TimestampPacketContainer *previousTimestampPacketNodes, TimestampPacketContainer *previousTimestampPacketNodes,
TimestampPacketContainer *currentTimestampPacketNodes, TimestampPacketContainer *currentTimestampPacketNodes,
PreemptionMode preemptionMode, PreemptionMode preemptionMode,

View File

@@ -53,7 +53,7 @@ inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
const DispatchInfo &dispatchInfo, const DispatchInfo &dispatchInfo,
const MultiDispatchInfo &multiDispatchInfo, const MultiDispatchInfo &multiDispatchInfo,
TagNode<HwTimeStamps> *hwTimeStamps, TagNode<HwTimeStamps> *hwTimeStamps,
HwPerfCounter *hwPerfCounter, TagNode<HwPerfCounter> *hwPerfCounter,
LinearStream *commandStream, LinearStream *commandStream,
CommandQueue &commandQueue) { CommandQueue &commandQueue) {
@@ -71,7 +71,7 @@ inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
template <typename GfxFamily> template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands( inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
TagNode<HwTimeStamps> *hwTimeStamps, TagNode<HwTimeStamps> *hwTimeStamps,
HwPerfCounter *hwPerfCounter, TagNode<HwPerfCounter> *hwPerfCounter,
LinearStream *commandStream, LinearStream *commandStream,
CommandQueue &commandQueue) { CommandQueue &commandQueue) {

View File

@@ -213,7 +213,6 @@ void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKer
igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed()); igilQueue->m_controls.m_CleanupSectionAddress = ptrOffset(slbBuffer->getGpuAddress(), slbCS.getUsed());
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true); GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&slbCS, *parentKernel, true);
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
if (hwTimeStamp != nullptr) { if (hwTimeStamp != nullptr) {

View File

@@ -1194,13 +1194,8 @@ TEST_F(EventTest, hwPerfCounterMemoryIsPlacedInGraphicsAllocation) {
void *memoryStorage = allocation->getUnderlyingBuffer(); void *memoryStorage = allocation->getUnderlyingBuffer();
size_t graphicsAllocationSize = allocation->getUnderlyingBufferSize(); size_t graphicsAllocationSize = allocation->getUnderlyingBufferSize();
uintptr_t perfCounterAddress = reinterpret_cast<uintptr_t>(perfCounter); EXPECT_GE(perfCounter, memoryStorage);
uintptr_t graphicsAllocationStart = reinterpret_cast<uintptr_t>(memoryStorage); EXPECT_LE(perfCounter + 1, ptrOffset(memoryStorage, graphicsAllocationSize));
if (!((perfCounterAddress >= graphicsAllocationStart) &&
((perfCounterAddress + sizeof(HwPerfCounter)) <= (graphicsAllocationStart + graphicsAllocationSize)))) {
EXPECT_TRUE(false);
}
} }
TEST_F(EventTest, IsPerfCounter_DisabledByNullQueue) { TEST_F(EventTest, IsPerfCounter_DisabledByNullQueue) {

View File

@@ -51,6 +51,8 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::CommandStreamReceiver::latestSentStatelessMocsConfig; using BaseClass::CommandStreamReceiver::latestSentStatelessMocsConfig;
using BaseClass::CommandStreamReceiver::latestSentTaskCount; using BaseClass::CommandStreamReceiver::latestSentTaskCount;
using BaseClass::CommandStreamReceiver::mediaVfeStateDirty; using BaseClass::CommandStreamReceiver::mediaVfeStateDirty;
using BaseClass::CommandStreamReceiver::perfCounterAllocator;
using BaseClass::CommandStreamReceiver::profilingTimeStampAllocator;
using BaseClass::CommandStreamReceiver::requiredScratchSize; using BaseClass::CommandStreamReceiver::requiredScratchSize;
using BaseClass::CommandStreamReceiver::requiredThreadArbitrationPolicy; using BaseClass::CommandStreamReceiver::requiredThreadArbitrationPolicy;
using BaseClass::CommandStreamReceiver::samplerCacheFlushRequired; using BaseClass::CommandStreamReceiver::samplerCacheFlushRequired;

View File

@@ -422,7 +422,9 @@ class MyOSTime : public OSTime {
return 0; return 0;
} }
}; };
int MyOSTime::instanceNum = 0; int MyOSTime::instanceNum = 0;
TEST(EventProfilingTest, givenEventWhenCompleteIsZeroThenCalcProfilingDataSetsEndTimestampInCompleteTimestampAndDoesntCallOsTimeMethods) { TEST(EventProfilingTest, givenEventWhenCompleteIsZeroThenCalcProfilingDataSetsEndTimestampInCompleteTimestampAndDoesntCallOsTimeMethods) {
std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr)); std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
MyOSTime::instanceNum = 0; MyOSTime::instanceNum = 0;
@@ -523,7 +525,21 @@ struct ProfilingWithPerfCountersTests : public ProfilingTests,
ProfilingTests::TearDown(); ProfilingTests::TearDown();
PerformanceCountersFixture::TearDown(); PerformanceCountersFixture::TearDown();
} }
// Finds the next MI_STORE_REGISTER_MEM command at or after `itor` in cmdList and
// expects it to store register `registerAddress` to memory `memoryAddress`.
//
// Returns the iterator just past the matched command. When no such command is
// found, a test failure is recorded and cmdList.end() is returned without being
// dereferenced.
template <typename GfxFamily>
GenCmdList::iterator expectStoreRegister(GenCmdList::iterator itor, uint64_t memoryAddress, uint32_t registerAddress) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
    EXPECT_NE(cmdList.end(), itor);
    if (itor == cmdList.end()) {
        // ASSERT_* cannot be used in a value-returning function, so bail out
        // explicitly instead of dereferencing the end iterator below.
        return itor;
    }

    auto pStore = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    EXPECT_EQ(memoryAddress, pStore->getMemoryAddress());
    EXPECT_EQ(registerAddress, pStore->getRegisterAddress());

    ++itor;
    return itor;
}
}; };
HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCounterAndForWorkloadWithKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) { HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCounterAndForWorkloadWithKernelWHENGetCSFromCmdQueueTHENEnoughSpaceInCS) {
typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM; typedef typename FamilyType::MI_STORE_REGISTER_MEM MI_STORE_REGISTER_MEM;
typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
@@ -859,6 +875,88 @@ HWTEST_F(ProfilingWithPerfCountersTests,
pCmdQ->setPerfCountersEnabled(false, UINT32_MAX); pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
} }
// Test-only TagAllocator that overwrites the GPU address stored in the node at
// the head of its free-tag list with a caller-supplied value.
// NOTE(review): presumably the head node is the first one handed out, which is
// what lets tests predict the address — confirm against TagAllocator.
template <typename TagType>
struct FixedGpuAddressTagAllocator : TagAllocator<TagType> {
    // Exists only to give the constructor below a setter for TagNode::gpuAddress.
    struct MockTagNode : TagNode<TagType> {
        void setGpuAddress(uint64_t address) {
            this->gpuAddress = address;
        }
    };

    FixedGpuAddressTagAllocator(CommandStreamReceiver &csr, uint64_t gpuAddress)
        : TagAllocator<TagType>(csr.getMemoryManager(), csr.getPreferredTagPoolSize(), MemoryConstants::cacheLineSize) {
        // The head node is reinterpreted as MockTagNode purely to reach the setter.
        reinterpret_cast<MockTagNode *>(this->freeTags.peekHead())->setGpuAddress(gpuAddress);
    }
};
// Enqueues a kernel with perf counters enabled, using tag allocators whose head
// node has a fixed GPU address, and checks that the MI_STORE_REGISTER_MEM
// commands in the command stream target those GPU addresses plus the field
// offsets inside HwTimeStamps / HwPerfCounter.
HWTEST_F(ProfilingWithPerfCountersTests, GIVENCommandQueueWithProfilingPerfCountersWHENWalkerIsDispatchedTHENRegisterStoresArePresentInCS) {
    uint64_t timeStampGpuAddress = 0x123456000;
    uint64_t perfCountersGpuAddress = 0xabcdef000;

    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    csr.profilingTimeStampAllocator.reset(new FixedGpuAddressTagAllocator<HwTimeStamps>(csr, timeStampGpuAddress));
    csr.perfCounterAllocator.reset(new FixedGpuAddressTagAllocator<HwPerfCounter>(csr, perfCountersGpuAddress));

    pCmdQ->setPerfCountersEnabled(true, 1);

    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    uint32_t dimensions = 1;
    cl_event event = nullptr;
    cl_kernel clKernel = &kernel;

    // Stop here if the enqueue fails: the event handle is only meaningful on
    // success and everything below dereferences it.
    auto retVal = static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
        clKernel,
        dimensions,
        globalOffsets,
        workItems,
        nullptr,
        0,
        nullptr,
        &event);
    ASSERT_EQ(CL_SUCCESS, retVal);
    ASSERT_NE(nullptr, event);

    auto pEvent = static_cast<MockEvent<Event> *>(event);
    EXPECT_EQ(pEvent->getHwTimeStampNode()->getGpuAddress(), timeStampGpuAddress);
    EXPECT_EQ(pEvent->getHwPerfCounterNode()->getGpuAddress(), perfCountersGpuAddress);

    parseCommands<FamilyType>(*pCmdQ);

    // Stores expected before the walker: begin timestamp and begin perf report.
    auto itor = expectStoreRegister<FamilyType>(cmdList.begin(), timeStampGpuAddress + offsetof(HwTimeStamps, ContextStartTS), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdBegin), INSTR_MMIO_NOOPID);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqBegin), INSTR_MMIO_RPSTAT1);
    for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
        itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Gp) + i * sizeof(cl_uint),
                                               INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint));
    }
    // NOTE(review): the user-register offsets (+0, +8, +12) and register
    // addresses (0, 0, 4) presumably follow from the configuration selected by
    // setPerfCountersEnabled(true, 1) above — confirm against the fixture.
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User), 0);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User) + 8, 0);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User) + 12, 4);

    // after WALKER:

    itor = expectStoreRegister<FamilyType>(itor, timeStampGpuAddress + offsetof(HwTimeStamps, ContextEndTS), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.OaStatus), INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.OaHead), INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.OaTail), INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
    for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
        itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Gp) + i * sizeof(cl_uint),
                                               INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint));
    }
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdEnd), INSTR_MMIO_NOOPID);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqEnd), INSTR_MMIO_RPSTAT1);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User), 0);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User) + 8, 0);
    itor = expectStoreRegister<FamilyType>(itor, perfCountersGpuAddress + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User) + 12, 4);

    EXPECT_TRUE(pEvent->calcProfilingData());

    clReleaseEvent(event);
    pCmdQ->setPerfCountersEnabled(false, UINT32_MAX);
}
struct MockTimestampContainer : public TimestampPacketContainer { struct MockTimestampContainer : public TimestampPacketContainer {
~MockTimestampContainer() override { ~MockTimestampContainer() override {
for (const auto &node : timestampPacketNodes) { for (const auto &node : timestampPacketNodes) {