Files
compute-runtime/opencl/source/gtpin/gtpin_callbacks.cpp
Kacper Nowak 27636f7258 fix(gtpin/ocl): Pass debug zebin to gtpin
In order to unify behaviour between L0/OCL with GTPin,
OCL callbacks should return debug zebin (if instrumented binary is
zebin). This commit fixes the currently existing issue with no debug data being
passed to GTPin if instrumented binary is zebin.
Signed-off-by: Kacper Nowak <kacper.nowak@intel.com>
2023-03-07 15:35:42 +01:00

290 lines
13 KiB
C++

/*
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/surface.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/context/context.h"
#include "opencl/source/gtpin/gtpin_defs.h"
#include "opencl/source/gtpin/gtpin_gfx_core_helper.h"
#include "opencl/source/gtpin/gtpin_notify.h"
#include "opencl/source/kernel/kernel.h"
#include "opencl/source/kernel/multi_device_kernel.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/program/program.h"
#include "CL/cl.h"
#include "ocl_igc_shared/gtpin/gtpin_ocl_interface.h"
#include <deque>
#include <vector>
using namespace gtpin;
namespace NEO {
// Mutex type used to guard the Kernel Execution Queue. It is a recursive
// mutex, so a thread that already owns the lock may lock it again.
using GTPinLockType = std::recursive_mutex;
// Table of GT-Pin callbacks; declared here, defined in another translation unit.
extern gtpin::ocl::gtpin_events_t GTPinCallbacks;
// Pointer whose address is passed to the onContextCreate callback as its last
// argument; also exposed through gtpinGetIgcInit() / gtpinSetIgcInit().
igc_init_t *pIgcInit = nullptr;
// Counter post-incremented by gtpinNotifyKernelSubmit() to fabricate the
// "command buffer" handle of each submitted kernel. Starts at 1.
std::atomic<int> sequenceCount(1);
// Command queue remembered by gtpinNotifyPreFlushTask() and consumed (then
// reset to nullptr) by gtpinNotifyFlushTask().
CommandQueue *pCmdQueueForFlushTask = nullptr;
// Kernel Execution Queue: gtpinNotifyKernelSubmit() appends one record per
// submitted kernel; gtpinNotifyTaskCompletion() and gtpinRemoveCommandQueue()
// erase records, and gtpinNotifyPlatformShutdown() clears the whole queue.
std::deque<gtpinkexec_t> kernelExecQueue;
// Lock taken by the functions in this file while they walk or modify
// kernelExecQueue (gtpinNotifyPlatformShutdown() clears the queue without it).
GTPinLockType kernelExecQueueLock;
// Reports a newly created OpenCL context to GT-Pin.
// No-op unless GT-Pin has been initialized. Aborts (UNRECOVERABLE_IF) when the
// context has no device at index 0.
void gtpinNotifyContextCreate(cl_context context) {
    if (!isGTPinInitialized) {
        return;
    }

    auto pContext = castToObjectOrAbort<Context>(context);
    auto pDevice = pContext->getDevice(0);
    UNRECOVERABLE_IF(pDevice == nullptr);

    // Platform description is built from the first device of the context
    auto &coreHelper = pDevice->getGTPinGfxCoreHelper();
    platform_info_t platformInfo;
    platformInfo.gen_version = static_cast<gtpin::GTPIN_GEN_VERSION>(coreHelper.getGenVersion());
    platformInfo.device_id = static_cast<uint32_t>(pDevice->getHardwareInfo().platform.usDeviceID);

    // The callback also receives the address of pIgcInit
    GTPinCallbacks.onContextCreate((context_handle_t)context, &platformInfo, &pIgcInit);
}
// Reports destruction of an OpenCL context to GT-Pin.
// No-op unless GT-Pin has been initialized.
void gtpinNotifyContextDestroy(cl_context context) {
    if (!isGTPinInitialized) {
        return;
    }
    GTPinCallbacks.onContextDestroy((context_handle_t)context);
}
// Reports a newly created kernel to GT-Pin and replaces the kernel's ISA with
// the instrumented binary returned by the onKernelCreate callback.
// Returns early when the kernel handle is null, GT-Pin is not initialized, the
// surface state could not be added, or the kernel heap was already substituted.
void gtpinNotifyKernelCreate(cl_kernel kernel) {
    if (kernel == nullptr || !isGTPinInitialized) {
        return;
    }

    auto pMultiDeviceKernel = castToObjectOrAbort<MultiDeviceKernel>(kernel);
    auto pKernel = pMultiDeviceKernel->getDefaultKernel();
    auto &device = pMultiDeviceKernel->getDevices()[0];

    // Binding table state count is sampled before the SSH is enlarged; it is
    // reported to GT-Pin as the BTI below
    const size_t gtpinBTI = pKernel->getNumberOfBindingTableStates();

    // Enlarge local copy of SSH by 1 SS
    auto &coreHelper = device->getGTPinGfxCoreHelper();
    if (!coreHelper.addSurfaceState(pKernel)) {
        // Kernel with no SSH or Kernel EM, not supported
        return;
    }
    if (pKernel->isKernelHeapSubstituted()) {
        // ISA for this kernel was already substituted
        return;
    }

    // Describe the original kernel for GT-Pin
    auto &kernelInfo = pKernel->getKernelInfo();
    instrument_params_in_t paramsIn = {};
    paramsIn.kernel_type = GTPIN_KERNEL_TYPE_CS;
    paramsIn.simd = (GTPIN_SIMD_WIDTH)kernelInfo.getMaxSimdSize();
    paramsIn.orig_kernel_binary = (uint8_t *)pKernel->getKernelHeap();
    paramsIn.orig_kernel_size = static_cast<uint32_t>(pKernel->getKernelHeapSize());
    paramsIn.buffer_type = GTPIN_BUFFER_BINDFULL;
    paramsIn.buffer_desc.BTI = static_cast<uint32_t>(gtpinBTI);
    paramsIn.igc_hash_id = kernelInfo.shaderHashCode;
    paramsIn.kernel_name = (char *)kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str();
    paramsIn.igc_info = kernelInfo.igcInfoForGtpin;

    if (kernelInfo.debugData.vIsa != nullptr) {
        // Per-kernel debug data is available
        paramsIn.debug_data = kernelInfo.debugData.vIsa;
        paramsIn.debug_data_size = static_cast<uint32_t>(kernelInfo.debugData.vIsaSize);
    } else {
        // Otherwise pass the debug data stored by the program for this root device
        const auto rootDeviceIndex = device->getRootDeviceIndex();
        auto pProgram = pMultiDeviceKernel->getProgram();
        paramsIn.debug_data = pProgram->getDebugData(rootDeviceIndex);
        paramsIn.debug_data_size = static_cast<uint32_t>(pProgram->getDebugDataSize(rootDeviceIndex));
    }

    // Notify GT-Pin that new kernel was created
    cl_context context = &(pKernel->getContext());
    instrument_params_out_t paramsOut = {};
    GTPinCallbacks.onKernelCreate((context_handle_t)(cl_context)context, &paramsIn, &paramsOut);

    // Substitute ISA of created kernel with instrumented code
    pKernel->substituteKernelHeap(paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size);
    pKernel->setKernelId(paramsOut.kernel_id);
}
// Reports a kernel submission to GT-Pin, records it in the Kernel Execution
// Queue and, when GT-Pin returned a resource, programs the kernel's last
// binding table entry with that resource.
// pCmdQueue is interpreted as a CommandQueue *.
// No-op unless GT-Pin has been initialized or when the kernel has no SSH.
void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
    if (!isGTPinInitialized) {
        return;
    }

    auto pCmdQ = static_cast<CommandQueue *>(pCmdQueue);
    auto &device = pCmdQ->getDevice();
    const auto rootDeviceIndex = device.getRootDeviceIndex();
    auto pKernel = castToObjectOrAbort<MultiDeviceKernel>(kernel)->getKernel(rootDeviceIndex);
    if (pKernel->getSurfaceStateHeapSize() == 0) {
        // Kernel with no SSH, not supported
        return;
    }

    Context *pContext = &(pKernel->getContext());
    cl_context context = (cl_context)pContext;
    const uint64_t kernelId = pKernel->getKernelId();
    // The "command buffer" handle is just the next value of the sequence counter
    command_buffer_handle_t commandBuffer = (command_buffer_handle_t)((uintptr_t)(sequenceCount++));
    uint32_t kernelOffset = 0;
    resource_handle_t resource = 0;

    // Notify GT-Pin that abstract "command buffer" was created
    GTPinCallbacks.onCommandBufferCreate((context_handle_t)context, commandBuffer);
    // Notify GT-Pin that kernel was submitted for execution
    GTPinCallbacks.onKernelSubmit(commandBuffer, kernelId, &kernelOffset, &resource);
    pKernel->setStartOffset(kernelOffset);

    // Create new record in Kernel Execution Queue describing submitted kernel
    gtpinkexec_t kExec;
    kExec.pKernel = pKernel;
    kExec.gtpinResource = (cl_mem)resource;
    kExec.commandBuffer = commandBuffer;
    kExec.pCommandQueue = pCmdQ;
    {
        // Lock is held only for the duration of the queue update
        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        kernelExecQueue.push_back(kExec);
    }

    if (!resource) {
        // GT-Pin returned no resource, nothing to patch
        return;
    }

    // Patch SSH[gtpinBTI] with GT-Pin resource
    auto clDevice = pContext->getDevice(0);
    auto &coreHelper = clDevice->getGTPinGfxCoreHelper();
    const size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
    void *pSurfaceState = coreHelper.getSurfaceState(pKernel, gtpinBTI);
    const bool useGlobalAtomics = pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics;
    const auto numDevices = pContext->getNumDevices();

    if (coreHelper.canUseSharedAllocation(device.getHardwareInfo())) {
        // Resource handle is an SVM allocation descriptor in this mode
        auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
        auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
        const size_t size = gpuAllocation->getUnderlyingBufferSize();
        Buffer::setSurfaceState(&device, pSurfaceState, false, false, size, gpuAllocation->getUnderlyingBuffer(), 0, gpuAllocation, 0, 0,
                                useGlobalAtomics, numDevices);
        device.getMemoryManager()->getPageFaultManager()->moveAllocationToGpuDomain(reinterpret_cast<void *>(gpuAllocation->getGpuAddress()));
    } else {
        // Resource handle is a cl_mem buffer in this mode
        cl_mem buffer = (cl_mem)resource;
        auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
        pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device,
                                useGlobalAtomics, numDevices);
    }
}
void gtpinNotifyPreFlushTask(void *pCmdQueue) {
if (isGTPinInitialized) {
pCmdQueueForFlushTask = (CommandQueue *)pCmdQueue;
}
}
// Stores the flushed task count in the first Kernel Execution Queue record
// that belongs to the command queue remembered by gtpinNotifyPreFlushTask()
// and has no valid task count yet, then forgets that command queue.
// No-op unless GT-Pin has been initialized.
void gtpinNotifyFlushTask(TaskCountType flushedTaskCount) {
    if (!isGTPinInitialized) {
        return;
    }

    std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
    for (auto &kExec : kernelExecQueue) {
        if (kExec.isTaskCountValid || (kExec.pCommandQueue != pCmdQueueForFlushTask)) {
            continue;
        }
        // Update record in Kernel Execution Queue with kernel's TC
        kExec.isTaskCountValid = true;
        kExec.taskCount = flushedTaskCount;
        break;
    }
    pCmdQueueForFlushTask = nullptr;
}
// Walks the Kernel Execution Queue and, for every record with a valid task
// count not greater than completedTaskCount, notifies GT-Pin that the
// record's "command buffer" completed and erases the record.
// Note: unlike the other notifications in this file, this one does not test
// isGTPinInitialized.
void gtpinNotifyTaskCompletion(TaskCountType completedTaskCount) {
    std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};

    size_t remaining = kernelExecQueue.size();
    size_t idx = 0;
    while (idx < remaining) {
        const bool completed = kernelExecQueue[idx].isTaskCountValid &&
                               (kernelExecQueue[idx].taskCount <= completedTaskCount);
        if (!completed) {
            // Keep the record and move on to the next one
            ++idx;
            continue;
        }
        // Notify GT-Pin that execution of "command buffer" was completed
        GTPinCallbacks.onCommandBufferComplete(kernelExecQueue[idx].commandBuffer);
        // Remove kernel's record from Kernel Execution Queue; idx now refers
        // to the element that followed the erased one
        kernelExecQueue.erase(kernelExecQueue.begin() + idx);
        --remaining;
    }
}
// Makes the GT-Pin resource of a submitted kernel resident on the given
// command stream receiver.
// pKernel is interpreted as a Kernel *, pCSR as a CommandStreamReceiver *.
// Only the first matching, not-yet-resident record with a GT-Pin resource is
// processed. No-op unless GT-Pin has been initialized.
void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {
    if (!isGTPinInitialized) {
        return;
    }

    std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
    for (auto &kExec : kernelExecQueue) {
        if ((kExec.pKernel != pKernel) || kExec.isResourceResident || !kExec.gtpinResource) {
            continue;
        }

        // It's time for kernel to make resident its GT-Pin resource
        auto pCommandStreamReceiver = reinterpret_cast<CommandStreamReceiver *>(pCSR);
        Context &context = static_cast<Kernel *>(pKernel)->getContext();
        auto clDevice = context.getDevice(0);
        auto &coreHelper = clDevice->getGTPinGfxCoreHelper();

        GraphicsAllocation *pGfxAlloc = nullptr;
        if (coreHelper.canUseSharedAllocation(clDevice->getHardwareInfo())) {
            // Resource handle is an SVM allocation descriptor in this mode
            auto allocData = reinterpret_cast<SvmAllocationData *>(kExec.gtpinResource);
            pGfxAlloc = allocData->gpuAllocations.getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
        } else {
            // Resource handle is a cl_mem buffer in this mode
            cl_mem gtpinBuffer = kExec.gtpinResource;
            auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
            pGfxAlloc = pBuffer->getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
        }

        pCommandStreamReceiver->makeResident(*pGfxAlloc);
        kExec.isResourceResident = true;
        break;
    }
}
void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResVec) {
if (isGTPinInitialized) {
std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
size_t numElems = kernelExecQueue.size();
for (size_t n = 0; n < numElems; n++) {
if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident && kernelExecQueue[n].gtpinResource) {
// It's time for kernel to update its residency list with its GT-Pin resource
std::vector<Surface *> *pResidencyVector = (std::vector<Surface *> *)pResVec;
cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
auto rootDeviceIndex = kernelExecQueue[n].pCommandQueue->getDevice().getRootDeviceIndex();
GraphicsAllocation *pGfxAlloc = pBuffer->getGraphicsAllocation(rootDeviceIndex);
GeneralSurface *pSurface = new GeneralSurface(pGfxAlloc);
pResidencyVector->push_back(pSurface);
kernelExecQueue[n].isResourceResident = true;
break;
}
}
}
}
// Drops every record of the Kernel Execution Queue on platform shutdown.
// No-op unless GT-Pin has been initialized. The queue lock is not taken here.
void gtpinNotifyPlatformShutdown() {
    if (!isGTPinInitialized) {
        return;
    }
    // Clear Kernel Execution Queue
    kernelExecQueue.clear();
}
// Returns the current pIgcInit pointer as an untyped pointer.
void *gtpinGetIgcInit() {
    return static_cast<void *>(pIgcInit);
}
// Overwrites pIgcInit with the given pointer, interpreted as igc_init_t *.
void gtpinSetIgcInit(void *pIgcInitPtr) {
    auto pTypedIgcInit = static_cast<igc_init_t *>(pIgcInitPtr);
    pIgcInit = pTypedIgcInit;
}
// Erases from the Kernel Execution Queue every record that was submitted
// through the given command queue.
// No-op unless GT-Pin has been initialized.
void gtpinRemoveCommandQueue(void *pCmdQueue) {
    if (!isGTPinInitialized) {
        return;
    }

    std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
    auto it = kernelExecQueue.begin();
    while (it != kernelExecQueue.end()) {
        if (it->pCommandQueue == pCmdQueue) {
            // erase() returns the iterator following the removed record
            it = kernelExecQueue.erase(it);
        } else {
            ++it;
        }
    }
}
} // namespace NEO