Store single KernelInfo in Kernel

Remove the root device index parameter from Kernel's methods.

Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2021-03-22 15:26:03 +00:00
committed by Compute-Runtime-Automation
parent ecceddcab6
commit 7098e9c5f2
136 changed files with 1043 additions and 1192 deletions

View File

@@ -536,15 +536,14 @@ void CommandQueue::enqueueBlockedMapUnmapOperation(const cl_event *eventWaitList
bool CommandQueue::setupDebugSurface(Kernel *kernel) {
auto debugSurface = getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
auto rootDeviceIndex = device->getRootDeviceIndex();
DEBUG_BREAK_IF(!kernel->requiresSshForBuffers(rootDeviceIndex));
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap(rootDeviceIndex)),
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful);
DEBUG_BREAK_IF(!kernel->requiresSshForBuffers());
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()),
kernel->getKernelInfo().kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful);
void *addressToPatch = reinterpret_cast<void *>(debugSurface->getGpuAddress());
size_t sizeToPatch = debugSurface->getUnderlyingBufferSize();
Buffer::setSurfaceState(&device->getDevice(), surfaceState, false, false, sizeToPatch,
addressToPatch, 0, debugSurface, 0, 0,
kernel->getDefaultKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
kernel->getTotalNumDevicesInContext());
return true;
}
@@ -894,7 +893,7 @@ void CommandQueue::aubCaptureHook(bool &blocking, bool &clearAllDependencies, co
if (getGpgpuCommandStreamReceiver().getType() > CommandStreamReceiverType::CSR_HW) {
for (auto &dispatchInfo : multiDispatchInfo) {
auto kernelName = dispatchInfo.getKernel()->getKernelInfo(device->getRootDeviceIndex()).kernelDescriptor.kernelMetadata.kernelName;
auto kernelName = dispatchInfo.getKernel()->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName;
getGpgpuCommandStreamReceiver().addAubComment(kernelName.c_str());
}
}

View File

@@ -67,11 +67,10 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount
if (DebugManager.flags.ForceDispatchScheduler.get()) {
forceDispatchScheduler(multiDispatchInfo);
} else {
auto rootDeviceIndex = device->getRootDeviceIndex();
kernel->updateAuxTranslationRequired();
if (kernel->isAuxTranslationRequired()) {
kernel->fillWithKernelObjsForAuxTranslation(kernelObjsForAuxTranslation, rootDeviceIndex);
kernel->fillWithKernelObjsForAuxTranslation(kernelObjsForAuxTranslation);
multiDispatchInfo.setKernelObjsForAuxTranslation(kernelObjsForAuxTranslation);
if (!kernelObjsForAuxTranslation.empty()) {
@@ -86,13 +85,13 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount
dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::AuxToNonAux);
}
if (kernel->getKernelInfo(rootDeviceIndex).builtinDispatchBuilder == nullptr) {
if (kernel->getKernelInfo().builtinDispatchBuilder == nullptr) {
DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::WalkerSplit> builder(getClDevice());
builder.setDispatchGeometry(workDim, workItems, enqueuedWorkSizes, globalOffsets, Vec3<size_t>{0, 0, 0}, localWorkSizesIn);
builder.setKernel(kernel);
builder.bake(multiDispatchInfo);
} else {
auto builder = kernel->getKernelInfo(rootDeviceIndex).builtinDispatchBuilder;
auto builder = kernel->getKernelInfo().builtinDispatchBuilder;
builder->buildDispatchInfos(multiDispatchInfo, kernel, workDim, workItems, enqueuedWorkSizes, globalOffsets);
if (multiDispatchInfo.size() == 0) {
@@ -357,7 +356,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
if (blockQueue) {
if (parentKernel) {
size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, device->getRootDeviceIndex());
size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
blockedCommandsData->surfaceStateHeapSizeEM = minSizeSSHForEM;
}
@@ -400,7 +399,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
printfHandler->prepareDispatch(multiDispatchInfo);
}
if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer(device->getRootDeviceIndex())) {
if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
auto &gws = multiDispatchInfo.begin()->getGWS();
auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
size_t workGroupsCount = (gws.x * gws.y * gws.z) /
@@ -569,8 +568,7 @@ void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *d
TagNode<HwTimeStamps> *hwTimeStamps,
bool &blocking) {
auto parentKernel = multiDispatchInfo.peekParentKernel();
auto rootDeviceIndex = devQueueHw->getDevice().getRootDeviceIndex();
size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
bool isCcsUsed = EngineHelpers::isCcs(gpgpuEngine->osContext->getEngineType());
uint32_t taskCount = getGpgpuCommandStreamReceiver().peekTaskCount() + 1;
@@ -684,8 +682,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
printfHandler->makeResident(getGpgpuCommandStreamReceiver());
}
auto rootDeviceIndex = device->getRootDeviceIndex();
if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer(rootDeviceIndex)) {
if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
device->getDevice().syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver());
}
@@ -722,7 +719,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
kernel->makeResident(getGpgpuCommandStreamReceiver());
requiresCoherency |= kernel->requiresCoherency();
mediaSamplerRequired |= kernel->isVmeKernel();
auto numGrfRequiredByKernel = static_cast<uint32_t>(kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.numGrfRequired);
auto numGrfRequiredByKernel = static_cast<uint32_t>(kernel->getKernelInfo().kernelDescriptor.kernelAttributes.numGrfRequired);
numGrfRequired = std::max(numGrfRequired, numGrfRequiredByKernel);
specialPipelineSelectMode |= kernel->requiresSpecialPipelineSelectMode();
auxTranslationRequired |= kernel->isAuxTranslationRequired();
@@ -730,11 +727,11 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
anyUncacheableArgs = true;
}
if (kernel->requiresPerDssBackedBuffer(rootDeviceIndex)) {
if (kernel->requiresPerDssBackedBuffer()) {
usePerDssBackedBuffer = true;
}
if (kernel->getDefaultKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics) {
if (kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics) {
useGlobalAtomics = true;
}
}

View File

@@ -37,8 +37,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
size_t enqueuedLocalWorkSize[3] = {0, 0, 0};
auto &kernel = *pKernel;
auto rootDeviceIndex = device->getRootDeviceIndex();
const auto &kernelInfo = kernel.getKernelInfo(rootDeviceIndex);
const auto &kernelInfo = kernel.getKernelInfo();
if (kernel.isParentKernel && !this->context->getDefaultDeviceQueue()) {
return CL_INVALID_OPERATION;
@@ -109,7 +108,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
Surface *surfaces[] = {&s};
if (context->isProvidingPerformanceHints()) {
if (kernel.hasPrintfOutput(rootDeviceIndex)) {
if (kernel.hasPrintfOutput()) {
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL, PRINTF_DETECTED_IN_KERNEL, kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str());
}
if (kernel.requiresCoherency()) {

View File

@@ -102,7 +102,7 @@ class GpgpuWalkerHelper {
bool disablePerfMode);
static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
static size_t getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel, uint32_t rootDeviceIndex);
static size_t getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel);
static size_t setGpgpuWalkerThreadData(
WALKER_TYPE<GfxFamily> *walkerCmd,
@@ -200,7 +200,7 @@ IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInf
if (Kernel *parentKernel = multiDispatchInfo.peekParentKernel()) {
if (heapType == IndirectHeap::SURFACE_STATE) {
expectedSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, commandQueue.getDevice().getRootDeviceIndex());
expectedSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
} else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
{
DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());

View File

@@ -172,7 +172,7 @@ size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const K
}
template <typename GfxFamily>
size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel, uint32_t rootDeviceIndex) {
size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel) {
return 0u;
}

View File

@@ -68,8 +68,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
IndirectHeap *dsh,
bool isCcsUsed) {
auto rootDeviceIndex = devQueueHw.getDevice().getRootDeviceIndex();
const auto &kernelInfo = scheduler.getKernelInfo(rootDeviceIndex);
const auto &kernelInfo = scheduler.getKernelInfo();
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
@@ -117,8 +116,8 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
auto pGpGpuWalkerCmd = commandStream.getSpaceForCmd<GPGPU_WALKER>();
GPGPU_WALKER cmdWalker = GfxFamily::cmdInitGpgpuWalker;
bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler, rootDeviceIndex);
auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(scheduler, rootDeviceIndex);
bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(scheduler);
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
commandStream,
@@ -126,7 +125,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
*ioh,
*ssh,
scheduler,
scheduler.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed, rootDeviceIndex),
scheduler.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
@@ -195,7 +194,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilin
}
size += PerformanceCounters::getGpuCommandsSize(commandQueue, reservePerfCounters);
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWaDisableRccRhwoOptimization(pKernel, commandQueue.getDevice().getRootDeviceIndex());
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWaDisableRccRhwoOptimization(pKernel);
return size;
}

View File

@@ -102,7 +102,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
size_t sizeToPatch = debugSurface->getUnderlyingBufferSize();
Buffer::setSurfaceState(&commandQueue.getDevice(), commandQueue.getDevice().getDebugger()->getDebugSurfaceReservedSurfaceState(*ssh),
false, false, sizeToPatch, addressToPatch, 0, debugSurface, 0, 0,
mainKernel->getDefaultKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
mainKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
mainKernel->getTotalNumDevicesInContext());
}
@@ -244,7 +244,6 @@ template <typename GfxFamily>
void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh) {
auto parentKernel = multiDispatchInfo.peekParentKernel();
auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
if (blockedQueue) {
size_t dshSize = 0;
@@ -254,7 +253,7 @@ void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueu
if (parentKernel) {
dshSize = commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize();
sshSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
sshSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
iohEqualsDsh = true;
colorCalcSize = static_cast<size_t>(commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize);
} else {

View File

@@ -70,11 +70,10 @@ inline void HardwareInterface<GfxFamily>::programWalker(
Vec3<size_t> &numberOfWorkgroups,
Vec3<size_t> &startOfWorkgroups) {
auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
auto walkerCmdBuf = allocateWalkerSpace(commandStream, kernel);
WALKER_TYPE<GfxFamily> walkerCmd = GfxFamily::cmdInitGpgpuWalker;
uint32_t dim = dispatchInfo.getDim();
uint32_t simd = kernel.getKernelInfo(rootDeviceIndex).getMaxSimdSize();
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
@@ -86,7 +85,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
}
auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel, rootDeviceIndex);
auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
commandStream,
@@ -94,7 +93,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
ioh,
ssh,
kernel,
kernel.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed, rootDeviceIndex),
kernel.getKernelStartOffset(true, kernelUsesLocalIds, isCcsUsed),
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
@@ -105,7 +104,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
true,
commandQueue.getDevice());
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernel.getKernelInfo(rootDeviceIndex).kernelDescriptor,
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernel.getKernelInfo().kernelDescriptor,
globalOffsets, startWorkGroups,
numWorkGroups, localWorkSizes, simd, dim,
false, false, 0u);

View File

@@ -416,11 +416,10 @@ Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
if (kernel != nullptr) {
auto &device = dispatchInfo.getClDevice();
auto rootDeviceIndex = device.getRootDeviceIndex();
const auto &hwInfo = device.getHardwareInfo();
auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
auto isSimulation = device.isSimulation();
if (kernel->requiresLimitedWorkgroupSize(rootDeviceIndex) && hwHelper.isSpecialWorkgroupSizeRequired(hwInfo, isSimulation)) {
if (kernel->requiresLimitedWorkgroupSize() && hwHelper.isSpecialWorkgroupSizeRequired(hwInfo, isSimulation)) {
setSpecialWorkgroupSize(workGroupSize);
} else if (DebugManager.flags.EnableComputeWorkSizeND.get()) {
WorkSizeInfo wsInfo(dispatchInfo);
@@ -428,7 +427,7 @@ Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dispatchInfo.getDim());
} else {
auto maxWorkGroupSize = kernel->getMaxKernelWorkGroupSize();
auto simd = kernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize();
auto simd = kernel->getKernelInfo().getMaxSimdSize();
size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
if (dispatchInfo.getDim() == 1) {
computeWorkgroupSize1D(maxWorkGroupSize, workGroupSize, workItems, simd);
@@ -476,7 +475,7 @@ void provideLocalWorkGroupSizeHints(Context *context, DispatchInfo dispatchInfo)
preferredWorkGroupSize[1] = lws.y;
preferredWorkGroupSize[2] = lws.z;
const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo(dispatchInfo.getClDevice().getRootDeviceIndex());
const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo();
if (dispatchInfo.getEnqueuedWorkgroupSize().x == 0) {
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL, NULL_LOCAL_WORKGROUP_SIZE, kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
preferredWorkGroupSize[0], preferredWorkGroupSize[1], preferredWorkGroupSize[2]);