Store device specific kernel members per root device

Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2020-12-10 13:22:10 +00:00
committed by Compute-Runtime-Automation
parent 8d2cfd87ae
commit aa1fc85257
30 changed files with 446 additions and 306 deletions

View File

@@ -132,7 +132,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
",", globalWorkSizeIn[2],
",SIMD:, ", kernelInfo.getMaxSimdSize());
if (totalWorkItems > kernel.maxKernelWorkGroupSize) {
if (totalWorkItems > kernel.getMaxKernelWorkGroupSize(rootDeviceIndex)) {
return CL_INVALID_WORK_GROUP_SIZE;
}

View File

@@ -96,31 +96,13 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
// Patch our kernel constants
*scheduler.globalWorkOffsetX = 0;
*scheduler.globalWorkOffsetY = 0;
*scheduler.globalWorkOffsetZ = 0;
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
*scheduler.globalWorkSizeY = 1;
*scheduler.globalWorkSizeZ = 1;
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY = 1;
*scheduler.localWorkSizeZ = 1;
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY2 = 1;
*scheduler.localWorkSizeZ2 = 1;
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.enqueuedLocalWorkSizeY = 1;
*scheduler.enqueuedLocalWorkSizeZ = 1;
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
*scheduler.numWorkGroupsY = 0;
*scheduler.numWorkGroupsZ = 0;
*scheduler.workDim = 1;
scheduler.setGlobalWorkOffsetValues(rootDeviceIndex, 0, 0, 0);
scheduler.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws()), 1, 1);
scheduler.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
scheduler.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
scheduler.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
scheduler.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws() / scheduler.getLws()), 0, 0);
scheduler.setWorkDim(rootDeviceIndex, 1);
// Send our indirect object data
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};

View File

@@ -196,36 +196,23 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
// Patch our kernel constants
*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
*kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
*kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
kernel.setGlobalWorkOffsetValues(rootDeviceIndex, static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
kernel.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));
*kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
if (isMainKernel || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
if (isMainKernel || (!kernel.isLocalWorkSize2Patched(rootDeviceIndex))) {
kernel.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
}
*kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
*kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
kernel.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
kernel.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));
if (isMainKernel) {
*kernel.numWorkGroupsX = static_cast<uint32_t>(totalNumberOfWorkgroups.x);
*kernel.numWorkGroupsY = static_cast<uint32_t>(totalNumberOfWorkgroups.y);
*kernel.numWorkGroupsZ = static_cast<uint32_t>(totalNumberOfWorkgroups.z);
kernel.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
}
*kernel.workDim = dim;
kernel.setWorkDim(rootDeviceIndex, dim);
// Send our indirect object data
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};

View File

@@ -427,7 +427,7 @@ Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dispatchInfo.getDim());
} else {
auto maxWorkGroupSize = kernel->maxKernelWorkGroupSize;
auto maxWorkGroupSize = kernel->getMaxKernelWorkGroupSize(rootDeviceIndex);
auto simd = kernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize();
size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
if (dispatchInfo.getDim() == 1) {