Store device specific kernel members per root device

Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2020-12-10 13:22:10 +00:00
committed by Compute-Runtime-Automation
parent 8d2cfd87ae
commit aa1fc85257
30 changed files with 446 additions and 306 deletions

View File

@@ -132,7 +132,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
",", globalWorkSizeIn[2],
",SIMD:, ", kernelInfo.getMaxSimdSize());
if (totalWorkItems > kernel.maxKernelWorkGroupSize) {
if (totalWorkItems > kernel.getMaxKernelWorkGroupSize(rootDeviceIndex)) {
return CL_INVALID_WORK_GROUP_SIZE;
}

View File

@@ -96,31 +96,13 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
// Patch our kernel constants
*scheduler.globalWorkOffsetX = 0;
*scheduler.globalWorkOffsetY = 0;
*scheduler.globalWorkOffsetZ = 0;
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
*scheduler.globalWorkSizeY = 1;
*scheduler.globalWorkSizeZ = 1;
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY = 1;
*scheduler.localWorkSizeZ = 1;
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY2 = 1;
*scheduler.localWorkSizeZ2 = 1;
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.enqueuedLocalWorkSizeY = 1;
*scheduler.enqueuedLocalWorkSizeZ = 1;
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
*scheduler.numWorkGroupsY = 0;
*scheduler.numWorkGroupsZ = 0;
*scheduler.workDim = 1;
scheduler.setGlobalWorkOffsetValues(rootDeviceIndex, 0, 0, 0);
scheduler.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws()), 1, 1);
scheduler.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
scheduler.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
scheduler.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
scheduler.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws() / scheduler.getLws()), 0, 0);
scheduler.setWorkDim(rootDeviceIndex, 1);
// Send our indirect object data
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};

View File

@@ -196,36 +196,23 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
// Patch our kernel constants
*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
*kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
*kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
kernel.setGlobalWorkOffsetValues(rootDeviceIndex, static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
kernel.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));
*kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
if (isMainKernel || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
if (isMainKernel || (!kernel.isLocalWorkSize2Patched(rootDeviceIndex))) {
kernel.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
}
*kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
*kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
kernel.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
kernel.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));
if (isMainKernel) {
*kernel.numWorkGroupsX = static_cast<uint32_t>(totalNumberOfWorkgroups.x);
*kernel.numWorkGroupsY = static_cast<uint32_t>(totalNumberOfWorkgroups.y);
*kernel.numWorkGroupsZ = static_cast<uint32_t>(totalNumberOfWorkgroups.z);
kernel.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
}
*kernel.workDim = dim;
kernel.setWorkDim(rootDeviceIndex, dim);
// Send our indirect object data
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};

View File

@@ -427,7 +427,7 @@ Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dispatchInfo.getDim());
} else {
auto maxWorkGroupSize = kernel->maxKernelWorkGroupSize;
auto maxWorkGroupSize = kernel->getMaxKernelWorkGroupSize(rootDeviceIndex);
auto simd = kernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize();
size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
if (dispatchInfo.getDim() == 1) {

View File

@@ -143,7 +143,7 @@ class Context : public BaseObject<_cl_context> {
ContextType peekContextType() const { return contextType; }
SchedulerKernel &getSchedulerKernel();
MOCKABLE_VIRTUAL SchedulerKernel &getSchedulerKernel();
bool isDeviceAssociated(const ClDevice &clDevice) const;
ClDevice *getSubDeviceByIndex(uint32_t subDeviceIndex) const;

View File

@@ -63,10 +63,10 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
}
if (isGTPinInitialized) {
auto pKernel = castToObjectOrAbort<Kernel>(kernel);
size_t gtpinBTI = pKernel->getNumberOfBindingTableStates();
// Enlarge local copy of SSH by 1 SS
auto device = pKernel->getDevices()[0];
auto rootDeviceIndex = device->getRootDeviceIndex();
size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
// Enlarge local copy of SSH by 1 SS
GFXCORE_FAMILY genFamily = device->getHardwareInfo().platform.eRenderCoreFamily;
GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
if (pKernel->isParentKernel || !gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex)) {
@@ -138,7 +138,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
}
GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(rootDeviceIndex) - 1;
void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, rootDeviceIndex);
cl_mem buffer = (cl_mem)resource;
auto pBuffer = castToObjectOrAbort<Buffer>(buffer);

View File

@@ -27,7 +27,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootD
size_t ssSize = sizeof(RENDER_SURFACE_STATE);
size_t btsSize = sizeof(BINDING_TABLE_STATE);
size_t sizeToEnlarge = ssSize + btsSize;
size_t currBTOffset = pKernel->getBindingTableOffset();
size_t currBTOffset = pKernel->getBindingTableOffset(rootDeviceIndex);
size_t currSurfaceStateSize = currBTOffset;
char *pSsh = static_cast<char *>(pKernel->getSurfaceStateHeap(rootDeviceIndex));
char *pNewSsh = new char[sshSize + sizeToEnlarge];
@@ -35,7 +35,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootD
RENDER_SURFACE_STATE *pSS = reinterpret_cast<RENDER_SURFACE_STATE *>(pNewSsh + currSurfaceStateSize);
*pSS = GfxFamily::cmdInitRenderSurfaceState;
size_t newSurfaceStateSize = currSurfaceStateSize + ssSize;
size_t currBTCount = pKernel->getNumberOfBindingTableStates();
size_t currBTCount = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
memcpy_s(pNewSsh + newSurfaceStateSize, sshSize + sizeToEnlarge - newSurfaceStateSize, pSsh + currBTOffset, currBTCount * btsSize);
BINDING_TABLE_STATE *pNewBTS = reinterpret_cast<BINDING_TABLE_STATE *>(pNewSsh + newSurfaceStateSize + currBTCount * btsSize);
*pNewBTS = GfxFamily::cmdInitBindingTableState;
@@ -48,10 +48,10 @@ template <typename GfxFamily>
// Looks up the surface state referenced by binding table entry `bti` in the kernel's
// surface state heap for the given root device.
// Returns nullptr when the kernel has no surface state heap for that device or when
// `bti` is not smaller than the kernel's number of binding table states.
void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) {
using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates())) {
if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates(rootDeviceIndex))) {
return nullptr;
}
// Entry `bti` sits at (binding table offset + bti * sizeof(BINDING_TABLE_STATE)) from the heap base.
auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE))));
auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset(rootDeviceIndex) + bti * sizeof(BINDING_TABLE_STATE))));
// The entry's surface state pointer is applied to the heap base via ptrOffset to reach the surface state.
auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pBts->getSurfaceStatePointer());
return pSurfaceState;
}

View File

@@ -238,7 +238,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0,
kernel.getSurfaceStateHeap(rootDeviceIndex), kernel.getSurfaceStateHeapSize(rootDeviceIndex),
kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());
kernel.getNumberOfBindingTableStates(rootDeviceIndex), kernel.getBindingTableOffset(rootDeviceIndex));
// Copy our sampler state if it exists
uint32_t samplerStateOffset = 0;
@@ -281,7 +281,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr);
auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates(rootDeviceIndex)));
if (resetBindingTablePrefetch(kernel)) {
bindingTablePrefetchSize = 0;
}

View File

@@ -78,8 +78,9 @@ Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, b
program->retain();
program->retainForKernel();
imageTransformer.reset(new ImageTransformer);
maxKernelWorkGroupSize = static_cast<uint32_t>(deviceVector[0]->getSharedDeviceInfo().maxWorkGroupSize);
for (const auto &pClDevice : deviceVector) {
kernelDeviceInfos[pClDevice->getRootDeviceIndex()].maxKernelWorkGroupSize = static_cast<uint32_t>(pClDevice->getSharedDeviceInfo().maxWorkGroupSize);
}
}
Kernel::~Kernel() {
@@ -170,9 +171,9 @@ template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData
cl_int Kernel::initialize() {
cl_int retVal = CL_OUT_OF_HOST_MEMORY;
do {
reconfigureKernel();
auto pClDevice = &getDevice();
auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
reconfigureKernel(rootDeviceIndex);
auto &hwInfo = pClDevice->getHardwareInfo();
auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
auto &kernelInfo = *kernelInfos[rootDeviceIndex];
@@ -201,40 +202,84 @@ cl_int Kernel::initialize() {
}
auto crossThread = reinterpret_cast<uint32_t *>(kernelDeviceInfos[rootDeviceIndex].crossThreadData);
globalWorkOffsetX = workloadInfo.globalWorkOffsetOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[0]) : globalWorkOffsetX;
globalWorkOffsetY = workloadInfo.globalWorkOffsetOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[1]) : globalWorkOffsetY;
globalWorkOffsetZ = workloadInfo.globalWorkOffsetOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[2]) : globalWorkOffsetZ;
kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX = workloadInfo.globalWorkOffsetOffsets[0] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[0])
: kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX;
kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY = workloadInfo.globalWorkOffsetOffsets[1] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[1])
: kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY;
kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ = workloadInfo.globalWorkOffsetOffsets[2] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[2])
: kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ;
localWorkSizeX = workloadInfo.localWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[0]) : localWorkSizeX;
localWorkSizeY = workloadInfo.localWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[1]) : localWorkSizeY;
localWorkSizeZ = workloadInfo.localWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[2]) : localWorkSizeZ;
kernelDeviceInfos[rootDeviceIndex].localWorkSizeX = workloadInfo.localWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[0])
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeX;
kernelDeviceInfos[rootDeviceIndex].localWorkSizeY = workloadInfo.localWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[1])
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeY;
kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ = workloadInfo.localWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[2])
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ;
localWorkSizeX2 = workloadInfo.localWorkSizeOffsets2[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[0]) : localWorkSizeX2;
localWorkSizeY2 = workloadInfo.localWorkSizeOffsets2[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[1]) : localWorkSizeY2;
localWorkSizeZ2 = workloadInfo.localWorkSizeOffsets2[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[2]) : localWorkSizeZ2;
kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 = workloadInfo.localWorkSizeOffsets2[0] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[0])
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2;
kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2 = workloadInfo.localWorkSizeOffsets2[1] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[1])
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2;
kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2 = workloadInfo.localWorkSizeOffsets2[2] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[2])
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2;
globalWorkSizeX = workloadInfo.globalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[0]) : globalWorkSizeX;
globalWorkSizeY = workloadInfo.globalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[1]) : globalWorkSizeY;
globalWorkSizeZ = workloadInfo.globalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[2]) : globalWorkSizeZ;
kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX = workloadInfo.globalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[0])
: kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX;
kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY = workloadInfo.globalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[1])
: kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY;
kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ = workloadInfo.globalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[2])
: kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ;
enqueuedLocalWorkSizeX = workloadInfo.enqueuedLocalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[0]) : enqueuedLocalWorkSizeX;
enqueuedLocalWorkSizeY = workloadInfo.enqueuedLocalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[1]) : enqueuedLocalWorkSizeY;
enqueuedLocalWorkSizeZ = workloadInfo.enqueuedLocalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[2]) : enqueuedLocalWorkSizeZ;
kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX = workloadInfo.enqueuedLocalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[0])
: kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX;
kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY = workloadInfo.enqueuedLocalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[1])
: kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY;
kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ = workloadInfo.enqueuedLocalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[2])
: kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ;
numWorkGroupsX = workloadInfo.numWorkGroupsOffset[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[0]) : numWorkGroupsX;
numWorkGroupsY = workloadInfo.numWorkGroupsOffset[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[1]) : numWorkGroupsY;
numWorkGroupsZ = workloadInfo.numWorkGroupsOffset[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[2]) : numWorkGroupsZ;
kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX = workloadInfo.numWorkGroupsOffset[0] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[0])
: kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX;
kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY = workloadInfo.numWorkGroupsOffset[1] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[1])
: kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY;
kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ = workloadInfo.numWorkGroupsOffset[2] != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[2])
: kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ;
maxWorkGroupSizeForCrossThreadData = workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.maxWorkGroupSizeOffset) : maxWorkGroupSizeForCrossThreadData;
workDim = workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.workDimOffset) : workDim;
dataParameterSimdSize = workloadInfo.simdSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.simdSizeOffset) : dataParameterSimdSize;
parentEventOffset = workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.parentEventOffset) : parentEventOffset;
preferredWkgMultipleOffset = workloadInfo.preferredWkgMultipleOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.preferredWkgMultipleOffset) : preferredWkgMultipleOffset;
kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData = workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.maxWorkGroupSizeOffset)
: kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData;
kernelDeviceInfos[rootDeviceIndex].workDim = workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.workDimOffset)
: kernelDeviceInfos[rootDeviceIndex].workDim;
kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize = workloadInfo.simdSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.simdSizeOffset) : kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize;
kernelDeviceInfos[rootDeviceIndex].parentEventOffset = workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.parentEventOffset)
: kernelDeviceInfos[rootDeviceIndex].parentEventOffset;
kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset = workloadInfo.preferredWkgMultipleOffset != WorkloadInfo::undefinedOffset
? ptrOffset(crossThread, workloadInfo.preferredWkgMultipleOffset)
: kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset;
*maxWorkGroupSizeForCrossThreadData = maxKernelWorkGroupSize;
*dataParameterSimdSize = maxSimdSize;
*preferredWkgMultipleOffset = maxSimdSize;
*parentEventOffset = WorkloadInfo::invalidParentEvent;
*kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData = kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
*kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize = maxSimdSize;
*kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset = maxSimdSize;
*kernelDeviceInfos[rootDeviceIndex].parentEventOffset = WorkloadInfo::invalidParentEvent;
}
// allocate our own SSH, if necessary
@@ -247,8 +292,8 @@ cl_int Kernel::initialize() {
memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), kernelDeviceInfos[rootDeviceIndex].sshLocalSize,
heapInfo.pSsh, kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
}
numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
// patch crossthread data and ssh with inline surfaces, if necessary
auto perHwThreadPrivateMemorySize = PatchTokenBinary::getPerHwThreadPrivateSurfaceSize(patchInfo.pAllocateStatelessPrivateSurface, kernelInfo.getMaxSimdSize());
@@ -582,7 +627,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
switch (paramName) {
case CL_KERNEL_WORK_GROUP_SIZE:
maxWorkgroupSize = this->maxKernelWorkGroupSize;
maxWorkgroupSize = kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
if (DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get()) {
auto divisionSize = CommonConstants::maximalSimdSize / patchInfo.executionEnvironment->LargestCompiledSIMDSize;
maxWorkgroupSize /= divisionSize;
@@ -646,9 +691,10 @@ cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info para
size_t *paramValueSizeRet) const {
size_t numDimensions = 0;
size_t WGS = 1;
const auto &kernelInfo = getKernelInfo(clDevice.getRootDeviceIndex());
auto rootDeviceIndex = clDevice.getRootDeviceIndex();
const auto &kernelInfo = getKernelInfo(rootDeviceIndex);
auto maxSimdSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());
auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(maxKernelWorkGroupSize));
auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize(rootDeviceIndex)));
auto largestCompiledSIMDSize = static_cast<size_t>(kernelInfo.patchInfo.executionEnvironment->LargestCompiledSIMDSize);
GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);
@@ -811,15 +857,15 @@ size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
: 0;
}
size_t Kernel::getNumberOfBindingTableStates() const {
return numberOfBindingTableStates;
// Returns the binding table state count recorded for the given root device.
size_t Kernel::getNumberOfBindingTableStates(uint32_t rootDeviceIndex) const {
    const auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    return deviceInfo.numberOfBindingTableStates;
}
// Installs a new local surface state heap for the given root device and records its
// size together with the new binding table entry count and binding table offset.
void Kernel::resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
// pSshLocal is a std::unique_ptr<char[]>: reset() releases the previous heap and takes over pNewSsh.
kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(static_cast<char *>(pNewSsh));
// The size is stored as uint32_t, so newSshSize is narrowed here.
kernelDeviceInfos[rootDeviceIndex].sshLocalSize = static_cast<uint32_t>(newSshSize);
numberOfBindingTableStates = newBindingTableCount;
localBindingTableOffset = newBindingTableOffset;
kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = newBindingTableCount;
kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset = newBindingTableOffset;
}
cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
@@ -2564,4 +2610,51 @@ const KernelInfo &Kernel::getDefaultKernelInfo() const {
UNRECOVERABLE_IF(!pKernelInfo);
return *pKernelInfo;
}
// Writes the global work offset for each dimension through the patch pointers kept
// for the given root device. Each pointer targets either a slot in that device's
// cross-thread data (assigned in Kernel::initialize when the offset is defined) or
// Kernel::dummyPatchLocation, so the writes are always to valid memory.
//   rootDeviceIndex    - selects the per-root-device KernelDeviceInfo entry
//   globalWorkOffsetX  - offset for dimension 0
//   globalWorkOffsetY  - offset for dimension 1
//   globalWorkOffsetZ  - offset for dimension 2
void Kernel::setGlobalWorkOffsetValues(uint32_t rootDeviceIndex, uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ) {
    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX = globalWorkOffsetX;
    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY = globalWorkOffsetY;
    // The Z value must go through the Z pointer; writing it through the Y pointer
    // would clobber the Y offset and leave the Z patch location stale.
    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ = globalWorkOffsetZ;
}
// Stores the global work size of each dimension through the patch pointers
// held for the given root device.
void Kernel::setGlobalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ) {
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.globalWorkSizeX = globalWorkSizeX;
    *deviceInfo.globalWorkSizeY = globalWorkSizeY;
    *deviceInfo.globalWorkSizeZ = globalWorkSizeZ;
}
// Stores the local work size of each dimension through the primary local-work-size
// patch pointers held for the given root device.
void Kernel::setLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.localWorkSizeX = localWorkSizeX;
    *deviceInfo.localWorkSizeY = localWorkSizeY;
    *deviceInfo.localWorkSizeZ = localWorkSizeZ;
}
// Stores the local work size of each dimension through the second set of
// local-work-size patch pointers (localWorkSize{X,Y,Z}2) of the given root device.
void Kernel::setLocalWorkSize2Values(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.localWorkSizeX2 = localWorkSizeX;
    *deviceInfo.localWorkSizeY2 = localWorkSizeY;
    *deviceInfo.localWorkSizeZ2 = localWorkSizeZ;
}
// Stores the enqueued local work size of each dimension through the patch pointers
// held for the given root device.
void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.enqueuedLocalWorkSizeX = localWorkSizeX;
    *deviceInfo.enqueuedLocalWorkSizeY = localWorkSizeY;
    *deviceInfo.enqueuedLocalWorkSizeZ = localWorkSizeZ;
}
// Reports whether localWorkSizeX2 of the given root device points somewhere other
// than the shared dummy patch location.
bool Kernel::isLocalWorkSize2Patched(uint32_t rootDeviceIndex) {
    const uint32_t *patchTarget = kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2;
    return !(patchTarget == &dummyPatchLocation);
}
// Stores the work group count of each dimension through the patch pointers
// held for the given root device.
void Kernel::setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) {
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.numWorkGroupsX = numWorkGroupsX;
    *deviceInfo.numWorkGroupsY = numWorkGroupsY;
    *deviceInfo.numWorkGroupsZ = numWorkGroupsZ;
}
// Stores the work dimension count through the workDim patch pointer of the given root device.
void Kernel::setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim) {
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.workDim = workDim;
}
// Returns the maximum kernel work group size stored for the given root device.
uint32_t Kernel::getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const {
    const auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    return deviceInfo.maxKernelWorkGroupSize;
}
} // namespace NEO

View File

@@ -161,9 +161,9 @@ class Kernel : public BaseObject<_cl_kernel> {
size_t getKernelHeapSize(uint32_t rootDeviceIndex) const;
size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const;
size_t getDynamicStateHeapSize(uint32_t rootDeviceIndex) const;
size_t getNumberOfBindingTableStates() const;
size_t getBindingTableOffset() const {
return localBindingTableOffset;
size_t getNumberOfBindingTableStates(uint32_t rootDeviceIndex) const;
// Returns the local binding table offset stored for the given root device.
size_t getBindingTableOffset(uint32_t rootDeviceIndex) const {
    const auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    return deviceInfo.localBindingTableOffset;
}
void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
@@ -304,37 +304,6 @@ class Kernel : public BaseObject<_cl_kernel> {
size_t argSize,
const void *argValue) const;
uint32_t *globalWorkOffsetX = &Kernel::dummyPatchLocation;
uint32_t *globalWorkOffsetY = &Kernel::dummyPatchLocation;
uint32_t *globalWorkOffsetZ = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeX = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeY = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeZ = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeX2 = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeY2 = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeZ2 = &Kernel::dummyPatchLocation;
uint32_t *globalWorkSizeX = &Kernel::dummyPatchLocation;
uint32_t *globalWorkSizeY = &Kernel::dummyPatchLocation;
uint32_t *globalWorkSizeZ = &Kernel::dummyPatchLocation;
uint32_t *enqueuedLocalWorkSizeX = &Kernel::dummyPatchLocation;
uint32_t *enqueuedLocalWorkSizeY = &Kernel::dummyPatchLocation;
uint32_t *enqueuedLocalWorkSizeZ = &Kernel::dummyPatchLocation;
uint32_t *numWorkGroupsX = &Kernel::dummyPatchLocation;
uint32_t *numWorkGroupsY = &Kernel::dummyPatchLocation;
uint32_t *numWorkGroupsZ = &Kernel::dummyPatchLocation;
uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
uint32_t maxKernelWorkGroupSize = 0;
uint32_t *workDim = &Kernel::dummyPatchLocation;
uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
static uint32_t dummyPatchLocation;
std::vector<size_t> slmSizes;
@@ -426,6 +395,16 @@ class Kernel : public BaseObject<_cl_kernel> {
}
const KernelInfo &getDefaultKernelInfo() const;
void setGlobalWorkOffsetValues(uint32_t rootDeviceIndex, uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ);
void setGlobalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ);
void setLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
void setLocalWorkSize2Values(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
void setEnqueuedLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
bool isLocalWorkSize2Patched(uint32_t rootDeviceIndex);
void setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ);
void setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim);
uint32_t getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const;
protected:
struct ObjectCounts {
uint32_t imageCount;
@@ -511,7 +490,7 @@ class Kernel : public BaseObject<_cl_kernel> {
void resolveArgs();
void reconfigureKernel();
void reconfigureKernel(uint32_t rootDeviceIndex);
void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation);
bool allocationForCacheFlush(GraphicsAllocation *argAllocation) const;
@@ -534,9 +513,6 @@ class Kernel : public BaseObject<_cl_kernel> {
AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None;
size_t numberOfBindingTableStates = 0u;
size_t localBindingTableOffset = 0u;
GraphicsAllocation *kernelReflectionSurface = nullptr;
bool usingSharedObjArgs = false;
@@ -561,6 +537,40 @@ class Kernel : public BaseObject<_cl_kernel> {
uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;
struct KernelDeviceInfo : public NonCopyableClass {
uint32_t *globalWorkOffsetX = &Kernel::dummyPatchLocation;
uint32_t *globalWorkOffsetY = &Kernel::dummyPatchLocation;
uint32_t *globalWorkOffsetZ = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeX = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeY = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeZ = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeX2 = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeY2 = &Kernel::dummyPatchLocation;
uint32_t *localWorkSizeZ2 = &Kernel::dummyPatchLocation;
uint32_t *globalWorkSizeX = &Kernel::dummyPatchLocation;
uint32_t *globalWorkSizeY = &Kernel::dummyPatchLocation;
uint32_t *globalWorkSizeZ = &Kernel::dummyPatchLocation;
uint32_t *enqueuedLocalWorkSizeX = &Kernel::dummyPatchLocation;
uint32_t *enqueuedLocalWorkSizeY = &Kernel::dummyPatchLocation;
uint32_t *enqueuedLocalWorkSizeZ = &Kernel::dummyPatchLocation;
uint32_t *numWorkGroupsX = &Kernel::dummyPatchLocation;
uint32_t *numWorkGroupsY = &Kernel::dummyPatchLocation;
uint32_t *numWorkGroupsZ = &Kernel::dummyPatchLocation;
uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
uint32_t maxKernelWorkGroupSize = 0;
uint32_t *workDim = &Kernel::dummyPatchLocation;
uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
size_t numberOfBindingTableStates = 0u;
size_t localBindingTableOffset = 0u;
std::unique_ptr<char[]> pSshLocal;
uint32_t sshLocalSize = 0u;
char *crossThreadData = nullptr;

View File

@@ -13,7 +13,7 @@ namespace NEO {
// Reports whether a cache flush command is required for this kernel.
// This implementation always answers false; commandQueue is not consulted.
bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
return false;
}
void Kernel::reconfigureKernel() {
void Kernel::reconfigureKernel(uint32_t rootDeviceIndex) {
// No-op in this implementation: the body is empty, so nothing is reconfigured
// and rootDeviceIndex is unused.
}
int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) {
if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL) {

View File

@@ -133,8 +133,9 @@ WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t
}
WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
auto &device = dispatchInfo.getClDevice();
const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo(device.getRootDeviceIndex());
this->maxWorkGroupSize = dispatchInfo.getKernel()->maxKernelWorkGroupSize;
auto rootDeviceIndex = device.getRootDeviceIndex();
const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo(rootDeviceIndex);
this->maxWorkGroupSize = dispatchInfo.getKernel()->getMaxKernelWorkGroupSize(rootDeviceIndex);
auto pExecutionEnvironment = kernelInfo.patchInfo.executionEnvironment;
this->hasBarriers = (pExecutionEnvironment != nullptr) && (pExecutionEnvironment->HasBarriers);
this->simdSize = static_cast<uint32_t>(kernelInfo.getMaxSimdSize());

View File

@@ -15,12 +15,12 @@ struct KernelSubGroupInfoFixture : HelloWorldFixture<HelloWorldFixtureFactory> {
void SetUp() override {
ParentClass::SetUp();
pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
maxSimdSize = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize());
ASSERT_LE(8u, maxSimdSize);
maxWorkDim = static_cast<size_t>(pClDevice->getDeviceInfo().maxWorkItemDimensions);
ASSERT_EQ(3u, maxWorkDim);
maxWorkGroupSize = static_cast<size_t>(pKernel->maxKernelWorkGroupSize);
maxWorkGroupSize = static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
ASSERT_GE(1024u, maxWorkGroupSize);
largestCompiledSIMDSize = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).patchInfo.executionEnvironment->LargestCompiledSIMDSize);
ASSERT_EQ(32u, largestCompiledSIMDSize);
@@ -30,8 +30,8 @@ struct KernelSubGroupInfoFixture : HelloWorldFixture<HelloWorldFixtureFactory> {
auto requiredWorkGroupSizeZ = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2]);
calculatedMaxWorkgroupSize = requiredWorkGroupSizeX * requiredWorkGroupSizeY * requiredWorkGroupSizeZ;
if ((calculatedMaxWorkgroupSize == 0) || (calculatedMaxWorkgroupSize > static_cast<size_t>(pKernel->maxKernelWorkGroupSize))) {
calculatedMaxWorkgroupSize = static_cast<size_t>(pKernel->maxKernelWorkGroupSize);
if ((calculatedMaxWorkgroupSize == 0) || (calculatedMaxWorkgroupSize > static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize))) {
calculatedMaxWorkgroupSize = static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
}
}

View File

@@ -259,7 +259,7 @@ HWTEST_F(DispatchWalkerTest, GivenDefaultLwsAlgorithmWhenDispatchingWalkerThenDi
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(dimension, *kernel.workDim);
EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
}
}
@@ -288,7 +288,7 @@ HWTEST_F(DispatchWalkerTest, GivenSquaredLwsAlgorithmWhenDispatchingWalkerThenDi
nullptr,
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(dimension, *kernel.workDim);
EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
}
}
@@ -316,7 +316,7 @@ HWTEST_F(DispatchWalkerTest, GivenNdLwsAlgorithmWhenDispatchingWalkerThenDimensi
nullptr,
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(dimension, *kernel.workDim);
EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
}
}
@@ -345,7 +345,7 @@ HWTEST_F(DispatchWalkerTest, GivenOldLwsAlgorithmWhenDispatchingWalkerThenDimens
nullptr,
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(dimension, *kernel.workDim);
EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
}
}
@@ -375,9 +375,9 @@ HWTEST_F(DispatchWalkerTest, GivenNumWorkGroupsWhenDispatchingWalkerThenNumWorkG
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(2u, *kernel.numWorkGroupsX);
EXPECT_EQ(5u, *kernel.numWorkGroupsY);
EXPECT_EQ(10u, *kernel.numWorkGroupsZ);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
EXPECT_EQ(10u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
}
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -405,9 +405,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatch
nullptr,
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(2u, *kernel.localWorkSizeX);
EXPECT_EQ(5u, *kernel.localWorkSizeY);
EXPECT_EQ(1u, *kernel.localWorkSizeZ);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
}
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -435,9 +435,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThe
nullptr,
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(2u, *kernel.localWorkSizeX);
EXPECT_EQ(5u, *kernel.localWorkSizeY);
EXPECT_EQ(10u, *kernel.localWorkSizeZ);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(10u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
}
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -466,9 +466,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatch
nullptr,
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(2u, *kernel.localWorkSizeX);
EXPECT_EQ(5u, *kernel.localWorkSizeY);
EXPECT_EQ(1u, *kernel.localWorkSizeZ);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
}
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -497,9 +497,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffW
nullptr,
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(2u, *kernel.localWorkSizeX);
EXPECT_EQ(5u, *kernel.localWorkSizeY);
EXPECT_EQ(1u, *kernel.localWorkSizeZ);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
}
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -526,9 +526,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsC
nullptr,
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(1u, *kernel.localWorkSizeX);
EXPECT_EQ(2u, *kernel.localWorkSizeY);
EXPECT_EQ(3u, *kernel.localWorkSizeZ);
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
}
HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -558,12 +558,12 @@ HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLw
nullptr,
nullptr,
CL_COMMAND_NDRANGE_KERNEL);
EXPECT_EQ(1u, *kernel.localWorkSizeX);
EXPECT_EQ(2u, *kernel.localWorkSizeY);
EXPECT_EQ(3u, *kernel.localWorkSizeZ);
EXPECT_EQ(1u, *kernel.localWorkSizeX2);
EXPECT_EQ(2u, *kernel.localWorkSizeY2);
EXPECT_EQ(3u, *kernel.localWorkSizeZ2);
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
}
HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -597,16 +597,16 @@ HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorre
auto dispatchId = 0;
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
if (dispatchId == 0) {
EXPECT_EQ(1u, *kernel.localWorkSizeX);
EXPECT_EQ(2u, *kernel.localWorkSizeY);
EXPECT_EQ(3u, *kernel.localWorkSizeZ);
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
}
if (dispatchId == 1) {
EXPECT_EQ(4u, *kernel.localWorkSizeX);
EXPECT_EQ(5u, *kernel.localWorkSizeY);
EXPECT_EQ(6u, *kernel.localWorkSizeZ);
EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
}
dispatchId++;
}
@@ -646,27 +646,27 @@ HWTEST_F(DispatchWalkerTest, GivenSplitWalkerWhenDispatchingWalkerThenLwsIsCorre
CL_COMMAND_NDRANGE_KERNEL);
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
if (&kernel == &mainKernel) {
EXPECT_EQ(4u, *kernel.localWorkSizeX);
EXPECT_EQ(5u, *kernel.localWorkSizeY);
EXPECT_EQ(6u, *kernel.localWorkSizeZ);
EXPECT_EQ(4u, *kernel.localWorkSizeX2);
EXPECT_EQ(5u, *kernel.localWorkSizeY2);
EXPECT_EQ(6u, *kernel.localWorkSizeZ2);
EXPECT_EQ(3u, *kernel.numWorkGroupsX);
EXPECT_EQ(2u, *kernel.numWorkGroupsY);
EXPECT_EQ(2u, *kernel.numWorkGroupsZ);
EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
} else {
EXPECT_EQ(0u, *kernel.localWorkSizeX);
EXPECT_EQ(0u, *kernel.localWorkSizeY);
EXPECT_EQ(0u, *kernel.localWorkSizeZ);
EXPECT_EQ(1u, *kernel.localWorkSizeX2);
EXPECT_EQ(2u, *kernel.localWorkSizeY2);
EXPECT_EQ(3u, *kernel.localWorkSizeZ2);
EXPECT_EQ(0u, *kernel.numWorkGroupsX);
EXPECT_EQ(0u, *kernel.numWorkGroupsY);
EXPECT_EQ(0u, *kernel.numWorkGroupsZ);
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
}
}
}
@@ -859,8 +859,8 @@ HWTEST_F(DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenWorkDi
CL_COMMAND_NDRANGE_KERNEL);
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
EXPECT_EQ(*kernel.workDim, dispatchInfo.getDim());
auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
EXPECT_EQ(*kernel.kernelDeviceInfos[rootDeviceIndex].workDim, dispatchInfo.getDim());
}
}

View File

@@ -99,7 +99,7 @@ HWTEST_F(EnqueueDebugKernelTest, givenDebugKernelWhenEnqueuedThenSSHAndBtiAreCor
mockCmdQ->enqueueKernel(debugKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(surfaceStates, debugKernel->getBindingTableOffset()));
auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(surfaceStates, debugKernel->getBindingTableOffset(rootDeviceIndex)));
uint32_t surfaceStateOffset = dstBtiTableBase[0].getSurfaceStatePointer();
auto debugSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh.getCpuBase(), surfaceStateOffset));

View File

@@ -1277,9 +1277,9 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreNotAndEventExistSetThenClEnqu
TEST_F(EnqueueKernelTest, givenEnqueueCommandThatLwsExceedsDeviceCapabilitiesWhenEnqueueNDRangeKernelIsCalledThenErrorIsReturned) {
MockKernelWithInternals mockKernel(*pClDevice);
mockKernel.mockKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
mockKernel.mockKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
auto maxKernelWorkgroupSize = mockKernel.mockKernel->maxKernelWorkGroupSize;
auto maxKernelWorkgroupSize = mockKernel.mockKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
size_t globalWorkSize[3] = {maxKernelWorkgroupSize + 1, 1, 1};
size_t localWorkSize[3] = {maxKernelWorkgroupSize + 1, 1, 1};

View File

@@ -64,13 +64,13 @@ TEST_F(EnqueueKernelRequiredWorkSize, GivenUnspecifiedWorkGroupSizeWhenEnqeueing
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(*pKernel->localWorkSizeX, 8u);
EXPECT_EQ(*pKernel->localWorkSizeY, 4u);
EXPECT_EQ(*pKernel->localWorkSizeZ, 4u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX, 8u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY, 4u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ, 4u);
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeX, 8u);
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeY, 4u);
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeZ, 4u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX, 8u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY, 4u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ, 4u);
}
// Fully specified
@@ -91,13 +91,13 @@ TEST_F(EnqueueKernelRequiredWorkSize, GivenRequiredWorkGroupSizeWhenEnqeueingKer
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeX, 8u);
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeY, 4u);
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeZ, 4u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX, 8u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY, 4u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ, 4u);
EXPECT_EQ(*pKernel->localWorkSizeX, 8u);
EXPECT_EQ(*pKernel->localWorkSizeY, 4u);
EXPECT_EQ(*pKernel->localWorkSizeZ, 4u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX, 8u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY, 4u);
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ, 4u);
}
// Underspecified. Won't permit.

View File

@@ -680,8 +680,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_TRUE(containsHint(expectedHint, userData));
}
@@ -692,8 +695,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_TRUE(containsHint(expectedHint, userData));
DebugManager.flags.EnableComputeWorkSizeND.set(isWorkGroupSizeEnabled);
}
@@ -705,8 +711,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_TRUE(containsHint(expectedHint, userData));
DebugManager.flags.EnableComputeWorkSizeND.set(isWorkGroupSizeEnabled);
}
@@ -716,8 +725,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_TRUE(containsHint(expectedHint, userData));
}
@@ -729,8 +741,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_TRUE(containsHint(expectedHint, userData));
}
@@ -742,8 +757,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_TRUE(containsHint(expectedHint, userData));
}

View File

@@ -241,7 +241,7 @@ struct PerformanceHintEnqueueKernelTest : public PerformanceHintEnqueueTest,
ProgramFixture::TearDown();
PerformanceHintEnqueueTest::TearDown();
}
Kernel *kernel = nullptr;
MockKernel *kernel = nullptr;
uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
size_t globalWorkGroupSize[3]{};
};

View File

@@ -328,7 +328,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
Kernel *blockKernel = Kernel::create(pKernel->getProgram(), MockKernel::toKernelInfoContainer(*pBlockInfo, rootDeviceIndex), nullptr);
blockSSH = alignUp(blockSSH, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
if (blockKernel->getNumberOfBindingTableStates() > 0) {
if (blockKernel->getNumberOfBindingTableStates(rootDeviceIndex) > 0) {
ASSERT_NE(nullptr, pBlockInfo->patchInfo.bindingTableState);
auto dstBlockBti = ptrOffset(blockSSH, pBlockInfo->patchInfo.bindingTableState->Offset);
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(dstBlockBti) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE);
@@ -336,7 +336,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
auto srcBlockBti = ptrOffset(pBlockInfo->heapInfo.pSsh, pBlockInfo->patchInfo.bindingTableState->Offset);
auto srcBindingTable = reinterpret_cast<const BINDING_TABLE_STATE *>(srcBlockBti);
for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(); ++i) {
for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(rootDeviceIndex); ++i) {
uint32_t dstSurfaceStatePointer = dstBindingTable[i].getSurfaceStatePointer();
uint32_t srcSurfaceStatePointer = srcBindingTable[i].getSurfaceStatePointer();
auto *dstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh->getCpuBase(), dstSurfaceStatePointer));

View File

@@ -166,7 +166,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue
size_t sshUsed = blockedCommandsData->ssh->getUsed();
size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates() * sizeof(RENDER_SURFACE_STATE) +
size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates(rootDeviceIndex) * sizeof(RENDER_SURFACE_STATE) +
pKernel->getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE) +
UnitTestHelper<FamilyType>::getDefaultSshUsage();

View File

@@ -45,7 +45,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
SchedulerKernel &scheduler = context->getSchedulerKernel();
auto &scheduler = static_cast<MockSchedulerKernel &>(context->getSchedulerKernel());
auto *executionModelDshAllocation = pDevQueueHw->getDshBuffer();
auto *dshHeap = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
@@ -70,27 +70,27 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
false);
EXPECT_EQ(0u, *scheduler.globalWorkOffsetX);
EXPECT_EQ(0u, *scheduler.globalWorkOffsetY);
EXPECT_EQ(0u, *scheduler.globalWorkOffsetZ);
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX);
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ);
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX);
EXPECT_EQ(1u, *scheduler.localWorkSizeY);
EXPECT_EQ(1u, *scheduler.localWorkSizeZ);
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX2);
EXPECT_EQ(1u, *scheduler.localWorkSizeY2);
EXPECT_EQ(1u, *scheduler.localWorkSizeZ2);
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
if (scheduler.enqueuedLocalWorkSizeX != &Kernel::dummyPatchLocation) {
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.enqueuedLocalWorkSizeX);
if (scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX != &Kernel::dummyPatchLocation) {
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
}
EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeY);
EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeZ);
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY);
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ);
EXPECT_EQ((uint32_t)(scheduler.getGws() / scheduler.getLws()), *scheduler.numWorkGroupsX);
EXPECT_EQ(0u, *scheduler.numWorkGroupsY);
EXPECT_EQ(0u, *scheduler.numWorkGroupsZ);
EXPECT_EQ((uint32_t)(scheduler.getGws() / scheduler.getLws()), *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

View File

@@ -97,7 +97,7 @@ struct HelloWorldKernelFixture : public ProgramFixture {
std::string *pKernelName = nullptr;
cl_uint simd = 32;
cl_int retVal = CL_SUCCESS;
Kernel *pKernel = nullptr;
MockKernel *pKernel = nullptr;
MockContext *pContext = nullptr;
};
} // namespace NEO

View File

@@ -2225,11 +2225,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
Kernel *pKernel = castToObject<Kernel>(kernel);
ASSERT_NE(nullptr, pKernel);
size_t numBTS1 = pKernel->getNumberOfBindingTableStates();
size_t numBTS1 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
EXPECT_EQ(2u, numBTS1);
size_t sizeSurfaceStates1 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
EXPECT_NE(0u, sizeSurfaceStates1);
size_t offsetBTS1 = pKernel->getBindingTableOffset();
size_t offsetBTS1 = pKernel->getBindingTableOffset(rootDeviceIndex);
EXPECT_NE(0u, offsetBTS1);
GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().platform.eRenderCoreFamily;
@@ -2241,11 +2241,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
bool surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
EXPECT_TRUE(surfaceAdded);
size_t numBTS2 = pKernel->getNumberOfBindingTableStates();
size_t numBTS2 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
EXPECT_EQ(numBTS1 + 1, numBTS2);
size_t sizeSurfaceStates2 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
EXPECT_GT(sizeSurfaceStates2, sizeSurfaceStates1);
size_t offsetBTS2 = pKernel->getBindingTableOffset();
size_t offsetBTS2 = pKernel->getBindingTableOffset(rootDeviceIndex);
EXPECT_GT(offsetBTS2, offsetBTS1);
void *pSS2 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
@@ -2261,11 +2261,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
EXPECT_FALSE(surfaceAdded);
size_t numBTS3 = pKernel->getNumberOfBindingTableStates();
size_t numBTS3 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
EXPECT_EQ(0u, numBTS3);
size_t sizeSurfaceStates3 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
EXPECT_EQ(0u, sizeSurfaceStates3);
size_t offsetBTS3 = pKernel->getBindingTableOffset();
size_t offsetBTS3 = pKernel->getBindingTableOffset(rootDeviceIndex);
EXPECT_EQ(0u, offsetBTS3);
void *pSS3 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
EXPECT_EQ(nullptr, pSS3);

View File

@@ -385,7 +385,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
*pWalkerCmd = FamilyType::cmdInitGpgpuWalker;
auto expectedBindingTableCount = 3u;
mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
@@ -431,7 +431,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
*pWalkerCmd = FamilyType::cmdInitGpgpuWalker;
auto expectedBindingTableCount = 3u;
mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
auto isScheduler = const_cast<bool *>(&mockKernelWithInternal->mockKernel->isSchedulerKernel);
*isScheduler = true;
@@ -475,7 +475,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
*pWalkerCmd = FamilyType::cmdInitGpgpuWalker;
auto expectedBindingTableCount = 100u;
mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
@@ -802,7 +802,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
// Initialize binding table state pointers with pattern
EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());
EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates(rootDeviceIndex));
const size_t localWorkSizes[3]{256, 1, 1};
@@ -890,7 +890,7 @@ HWTEST_F(HardwareCommandsTest, GivenBuffersNotRequiringSshWhenSettingBindingTabl
auto usedBefore = ssh.getUsed();
// Initialize binding table state pointers with pattern
auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
EXPECT_EQ(0u, numSurfaceStates);
// set binding table states
@@ -933,7 +933,7 @@ HWTEST_F(HardwareCommandsTest, GivenZeroSurfaceStatesWhenSettingBindingTableStat
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
// Initialize binding table state pointers with pattern
auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
EXPECT_EQ(0u, numSurfaceStates);
auto dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);

View File

@@ -47,6 +47,6 @@ struct HardwareCommandsTest : ClDeviceFixture,
size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
return EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState != nullptr) ? srcKernel.getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState->Count : 0,
srcKernel.getSurfaceStateHeap(rootDeviceIndex), srcKernel.getSurfaceStateHeapSize(rootDeviceIndex),
srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
srcKernel.getNumberOfBindingTableStates(rootDeviceIndex), srcKernel.getBindingTableOffset(rootDeviceIndex));
}
};

View File

@@ -65,7 +65,7 @@ class KernelTests : public ProgramFromBinaryFixture {
ASSERT_EQ(CL_SUCCESS, retVal);
// create a kernel
pKernel = Kernel::create(
pKernel = Kernel::create<MockKernel>(
pProgram,
pProgram->getKernelInfosForKernel(kernelName),
&retVal);
@@ -81,7 +81,7 @@ class KernelTests : public ProgramFromBinaryFixture {
ProgramFromBinaryFixture::TearDown();
}
Kernel *pKernel = nullptr;
MockKernel *pKernel = nullptr;
cl_int retVal = CL_SUCCESS;
};
@@ -278,7 +278,7 @@ TEST_F(KernelTests, GivenKernelWorkGroupSizeWhenGettingWorkGroupInfoThenWorkGrou
size_t paramValueSizeRet = 0;
auto kernelMaxWorkGroupSize = pDevice->getDeviceInfo().maxWorkGroupSize - 1;
pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);
pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);
retVal = pKernel->getWorkGroupInfo(
*pClDevice,
@@ -2305,10 +2305,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkOffsetIsCorr
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetX);
EXPECT_NE(nullptr, kernel.globalWorkOffsetY);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetY);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetZ);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ);
}
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSizeIsCorrect) {
@@ -2318,10 +2318,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSizeIsCorrect
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_NE(nullptr, kernel.localWorkSizeX);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.localWorkSizeX);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeY);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeZ);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
}
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSize2IsCorrect) {
@@ -2331,10 +2331,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSize2IsCorrec
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeX2);
EXPECT_NE(nullptr, kernel.localWorkSizeY2);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.localWorkSizeY2);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeZ2);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
}
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkSizeIsCorrect) {
@@ -2344,10 +2344,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkSizeIsCorrec
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkSizeX);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkSizeY);
EXPECT_NE(nullptr, kernel.globalWorkSizeZ);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.globalWorkSizeZ);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ);
}
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkDimIsCorrect) {
@@ -2357,8 +2357,8 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkDimIsCorrect)
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_NE(nullptr, kernel.workDim);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.workDim);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
}
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenNumWorkGroupsIsCorrect) {
@@ -2370,12 +2370,12 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenNumWorkGroupsIsCorrect
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_NE(nullptr, kernel.numWorkGroupsX);
EXPECT_NE(nullptr, kernel.numWorkGroupsY);
EXPECT_NE(nullptr, kernel.numWorkGroupsZ);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsX);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsY);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsZ);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
}
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedLocalWorkSizeIsCorrect) {
@@ -2385,10 +2385,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedLocalWorkSizeI
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_NE(nullptr, kernel.enqueuedLocalWorkSizeX);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeX);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeY);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeZ);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY);
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ);
}
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedMaxWorkGroupSizeIsCorrect) {
@@ -2398,11 +2398,11 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedMaxWorkGroupSi
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_NE(nullptr, kernel.maxWorkGroupSizeForCrossThreadData);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.maxWorkGroupSizeForCrossThreadData);
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.maxWorkGroupSizeOffset), static_cast<void *>(kernel.maxWorkGroupSizeForCrossThreadData));
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.maxWorkGroupSizeForCrossThreadData);
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.maxKernelWorkGroupSize);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.maxWorkGroupSizeOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData));
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
}
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenDataParameterSimdSizeIsCorrect) {
@@ -2414,10 +2414,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenDataParameterSimdSizeI
executionEnvironment.CompiledSIMD8 = true;
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_NE(nullptr, kernel.dataParameterSimdSize);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.dataParameterSimdSize);
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.simdSizeOffset), static_cast<void *>(kernel.dataParameterSimdSize));
EXPECT_EQ_VAL(pKernelInfo->getMaxSimdSize(), *kernel.dataParameterSimdSize);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.simdSizeOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize));
EXPECT_EQ_VAL(pKernelInfo->getMaxSimdSize(), *kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
}
TEST_F(KernelCrossThreadTests, GivenParentEventOffsetWhenKernelIsInitializedThenParentEventIsInitiatedWithInvalid) {
@@ -2425,10 +2425,10 @@ TEST_F(KernelCrossThreadTests, GivenParentEventOffsetWhenKernelIsInitializedThen
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_NE(nullptr, kernel.parentEventOffset);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.parentEventOffset);
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.parentEventOffset), static_cast<void *>(kernel.parentEventOffset));
EXPECT_EQ(WorkloadInfo::invalidParentEvent, *kernel.parentEventOffset);
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.parentEventOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset));
EXPECT_EQ(WorkloadInfo::invalidParentEvent, *kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
}
TEST_F(KernelCrossThreadTests, WhenAddingKernelThenProgramRefCountIsIncremented) {

View File

@@ -17,6 +17,7 @@
#include "opencl/source/sharings/sharing.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/mocks/mock_cl_device.h"
#include "opencl/test/unit_test/mocks/mock_kernel.h"
#include "d3d_sharing_functions.h"
@@ -128,6 +129,52 @@ void MockContext::initializeWithDevices(const ClDeviceVector &devices, bool noSp
setupContextType();
}
// Returns the scheduler kernel cached in schedulerBuiltIn, creating it on first use.
//
// Test override of Context::getSchedulerKernel: the kernel is instantiated as a
// MockSchedulerKernel rather than a plain SchedulerKernel. Creation is routed through
// std::call_once on schedulerBuiltIn->programIsInitialized, so the initializer lambda
// runs at most once per successful initialization.
//
// Every pointer that is dereferenced below is guarded with UNRECOVERABLE_IF first;
// the DEBUG_BREAK_IF checks are kept as additional diagnostics only.
// NOTE(review): presumably DEBUG_BREAK_IF does not stop execution in every build
// configuration, which is why it is not relied upon before a dereference - confirm.
SchedulerKernel &MockContext::getSchedulerKernel() {
    // Fast path: the kernel has already been created.
    if (schedulerBuiltIn->pKernel) {
        return *static_cast<SchedulerKernel *>(schedulerBuiltIn->pKernel);
    }

    auto initializeSchedulerProgramAndKernel = [&] {
        cl_int retVal = CL_SUCCESS;
        auto clDevice = getDevice(0);

        // Build the built-in program from the scheduler binary loaded for the first device.
        auto src = SchedulerKernel::loadSchedulerKernel(&clDevice->getDevice());
        auto program = Program::createBuiltInFromGenBinary(this,
                                                           devices,
                                                           src.resource.data(),
                                                           src.resource.size(),
                                                           &retVal);
        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
        DEBUG_BREAK_IF(!program);
        // program is dereferenced on the next statement; fail loudly instead of crashing on null.
        UNRECOVERABLE_IF(program == nullptr);

        retVal = program->processGenBinary(*clDevice);
        DEBUG_BREAK_IF(retVal != CL_SUCCESS);

        schedulerBuiltIn->pProgram = program;

        // One KernelInfo slot per root device index; only the indices listed in
        // rootDeviceIndices are filled in.
        KernelInfoContainer kernelInfos;
        kernelInfos.resize(getMaxRootDeviceIndex() + 1);
        for (auto rootDeviceIndex : rootDeviceIndices) {
            auto kernelInfo = schedulerBuiltIn->pProgram->getKernelInfo(SchedulerKernel::schedulerName, rootDeviceIndex);
            DEBUG_BREAK_IF(!kernelInfo);
            kernelInfos[rootDeviceIndex] = kernelInfo;
        }

        schedulerBuiltIn->pKernel = Kernel::create<MockSchedulerKernel>(
            schedulerBuiltIn->pProgram,
            kernelInfos,
            &retVal);
        // Inspect the creation result before touching the kernel object.
        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
        UNRECOVERABLE_IF(schedulerBuiltIn->pKernel == nullptr);
        // The scheduler kernel must report a scratch size of zero.
        UNRECOVERABLE_IF(schedulerBuiltIn->pKernel->getScratchSize(clDevice->getRootDeviceIndex()) != 0);
    };
    std::call_once(schedulerBuiltIn->programIsInitialized, initializeSchedulerProgramAndKernel);

    UNRECOVERABLE_IF(schedulerBuiltIn->pKernel == nullptr);
    return *static_cast<SchedulerKernel *>(schedulerBuiltIn->pKernel);
}
MockDefaultContext::MockDefaultContext() : MockContext(nullptr, nullptr) {
pRootDevice0 = ultClDeviceFactory.rootDevices[0];
pRootDevice1 = ultClDeviceFactory.rootDevices[1];

View File

@@ -47,6 +47,8 @@ class MockContext : public Context {
std::unique_ptr<AsyncEventsHandler> &getAsyncEventsHandlerUniquePtr();
void initializeWithDevices(const ClDeviceVector &devices, bool noSpecialQueue);
SchedulerKernel &getSchedulerKernel() override;
private:
ClDevice *pDevice = nullptr;
};

View File

@@ -40,7 +40,6 @@ class MockKernel : public Kernel {
using Kernel::kernelDeviceInfos;
using Kernel::kernelSvmGfxAllocations;
using Kernel::kernelUnifiedMemoryGfxAllocations;
using Kernel::numberOfBindingTableStates;
using Kernel::patchBufferOffset;
using Kernel::patchWithImplicitSurface;
using Kernel::svmAllocationsRequireCacheFlush;
@@ -595,6 +594,7 @@ class MockParentKernel : public Kernel {
// Test double for SchedulerKernel.
// The using-declaration in the public section makes kernelDeviceInfos accessible
// to code that holds a MockSchedulerKernel.
class MockSchedulerKernel : public SchedulerKernel {
  public:
    using SchedulerKernel::kernelDeviceInfos;

    // Forwards both arguments unchanged to the SchedulerKernel constructor.
    MockSchedulerKernel(Program *programArg, const KernelInfoContainer &kernelInfoArg)
        : SchedulerKernel(programArg, kernelInfoArg) {
    }
};