mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-20 08:53:55 +08:00
Store device specific kernel members per root device
Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
8d2cfd87ae
commit
aa1fc85257
@@ -132,7 +132,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
|
||||
",", globalWorkSizeIn[2],
|
||||
",SIMD:, ", kernelInfo.getMaxSimdSize());
|
||||
|
||||
if (totalWorkItems > kernel.maxKernelWorkGroupSize) {
|
||||
if (totalWorkItems > kernel.getMaxKernelWorkGroupSize(rootDeviceIndex)) {
|
||||
return CL_INVALID_WORK_GROUP_SIZE;
|
||||
}
|
||||
|
||||
|
||||
@@ -96,31 +96,13 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
|
||||
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
|
||||
|
||||
// Patch our kernel constants
|
||||
*scheduler.globalWorkOffsetX = 0;
|
||||
*scheduler.globalWorkOffsetY = 0;
|
||||
*scheduler.globalWorkOffsetZ = 0;
|
||||
|
||||
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
|
||||
*scheduler.globalWorkSizeY = 1;
|
||||
*scheduler.globalWorkSizeZ = 1;
|
||||
|
||||
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
|
||||
*scheduler.localWorkSizeY = 1;
|
||||
*scheduler.localWorkSizeZ = 1;
|
||||
|
||||
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
|
||||
*scheduler.localWorkSizeY2 = 1;
|
||||
*scheduler.localWorkSizeZ2 = 1;
|
||||
|
||||
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
|
||||
*scheduler.enqueuedLocalWorkSizeY = 1;
|
||||
*scheduler.enqueuedLocalWorkSizeZ = 1;
|
||||
|
||||
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
|
||||
*scheduler.numWorkGroupsY = 0;
|
||||
*scheduler.numWorkGroupsZ = 0;
|
||||
|
||||
*scheduler.workDim = 1;
|
||||
scheduler.setGlobalWorkOffsetValues(rootDeviceIndex, 0, 0, 0);
|
||||
scheduler.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws()), 1, 1);
|
||||
scheduler.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
|
||||
scheduler.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
|
||||
scheduler.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
|
||||
scheduler.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws() / scheduler.getLws()), 0, 0);
|
||||
scheduler.setWorkDim(rootDeviceIndex, 1);
|
||||
|
||||
// Send our indirect object data
|
||||
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
|
||||
|
||||
@@ -196,36 +196,23 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
|
||||
|
||||
size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
|
||||
|
||||
auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
|
||||
// Patch our kernel constants
|
||||
*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
|
||||
*kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
|
||||
*kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
|
||||
kernel.setGlobalWorkOffsetValues(rootDeviceIndex, static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
|
||||
kernel.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));
|
||||
|
||||
*kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
|
||||
*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
|
||||
*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
|
||||
|
||||
if (isMainKernel || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
|
||||
*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
|
||||
*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
|
||||
*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
|
||||
if (isMainKernel || (!kernel.isLocalWorkSize2Patched(rootDeviceIndex))) {
|
||||
kernel.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
|
||||
}
|
||||
|
||||
*kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
|
||||
*kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
|
||||
*kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
|
||||
|
||||
*kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
|
||||
*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
|
||||
*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
|
||||
kernel.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
|
||||
kernel.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));
|
||||
|
||||
if (isMainKernel) {
|
||||
*kernel.numWorkGroupsX = static_cast<uint32_t>(totalNumberOfWorkgroups.x);
|
||||
*kernel.numWorkGroupsY = static_cast<uint32_t>(totalNumberOfWorkgroups.y);
|
||||
*kernel.numWorkGroupsZ = static_cast<uint32_t>(totalNumberOfWorkgroups.z);
|
||||
kernel.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
|
||||
}
|
||||
|
||||
*kernel.workDim = dim;
|
||||
kernel.setWorkDim(rootDeviceIndex, dim);
|
||||
|
||||
// Send our indirect object data
|
||||
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
|
||||
|
||||
@@ -427,7 +427,7 @@ Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
|
||||
size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
|
||||
computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dispatchInfo.getDim());
|
||||
} else {
|
||||
auto maxWorkGroupSize = kernel->maxKernelWorkGroupSize;
|
||||
auto maxWorkGroupSize = kernel->getMaxKernelWorkGroupSize(rootDeviceIndex);
|
||||
auto simd = kernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize();
|
||||
size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
|
||||
if (dispatchInfo.getDim() == 1) {
|
||||
|
||||
@@ -143,7 +143,7 @@ class Context : public BaseObject<_cl_context> {
|
||||
|
||||
ContextType peekContextType() const { return contextType; }
|
||||
|
||||
SchedulerKernel &getSchedulerKernel();
|
||||
MOCKABLE_VIRTUAL SchedulerKernel &getSchedulerKernel();
|
||||
|
||||
bool isDeviceAssociated(const ClDevice &clDevice) const;
|
||||
ClDevice *getSubDeviceByIndex(uint32_t subDeviceIndex) const;
|
||||
|
||||
@@ -63,10 +63,10 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
|
||||
}
|
||||
if (isGTPinInitialized) {
|
||||
auto pKernel = castToObjectOrAbort<Kernel>(kernel);
|
||||
size_t gtpinBTI = pKernel->getNumberOfBindingTableStates();
|
||||
// Enlarge local copy of SSH by 1 SS
|
||||
auto device = pKernel->getDevices()[0];
|
||||
auto rootDeviceIndex = device->getRootDeviceIndex();
|
||||
size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
|
||||
// Enlarge local copy of SSH by 1 SS
|
||||
GFXCORE_FAMILY genFamily = device->getHardwareInfo().platform.eRenderCoreFamily;
|
||||
GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
|
||||
if (pKernel->isParentKernel || !gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex)) {
|
||||
@@ -138,7 +138,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
|
||||
}
|
||||
GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
|
||||
GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
|
||||
size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
|
||||
size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(rootDeviceIndex) - 1;
|
||||
void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, rootDeviceIndex);
|
||||
cl_mem buffer = (cl_mem)resource;
|
||||
auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
|
||||
|
||||
@@ -27,7 +27,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootD
|
||||
size_t ssSize = sizeof(RENDER_SURFACE_STATE);
|
||||
size_t btsSize = sizeof(BINDING_TABLE_STATE);
|
||||
size_t sizeToEnlarge = ssSize + btsSize;
|
||||
size_t currBTOffset = pKernel->getBindingTableOffset();
|
||||
size_t currBTOffset = pKernel->getBindingTableOffset(rootDeviceIndex);
|
||||
size_t currSurfaceStateSize = currBTOffset;
|
||||
char *pSsh = static_cast<char *>(pKernel->getSurfaceStateHeap(rootDeviceIndex));
|
||||
char *pNewSsh = new char[sshSize + sizeToEnlarge];
|
||||
@@ -35,7 +35,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootD
|
||||
RENDER_SURFACE_STATE *pSS = reinterpret_cast<RENDER_SURFACE_STATE *>(pNewSsh + currSurfaceStateSize);
|
||||
*pSS = GfxFamily::cmdInitRenderSurfaceState;
|
||||
size_t newSurfaceStateSize = currSurfaceStateSize + ssSize;
|
||||
size_t currBTCount = pKernel->getNumberOfBindingTableStates();
|
||||
size_t currBTCount = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
|
||||
memcpy_s(pNewSsh + newSurfaceStateSize, sshSize + sizeToEnlarge - newSurfaceStateSize, pSsh + currBTOffset, currBTCount * btsSize);
|
||||
BINDING_TABLE_STATE *pNewBTS = reinterpret_cast<BINDING_TABLE_STATE *>(pNewSsh + newSurfaceStateSize + currBTCount * btsSize);
|
||||
*pNewBTS = GfxFamily::cmdInitBindingTableState;
|
||||
@@ -48,10 +48,10 @@ template <typename GfxFamily>
|
||||
void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) {
|
||||
using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
|
||||
|
||||
if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates())) {
|
||||
if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates(rootDeviceIndex))) {
|
||||
return nullptr;
|
||||
}
|
||||
auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE))));
|
||||
auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset(rootDeviceIndex) + bti * sizeof(BINDING_TABLE_STATE))));
|
||||
auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pBts->getSurfaceStatePointer());
|
||||
return pSurfaceState;
|
||||
}
|
||||
|
||||
@@ -238,7 +238,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
|
||||
auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0,
|
||||
kernel.getSurfaceStateHeap(rootDeviceIndex), kernel.getSurfaceStateHeapSize(rootDeviceIndex),
|
||||
kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());
|
||||
kernel.getNumberOfBindingTableStates(rootDeviceIndex), kernel.getBindingTableOffset(rootDeviceIndex));
|
||||
|
||||
// Copy our sampler state if it exists
|
||||
uint32_t samplerStateOffset = 0;
|
||||
@@ -281,7 +281,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr);
|
||||
|
||||
auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
|
||||
auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates(rootDeviceIndex)));
|
||||
if (resetBindingTablePrefetch(kernel)) {
|
||||
bindingTablePrefetchSize = 0;
|
||||
}
|
||||
|
||||
@@ -78,8 +78,9 @@ Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, b
|
||||
program->retain();
|
||||
program->retainForKernel();
|
||||
imageTransformer.reset(new ImageTransformer);
|
||||
|
||||
maxKernelWorkGroupSize = static_cast<uint32_t>(deviceVector[0]->getSharedDeviceInfo().maxWorkGroupSize);
|
||||
for (const auto &pClDevice : deviceVector) {
|
||||
kernelDeviceInfos[pClDevice->getRootDeviceIndex()].maxKernelWorkGroupSize = static_cast<uint32_t>(pClDevice->getSharedDeviceInfo().maxWorkGroupSize);
|
||||
}
|
||||
}
|
||||
|
||||
Kernel::~Kernel() {
|
||||
@@ -170,9 +171,9 @@ template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData
|
||||
cl_int Kernel::initialize() {
|
||||
cl_int retVal = CL_OUT_OF_HOST_MEMORY;
|
||||
do {
|
||||
reconfigureKernel();
|
||||
auto pClDevice = &getDevice();
|
||||
auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
|
||||
reconfigureKernel(rootDeviceIndex);
|
||||
auto &hwInfo = pClDevice->getHardwareInfo();
|
||||
auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
|
||||
auto &kernelInfo = *kernelInfos[rootDeviceIndex];
|
||||
@@ -201,40 +202,84 @@ cl_int Kernel::initialize() {
|
||||
}
|
||||
|
||||
auto crossThread = reinterpret_cast<uint32_t *>(kernelDeviceInfos[rootDeviceIndex].crossThreadData);
|
||||
globalWorkOffsetX = workloadInfo.globalWorkOffsetOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[0]) : globalWorkOffsetX;
|
||||
globalWorkOffsetY = workloadInfo.globalWorkOffsetOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[1]) : globalWorkOffsetY;
|
||||
globalWorkOffsetZ = workloadInfo.globalWorkOffsetOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[2]) : globalWorkOffsetZ;
|
||||
kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX = workloadInfo.globalWorkOffsetOffsets[0] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[0])
|
||||
: kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX;
|
||||
kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY = workloadInfo.globalWorkOffsetOffsets[1] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[1])
|
||||
: kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY;
|
||||
kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ = workloadInfo.globalWorkOffsetOffsets[2] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[2])
|
||||
: kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ;
|
||||
|
||||
localWorkSizeX = workloadInfo.localWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[0]) : localWorkSizeX;
|
||||
localWorkSizeY = workloadInfo.localWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[1]) : localWorkSizeY;
|
||||
localWorkSizeZ = workloadInfo.localWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[2]) : localWorkSizeZ;
|
||||
kernelDeviceInfos[rootDeviceIndex].localWorkSizeX = workloadInfo.localWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[0])
|
||||
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeX;
|
||||
kernelDeviceInfos[rootDeviceIndex].localWorkSizeY = workloadInfo.localWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[1])
|
||||
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeY;
|
||||
kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ = workloadInfo.localWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[2])
|
||||
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ;
|
||||
|
||||
localWorkSizeX2 = workloadInfo.localWorkSizeOffsets2[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[0]) : localWorkSizeX2;
|
||||
localWorkSizeY2 = workloadInfo.localWorkSizeOffsets2[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[1]) : localWorkSizeY2;
|
||||
localWorkSizeZ2 = workloadInfo.localWorkSizeOffsets2[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[2]) : localWorkSizeZ2;
|
||||
kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 = workloadInfo.localWorkSizeOffsets2[0] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[0])
|
||||
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2;
|
||||
kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2 = workloadInfo.localWorkSizeOffsets2[1] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[1])
|
||||
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2;
|
||||
kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2 = workloadInfo.localWorkSizeOffsets2[2] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[2])
|
||||
: kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2;
|
||||
|
||||
globalWorkSizeX = workloadInfo.globalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[0]) : globalWorkSizeX;
|
||||
globalWorkSizeY = workloadInfo.globalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[1]) : globalWorkSizeY;
|
||||
globalWorkSizeZ = workloadInfo.globalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[2]) : globalWorkSizeZ;
|
||||
kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX = workloadInfo.globalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[0])
|
||||
: kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX;
|
||||
kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY = workloadInfo.globalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[1])
|
||||
: kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY;
|
||||
kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ = workloadInfo.globalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[2])
|
||||
: kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ;
|
||||
|
||||
enqueuedLocalWorkSizeX = workloadInfo.enqueuedLocalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[0]) : enqueuedLocalWorkSizeX;
|
||||
enqueuedLocalWorkSizeY = workloadInfo.enqueuedLocalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[1]) : enqueuedLocalWorkSizeY;
|
||||
enqueuedLocalWorkSizeZ = workloadInfo.enqueuedLocalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[2]) : enqueuedLocalWorkSizeZ;
|
||||
kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX = workloadInfo.enqueuedLocalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[0])
|
||||
: kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX;
|
||||
kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY = workloadInfo.enqueuedLocalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[1])
|
||||
: kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY;
|
||||
kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ = workloadInfo.enqueuedLocalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[2])
|
||||
: kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ;
|
||||
|
||||
numWorkGroupsX = workloadInfo.numWorkGroupsOffset[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[0]) : numWorkGroupsX;
|
||||
numWorkGroupsY = workloadInfo.numWorkGroupsOffset[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[1]) : numWorkGroupsY;
|
||||
numWorkGroupsZ = workloadInfo.numWorkGroupsOffset[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[2]) : numWorkGroupsZ;
|
||||
kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX = workloadInfo.numWorkGroupsOffset[0] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[0])
|
||||
: kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX;
|
||||
kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY = workloadInfo.numWorkGroupsOffset[1] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[1])
|
||||
: kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY;
|
||||
kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ = workloadInfo.numWorkGroupsOffset[2] != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[2])
|
||||
: kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ;
|
||||
|
||||
maxWorkGroupSizeForCrossThreadData = workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.maxWorkGroupSizeOffset) : maxWorkGroupSizeForCrossThreadData;
|
||||
workDim = workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.workDimOffset) : workDim;
|
||||
dataParameterSimdSize = workloadInfo.simdSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.simdSizeOffset) : dataParameterSimdSize;
|
||||
parentEventOffset = workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.parentEventOffset) : parentEventOffset;
|
||||
preferredWkgMultipleOffset = workloadInfo.preferredWkgMultipleOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.preferredWkgMultipleOffset) : preferredWkgMultipleOffset;
|
||||
kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData = workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.maxWorkGroupSizeOffset)
|
||||
: kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData;
|
||||
kernelDeviceInfos[rootDeviceIndex].workDim = workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.workDimOffset)
|
||||
: kernelDeviceInfos[rootDeviceIndex].workDim;
|
||||
kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize = workloadInfo.simdSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.simdSizeOffset) : kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize;
|
||||
kernelDeviceInfos[rootDeviceIndex].parentEventOffset = workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.parentEventOffset)
|
||||
: kernelDeviceInfos[rootDeviceIndex].parentEventOffset;
|
||||
kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset = workloadInfo.preferredWkgMultipleOffset != WorkloadInfo::undefinedOffset
|
||||
? ptrOffset(crossThread, workloadInfo.preferredWkgMultipleOffset)
|
||||
: kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset;
|
||||
|
||||
*maxWorkGroupSizeForCrossThreadData = maxKernelWorkGroupSize;
|
||||
*dataParameterSimdSize = maxSimdSize;
|
||||
*preferredWkgMultipleOffset = maxSimdSize;
|
||||
*parentEventOffset = WorkloadInfo::invalidParentEvent;
|
||||
*kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData = kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
|
||||
*kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize = maxSimdSize;
|
||||
*kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset = maxSimdSize;
|
||||
*kernelDeviceInfos[rootDeviceIndex].parentEventOffset = WorkloadInfo::invalidParentEvent;
|
||||
}
|
||||
|
||||
// allocate our own SSH, if necessary
|
||||
@@ -247,8 +292,8 @@ cl_int Kernel::initialize() {
|
||||
memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), kernelDeviceInfos[rootDeviceIndex].sshLocalSize,
|
||||
heapInfo.pSsh, kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
|
||||
}
|
||||
numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
|
||||
localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
|
||||
kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
|
||||
kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
|
||||
|
||||
// patch crossthread data and ssh with inline surfaces, if necessary
|
||||
auto perHwThreadPrivateMemorySize = PatchTokenBinary::getPerHwThreadPrivateSurfaceSize(patchInfo.pAllocateStatelessPrivateSurface, kernelInfo.getMaxSimdSize());
|
||||
@@ -582,7 +627,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
|
||||
|
||||
switch (paramName) {
|
||||
case CL_KERNEL_WORK_GROUP_SIZE:
|
||||
maxWorkgroupSize = this->maxKernelWorkGroupSize;
|
||||
maxWorkgroupSize = kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
|
||||
if (DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get()) {
|
||||
auto divisionSize = CommonConstants::maximalSimdSize / patchInfo.executionEnvironment->LargestCompiledSIMDSize;
|
||||
maxWorkgroupSize /= divisionSize;
|
||||
@@ -646,9 +691,10 @@ cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info para
|
||||
size_t *paramValueSizeRet) const {
|
||||
size_t numDimensions = 0;
|
||||
size_t WGS = 1;
|
||||
const auto &kernelInfo = getKernelInfo(clDevice.getRootDeviceIndex());
|
||||
auto rootDeviceIndex = clDevice.getRootDeviceIndex();
|
||||
const auto &kernelInfo = getKernelInfo(rootDeviceIndex);
|
||||
auto maxSimdSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());
|
||||
auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(maxKernelWorkGroupSize));
|
||||
auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize(rootDeviceIndex)));
|
||||
auto largestCompiledSIMDSize = static_cast<size_t>(kernelInfo.patchInfo.executionEnvironment->LargestCompiledSIMDSize);
|
||||
|
||||
GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);
|
||||
@@ -811,15 +857,15 @@ size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
|
||||
: 0;
|
||||
}
|
||||
|
||||
size_t Kernel::getNumberOfBindingTableStates() const {
|
||||
return numberOfBindingTableStates;
|
||||
size_t Kernel::getNumberOfBindingTableStates(uint32_t rootDeviceIndex) const {
|
||||
return kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates;
|
||||
}
|
||||
|
||||
void Kernel::resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
|
||||
kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(static_cast<char *>(pNewSsh));
|
||||
kernelDeviceInfos[rootDeviceIndex].sshLocalSize = static_cast<uint32_t>(newSshSize);
|
||||
numberOfBindingTableStates = newBindingTableCount;
|
||||
localBindingTableOffset = newBindingTableOffset;
|
||||
kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = newBindingTableCount;
|
||||
kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset = newBindingTableOffset;
|
||||
}
|
||||
|
||||
cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
|
||||
@@ -2564,4 +2610,51 @@ const KernelInfo &Kernel::getDefaultKernelInfo() const {
|
||||
UNRECOVERABLE_IF(!pKernelInfo);
|
||||
return *pKernelInfo;
|
||||
}
|
||||
void Kernel::setGlobalWorkOffsetValues(uint32_t rootDeviceIndex, uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ) {
    // Patch the global work offset (x, y, z) into this root device's
    // cross-thread-data slots. The targets are pointers held in
    // kernelDeviceInfos; presumably they point either into real cross-thread
    // data or at a dummy patch location (see dummyPatchLocation) — TODO confirm
    // against the kernelDeviceInfos initialization.
    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX = globalWorkOffsetX;
    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY = globalWorkOffsetY;
    // Bug fix: the Z component was previously stored through the Y pointer,
    // clobbering the Y offset and leaving Z unpatched.
    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ = globalWorkOffsetZ;
}
|
||||
|
||||
void Kernel::setGlobalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ) {
    // Patch the global work size (x, y, z) into the per-root-device
    // cross-thread-data slots tracked in kernelDeviceInfos.
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.globalWorkSizeX = globalWorkSizeX;
    *deviceInfo.globalWorkSizeY = globalWorkSizeY;
    *deviceInfo.globalWorkSizeZ = globalWorkSizeZ;
}
|
||||
|
||||
void Kernel::setLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    // Patch the local work size (x, y, z) into the per-root-device
    // cross-thread-data slots tracked in kernelDeviceInfos.
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.localWorkSizeX = localWorkSizeX;
    *deviceInfo.localWorkSizeY = localWorkSizeY;
    *deviceInfo.localWorkSizeZ = localWorkSizeZ;
}
|
||||
|
||||
void Kernel::setLocalWorkSize2Values(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    // Patch the secondary local-work-size slots (the "2" variants) for the
    // given root device.
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.localWorkSizeX2 = localWorkSizeX;
    *deviceInfo.localWorkSizeY2 = localWorkSizeY;
    *deviceInfo.localWorkSizeZ2 = localWorkSizeZ;
}
|
||||
|
||||
void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    // Patch the enqueued local work size (x, y, z) for the given root device.
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.enqueuedLocalWorkSizeX = localWorkSizeX;
    *deviceInfo.enqueuedLocalWorkSizeY = localWorkSizeY;
    *deviceInfo.enqueuedLocalWorkSizeZ = localWorkSizeZ;
}
|
||||
|
||||
bool Kernel::isLocalWorkSize2Patched(uint32_t rootDeviceIndex) {
|
||||
return kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 != &dummyPatchLocation;
|
||||
}
|
||||
|
||||
void Kernel::setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) {
    // Patch the work-group counts (x, y, z) for the given root device.
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.numWorkGroupsX = numWorkGroupsX;
    *deviceInfo.numWorkGroupsY = numWorkGroupsY;
    *deviceInfo.numWorkGroupsZ = numWorkGroupsZ;
}
|
||||
|
||||
void Kernel::setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim) {
    // Patch the work dimension for the given root device.
    auto &deviceInfo = kernelDeviceInfos[rootDeviceIndex];
    *deviceInfo.workDim = workDim;
}
|
||||
|
||||
uint32_t Kernel::getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const {
|
||||
return kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
|
||||
}
|
||||
} // namespace NEO
|
||||
|
||||
@@ -161,9 +161,9 @@ class Kernel : public BaseObject<_cl_kernel> {
|
||||
size_t getKernelHeapSize(uint32_t rootDeviceIndex) const;
|
||||
size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const;
|
||||
size_t getDynamicStateHeapSize(uint32_t rootDeviceIndex) const;
|
||||
size_t getNumberOfBindingTableStates() const;
|
||||
size_t getBindingTableOffset() const {
|
||||
return localBindingTableOffset;
|
||||
size_t getNumberOfBindingTableStates(uint32_t rootDeviceIndex) const;
|
||||
size_t getBindingTableOffset(uint32_t rootDeviceIndex) const {
|
||||
return kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset;
|
||||
}
|
||||
|
||||
void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
|
||||
@@ -304,37 +304,6 @@ class Kernel : public BaseObject<_cl_kernel> {
|
||||
size_t argSize,
|
||||
const void *argValue) const;
|
||||
|
||||
uint32_t *globalWorkOffsetX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *globalWorkOffsetY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *globalWorkOffsetZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *localWorkSizeX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *localWorkSizeY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *localWorkSizeZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *localWorkSizeX2 = &Kernel::dummyPatchLocation;
|
||||
uint32_t *localWorkSizeY2 = &Kernel::dummyPatchLocation;
|
||||
uint32_t *localWorkSizeZ2 = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *globalWorkSizeX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *globalWorkSizeY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *globalWorkSizeZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *enqueuedLocalWorkSizeX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *enqueuedLocalWorkSizeY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *enqueuedLocalWorkSizeZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *numWorkGroupsX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *numWorkGroupsY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *numWorkGroupsZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
|
||||
uint32_t maxKernelWorkGroupSize = 0;
|
||||
uint32_t *workDim = &Kernel::dummyPatchLocation;
|
||||
uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
|
||||
uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
|
||||
uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
|
||||
|
||||
static uint32_t dummyPatchLocation;
|
||||
|
||||
std::vector<size_t> slmSizes;
|
||||
@@ -426,6 +395,16 @@ class Kernel : public BaseObject<_cl_kernel> {
|
||||
}
|
||||
const KernelInfo &getDefaultKernelInfo() const;
|
||||
|
||||
void setGlobalWorkOffsetValues(uint32_t rootDeviceIndex, uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ);
|
||||
void setGlobalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ);
|
||||
void setLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
|
||||
void setLocalWorkSize2Values(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
|
||||
void setEnqueuedLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
|
||||
bool isLocalWorkSize2Patched(uint32_t rootDeviceIndex);
|
||||
void setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ);
|
||||
void setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim);
|
||||
uint32_t getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const;
|
||||
|
||||
protected:
|
||||
struct ObjectCounts {
|
||||
uint32_t imageCount;
|
||||
@@ -511,7 +490,7 @@ class Kernel : public BaseObject<_cl_kernel> {
|
||||
|
||||
void resolveArgs();
|
||||
|
||||
void reconfigureKernel();
|
||||
void reconfigureKernel(uint32_t rootDeviceIndex);
|
||||
|
||||
void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation);
|
||||
bool allocationForCacheFlush(GraphicsAllocation *argAllocation) const;
|
||||
@@ -534,9 +513,6 @@ class Kernel : public BaseObject<_cl_kernel> {
|
||||
|
||||
AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None;
|
||||
|
||||
size_t numberOfBindingTableStates = 0u;
|
||||
size_t localBindingTableOffset = 0u;
|
||||
|
||||
GraphicsAllocation *kernelReflectionSurface = nullptr;
|
||||
|
||||
bool usingSharedObjArgs = false;
|
||||
@@ -561,6 +537,40 @@ class Kernel : public BaseObject<_cl_kernel> {
|
||||
uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;
|
||||
|
||||
struct KernelDeviceInfo : public NonCopyableClass {
|
||||
uint32_t *globalWorkOffsetX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *globalWorkOffsetY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *globalWorkOffsetZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *localWorkSizeX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *localWorkSizeY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *localWorkSizeZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *localWorkSizeX2 = &Kernel::dummyPatchLocation;
|
||||
uint32_t *localWorkSizeY2 = &Kernel::dummyPatchLocation;
|
||||
uint32_t *localWorkSizeZ2 = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *globalWorkSizeX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *globalWorkSizeY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *globalWorkSizeZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *enqueuedLocalWorkSizeX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *enqueuedLocalWorkSizeY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *enqueuedLocalWorkSizeZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *numWorkGroupsX = &Kernel::dummyPatchLocation;
|
||||
uint32_t *numWorkGroupsY = &Kernel::dummyPatchLocation;
|
||||
uint32_t *numWorkGroupsZ = &Kernel::dummyPatchLocation;
|
||||
|
||||
uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
|
||||
uint32_t maxKernelWorkGroupSize = 0;
|
||||
uint32_t *workDim = &Kernel::dummyPatchLocation;
|
||||
uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
|
||||
uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
|
||||
uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
|
||||
|
||||
size_t numberOfBindingTableStates = 0u;
|
||||
size_t localBindingTableOffset = 0u;
|
||||
|
||||
std::unique_ptr<char[]> pSshLocal;
|
||||
uint32_t sshLocalSize = 0u;
|
||||
char *crossThreadData = nullptr;
|
||||
|
||||
@@ -13,7 +13,7 @@ namespace NEO {
|
||||
bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
|
||||
return false;
|
||||
}
|
||||
void Kernel::reconfigureKernel() {
|
||||
void Kernel::reconfigureKernel(uint32_t rootDeviceIndex) {
|
||||
}
|
||||
int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) {
|
||||
if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL) {
|
||||
|
||||
@@ -133,8 +133,9 @@ WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t
|
||||
}
|
||||
WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
|
||||
auto &device = dispatchInfo.getClDevice();
|
||||
const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo(device.getRootDeviceIndex());
|
||||
this->maxWorkGroupSize = dispatchInfo.getKernel()->maxKernelWorkGroupSize;
|
||||
auto rootDeviceIndex = device.getRootDeviceIndex();
|
||||
const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo(rootDeviceIndex);
|
||||
this->maxWorkGroupSize = dispatchInfo.getKernel()->getMaxKernelWorkGroupSize(rootDeviceIndex);
|
||||
auto pExecutionEnvironment = kernelInfo.patchInfo.executionEnvironment;
|
||||
this->hasBarriers = (pExecutionEnvironment != nullptr) && (pExecutionEnvironment->HasBarriers);
|
||||
this->simdSize = static_cast<uint32_t>(kernelInfo.getMaxSimdSize());
|
||||
|
||||
@@ -15,12 +15,12 @@ struct KernelSubGroupInfoFixture : HelloWorldFixture<HelloWorldFixtureFactory> {
|
||||
|
||||
void SetUp() override {
|
||||
ParentClass::SetUp();
|
||||
pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
|
||||
pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
|
||||
maxSimdSize = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize());
|
||||
ASSERT_LE(8u, maxSimdSize);
|
||||
maxWorkDim = static_cast<size_t>(pClDevice->getDeviceInfo().maxWorkItemDimensions);
|
||||
ASSERT_EQ(3u, maxWorkDim);
|
||||
maxWorkGroupSize = static_cast<size_t>(pKernel->maxKernelWorkGroupSize);
|
||||
maxWorkGroupSize = static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
|
||||
ASSERT_GE(1024u, maxWorkGroupSize);
|
||||
largestCompiledSIMDSize = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).patchInfo.executionEnvironment->LargestCompiledSIMDSize);
|
||||
ASSERT_EQ(32u, largestCompiledSIMDSize);
|
||||
@@ -30,8 +30,8 @@ struct KernelSubGroupInfoFixture : HelloWorldFixture<HelloWorldFixtureFactory> {
|
||||
auto requiredWorkGroupSizeZ = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2]);
|
||||
|
||||
calculatedMaxWorkgroupSize = requiredWorkGroupSizeX * requiredWorkGroupSizeY * requiredWorkGroupSizeZ;
|
||||
if ((calculatedMaxWorkgroupSize == 0) || (calculatedMaxWorkgroupSize > static_cast<size_t>(pKernel->maxKernelWorkGroupSize))) {
|
||||
calculatedMaxWorkgroupSize = static_cast<size_t>(pKernel->maxKernelWorkGroupSize);
|
||||
if ((calculatedMaxWorkgroupSize == 0) || (calculatedMaxWorkgroupSize > static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize))) {
|
||||
calculatedMaxWorkgroupSize = static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -259,7 +259,7 @@ HWTEST_F(DispatchWalkerTest, GivenDefaultLwsAlgorithmWhenDispatchingWalkerThenDi
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
|
||||
EXPECT_EQ(dimension, *kernel.workDim);
|
||||
EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -288,7 +288,7 @@ HWTEST_F(DispatchWalkerTest, GivenSquaredLwsAlgorithmWhenDispatchingWalkerThenDi
|
||||
nullptr,
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
EXPECT_EQ(dimension, *kernel.workDim);
|
||||
EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -316,7 +316,7 @@ HWTEST_F(DispatchWalkerTest, GivenNdLwsAlgorithmWhenDispatchingWalkerThenDimensi
|
||||
nullptr,
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
EXPECT_EQ(dimension, *kernel.workDim);
|
||||
EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -345,7 +345,7 @@ HWTEST_F(DispatchWalkerTest, GivenOldLwsAlgorithmWhenDispatchingWalkerThenDimens
|
||||
nullptr,
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
EXPECT_EQ(dimension, *kernel.workDim);
|
||||
EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -375,9 +375,9 @@ HWTEST_F(DispatchWalkerTest, GivenNumWorkGroupsWhenDispatchingWalkerThenNumWorkG
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
|
||||
EXPECT_EQ(2u, *kernel.numWorkGroupsX);
|
||||
EXPECT_EQ(5u, *kernel.numWorkGroupsY);
|
||||
EXPECT_EQ(10u, *kernel.numWorkGroupsZ);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
|
||||
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
|
||||
EXPECT_EQ(10u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
|
||||
}
|
||||
|
||||
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
|
||||
@@ -405,9 +405,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatch
|
||||
nullptr,
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
EXPECT_EQ(2u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(1u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
}
|
||||
|
||||
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThenLwsIsCorrect) {
|
||||
@@ -435,9 +435,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThe
|
||||
nullptr,
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
EXPECT_EQ(2u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(10u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(10u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
}
|
||||
|
||||
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
|
||||
@@ -466,9 +466,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatch
|
||||
nullptr,
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
EXPECT_EQ(2u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(1u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
}
|
||||
|
||||
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffWhenDispatchingWalkerThenLwsIsCorrect) {
|
||||
@@ -497,9 +497,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffW
|
||||
nullptr,
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
EXPECT_EQ(2u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(1u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
}
|
||||
|
||||
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsCorrect) {
|
||||
@@ -526,9 +526,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsC
|
||||
nullptr,
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
EXPECT_EQ(1u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(2u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(3u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
}
|
||||
|
||||
HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLwsIsCorrect) {
|
||||
@@ -558,12 +558,12 @@ HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLw
|
||||
nullptr,
|
||||
nullptr,
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
EXPECT_EQ(1u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(2u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(3u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(1u, *kernel.localWorkSizeX2);
|
||||
EXPECT_EQ(2u, *kernel.localWorkSizeY2);
|
||||
EXPECT_EQ(3u, *kernel.localWorkSizeZ2);
|
||||
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
|
||||
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
|
||||
}
|
||||
|
||||
HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorrect) {
|
||||
@@ -597,16 +597,16 @@ HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorre
|
||||
|
||||
auto dispatchId = 0;
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
auto &kernel = *dispatchInfo.getKernel();
|
||||
auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
|
||||
if (dispatchId == 0) {
|
||||
EXPECT_EQ(1u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(2u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(3u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
}
|
||||
if (dispatchId == 1) {
|
||||
EXPECT_EQ(4u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(6u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
}
|
||||
dispatchId++;
|
||||
}
|
||||
@@ -646,27 +646,27 @@ HWTEST_F(DispatchWalkerTest, GivenSplitWalkerWhenDispatchingWalkerThenLwsIsCorre
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
auto &kernel = *dispatchInfo.getKernel();
|
||||
auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
|
||||
if (&kernel == &mainKernel) {
|
||||
EXPECT_EQ(4u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(6u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(4u, *kernel.localWorkSizeX2);
|
||||
EXPECT_EQ(5u, *kernel.localWorkSizeY2);
|
||||
EXPECT_EQ(6u, *kernel.localWorkSizeZ2);
|
||||
EXPECT_EQ(3u, *kernel.numWorkGroupsX);
|
||||
EXPECT_EQ(2u, *kernel.numWorkGroupsY);
|
||||
EXPECT_EQ(2u, *kernel.numWorkGroupsZ);
|
||||
EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
|
||||
EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
|
||||
EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
|
||||
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
|
||||
} else {
|
||||
EXPECT_EQ(0u, *kernel.localWorkSizeX);
|
||||
EXPECT_EQ(0u, *kernel.localWorkSizeY);
|
||||
EXPECT_EQ(0u, *kernel.localWorkSizeZ);
|
||||
EXPECT_EQ(1u, *kernel.localWorkSizeX2);
|
||||
EXPECT_EQ(2u, *kernel.localWorkSizeY2);
|
||||
EXPECT_EQ(3u, *kernel.localWorkSizeZ2);
|
||||
EXPECT_EQ(0u, *kernel.numWorkGroupsX);
|
||||
EXPECT_EQ(0u, *kernel.numWorkGroupsY);
|
||||
EXPECT_EQ(0u, *kernel.numWorkGroupsZ);
|
||||
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
|
||||
EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
|
||||
EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
|
||||
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
|
||||
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
|
||||
EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -859,8 +859,8 @@ HWTEST_F(DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenWorkDi
|
||||
CL_COMMAND_NDRANGE_KERNEL);
|
||||
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
auto &kernel = *dispatchInfo.getKernel();
|
||||
EXPECT_EQ(*kernel.workDim, dispatchInfo.getDim());
|
||||
auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
|
||||
EXPECT_EQ(*kernel.kernelDeviceInfos[rootDeviceIndex].workDim, dispatchInfo.getDim());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -99,7 +99,7 @@ HWTEST_F(EnqueueDebugKernelTest, givenDebugKernelWhenEnqueuedThenSSHAndBtiAreCor
|
||||
|
||||
mockCmdQ->enqueueKernel(debugKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
|
||||
|
||||
auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(surfaceStates, debugKernel->getBindingTableOffset()));
|
||||
auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(surfaceStates, debugKernel->getBindingTableOffset(rootDeviceIndex)));
|
||||
uint32_t surfaceStateOffset = dstBtiTableBase[0].getSurfaceStatePointer();
|
||||
|
||||
auto debugSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh.getCpuBase(), surfaceStateOffset));
|
||||
|
||||
@@ -1277,9 +1277,9 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreNotAndEventExistSetThenClEnqu
|
||||
TEST_F(EnqueueKernelTest, givenEnqueueCommandThatLwsExceedsDeviceCapabilitiesWhenEnqueueNDRangeKernelIsCalledThenErrorIsReturned) {
|
||||
MockKernelWithInternals mockKernel(*pClDevice);
|
||||
|
||||
mockKernel.mockKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
|
||||
mockKernel.mockKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
|
||||
|
||||
auto maxKernelWorkgroupSize = mockKernel.mockKernel->maxKernelWorkGroupSize;
|
||||
auto maxKernelWorkgroupSize = mockKernel.mockKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
|
||||
size_t globalWorkSize[3] = {maxKernelWorkgroupSize + 1, 1, 1};
|
||||
size_t localWorkSize[3] = {maxKernelWorkgroupSize + 1, 1, 1};
|
||||
|
||||
|
||||
@@ -64,13 +64,13 @@ TEST_F(EnqueueKernelRequiredWorkSize, GivenUnspecifiedWorkGroupSizeWhenEnqeueing
|
||||
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
EXPECT_EQ(*pKernel->localWorkSizeX, 8u);
|
||||
EXPECT_EQ(*pKernel->localWorkSizeY, 4u);
|
||||
EXPECT_EQ(*pKernel->localWorkSizeZ, 4u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX, 8u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY, 4u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ, 4u);
|
||||
|
||||
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeX, 8u);
|
||||
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeY, 4u);
|
||||
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeZ, 4u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX, 8u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY, 4u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ, 4u);
|
||||
}
|
||||
|
||||
// Fully specified
|
||||
@@ -91,13 +91,13 @@ TEST_F(EnqueueKernelRequiredWorkSize, GivenRequiredWorkGroupSizeWhenEnqeueingKer
|
||||
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeX, 8u);
|
||||
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeY, 4u);
|
||||
EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeZ, 4u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX, 8u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY, 4u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ, 4u);
|
||||
|
||||
EXPECT_EQ(*pKernel->localWorkSizeX, 8u);
|
||||
EXPECT_EQ(*pKernel->localWorkSizeY, 4u);
|
||||
EXPECT_EQ(*pKernel->localWorkSizeZ, 4u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX, 8u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY, 4u);
|
||||
EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ, 4u);
|
||||
}
|
||||
|
||||
// Underspecified. Won't permit.
|
||||
|
||||
@@ -680,8 +680,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
|
||||
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
|
||||
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
EXPECT_TRUE(containsHint(expectedHint, userData));
|
||||
}
|
||||
|
||||
@@ -692,8 +695,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
|
||||
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
|
||||
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
EXPECT_TRUE(containsHint(expectedHint, userData));
|
||||
DebugManager.flags.EnableComputeWorkSizeND.set(isWorkGroupSizeEnabled);
|
||||
}
|
||||
@@ -705,8 +711,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
|
||||
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
|
||||
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
EXPECT_TRUE(containsHint(expectedHint, userData));
|
||||
DebugManager.flags.EnableComputeWorkSizeND.set(isWorkGroupSizeEnabled);
|
||||
}
|
||||
@@ -716,8 +725,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
|
||||
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
|
||||
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
EXPECT_TRUE(containsHint(expectedHint, userData));
|
||||
}
|
||||
|
||||
@@ -729,8 +741,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
|
||||
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
|
||||
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
EXPECT_TRUE(containsHint(expectedHint, userData));
|
||||
}
|
||||
|
||||
@@ -742,8 +757,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
|
||||
retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
|
||||
snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
|
||||
kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
|
||||
*kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
EXPECT_TRUE(containsHint(expectedHint, userData));
|
||||
}
|
||||
|
||||
|
||||
@@ -241,7 +241,7 @@ struct PerformanceHintEnqueueKernelTest : public PerformanceHintEnqueueTest,
|
||||
ProgramFixture::TearDown();
|
||||
PerformanceHintEnqueueTest::TearDown();
|
||||
}
|
||||
Kernel *kernel = nullptr;
|
||||
MockKernel *kernel = nullptr;
|
||||
uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
|
||||
size_t globalWorkGroupSize[3]{};
|
||||
};
|
||||
|
||||
@@ -328,7 +328,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
|
||||
|
||||
Kernel *blockKernel = Kernel::create(pKernel->getProgram(), MockKernel::toKernelInfoContainer(*pBlockInfo, rootDeviceIndex), nullptr);
|
||||
blockSSH = alignUp(blockSSH, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
|
||||
if (blockKernel->getNumberOfBindingTableStates() > 0) {
|
||||
if (blockKernel->getNumberOfBindingTableStates(rootDeviceIndex) > 0) {
|
||||
ASSERT_NE(nullptr, pBlockInfo->patchInfo.bindingTableState);
|
||||
auto dstBlockBti = ptrOffset(blockSSH, pBlockInfo->patchInfo.bindingTableState->Offset);
|
||||
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(dstBlockBti) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE);
|
||||
@@ -336,7 +336,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
|
||||
|
||||
auto srcBlockBti = ptrOffset(pBlockInfo->heapInfo.pSsh, pBlockInfo->patchInfo.bindingTableState->Offset);
|
||||
auto srcBindingTable = reinterpret_cast<const BINDING_TABLE_STATE *>(srcBlockBti);
|
||||
for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(); ++i) {
|
||||
for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(rootDeviceIndex); ++i) {
|
||||
uint32_t dstSurfaceStatePointer = dstBindingTable[i].getSurfaceStatePointer();
|
||||
uint32_t srcSurfaceStatePointer = srcBindingTable[i].getSurfaceStatePointer();
|
||||
auto *dstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh->getCpuBase(), dstSurfaceStatePointer));
|
||||
|
||||
@@ -166,7 +166,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue
|
||||
|
||||
size_t sshUsed = blockedCommandsData->ssh->getUsed();
|
||||
|
||||
size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates() * sizeof(RENDER_SURFACE_STATE) +
|
||||
size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates(rootDeviceIndex) * sizeof(RENDER_SURFACE_STATE) +
|
||||
pKernel->getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE) +
|
||||
UnitTestHelper<FamilyType>::getDefaultSshUsage();
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
|
||||
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
|
||||
|
||||
DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
|
||||
SchedulerKernel &scheduler = context->getSchedulerKernel();
|
||||
auto &scheduler = static_cast<MockSchedulerKernel &>(context->getSchedulerKernel());
|
||||
|
||||
auto *executionModelDshAllocation = pDevQueueHw->getDshBuffer();
|
||||
auto *dshHeap = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
|
||||
@@ -70,27 +70,27 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
|
||||
pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
|
||||
false);
|
||||
|
||||
EXPECT_EQ(0u, *scheduler.globalWorkOffsetX);
|
||||
EXPECT_EQ(0u, *scheduler.globalWorkOffsetY);
|
||||
EXPECT_EQ(0u, *scheduler.globalWorkOffsetZ);
|
||||
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX);
|
||||
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
|
||||
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ);
|
||||
|
||||
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX);
|
||||
EXPECT_EQ(1u, *scheduler.localWorkSizeY);
|
||||
EXPECT_EQ(1u, *scheduler.localWorkSizeZ);
|
||||
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
|
||||
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX2);
|
||||
EXPECT_EQ(1u, *scheduler.localWorkSizeY2);
|
||||
EXPECT_EQ(1u, *scheduler.localWorkSizeZ2);
|
||||
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
|
||||
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
|
||||
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
|
||||
|
||||
if (scheduler.enqueuedLocalWorkSizeX != &Kernel::dummyPatchLocation) {
|
||||
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.enqueuedLocalWorkSizeX);
|
||||
if (scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX != &Kernel::dummyPatchLocation) {
|
||||
EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
|
||||
}
|
||||
EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeY);
|
||||
EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeZ);
|
||||
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY);
|
||||
EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ);
|
||||
|
||||
EXPECT_EQ((uint32_t)(scheduler.getGws() / scheduler.getLws()), *scheduler.numWorkGroupsX);
|
||||
EXPECT_EQ(0u, *scheduler.numWorkGroupsY);
|
||||
EXPECT_EQ(0u, *scheduler.numWorkGroupsZ);
|
||||
EXPECT_EQ((uint32_t)(scheduler.getGws() / scheduler.getLws()), *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
|
||||
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
|
||||
EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
|
||||
|
||||
HardwareParse hwParser;
|
||||
hwParser.parseCommands<FamilyType>(commandStream, 0);
|
||||
|
||||
@@ -97,7 +97,7 @@ struct HelloWorldKernelFixture : public ProgramFixture {
|
||||
std::string *pKernelName = nullptr;
|
||||
cl_uint simd = 32;
|
||||
cl_int retVal = CL_SUCCESS;
|
||||
Kernel *pKernel = nullptr;
|
||||
MockKernel *pKernel = nullptr;
|
||||
MockContext *pContext = nullptr;
|
||||
};
|
||||
} // namespace NEO
|
||||
|
||||
@@ -2225,11 +2225,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
|
||||
Kernel *pKernel = castToObject<Kernel>(kernel);
|
||||
ASSERT_NE(nullptr, pKernel);
|
||||
|
||||
size_t numBTS1 = pKernel->getNumberOfBindingTableStates();
|
||||
size_t numBTS1 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
|
||||
EXPECT_EQ(2u, numBTS1);
|
||||
size_t sizeSurfaceStates1 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
|
||||
EXPECT_NE(0u, sizeSurfaceStates1);
|
||||
size_t offsetBTS1 = pKernel->getBindingTableOffset();
|
||||
size_t offsetBTS1 = pKernel->getBindingTableOffset(rootDeviceIndex);
|
||||
EXPECT_NE(0u, offsetBTS1);
|
||||
|
||||
GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().platform.eRenderCoreFamily;
|
||||
@@ -2241,11 +2241,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
|
||||
bool surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
|
||||
EXPECT_TRUE(surfaceAdded);
|
||||
|
||||
size_t numBTS2 = pKernel->getNumberOfBindingTableStates();
|
||||
size_t numBTS2 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
|
||||
EXPECT_EQ(numBTS1 + 1, numBTS2);
|
||||
size_t sizeSurfaceStates2 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
|
||||
EXPECT_GT(sizeSurfaceStates2, sizeSurfaceStates1);
|
||||
size_t offsetBTS2 = pKernel->getBindingTableOffset();
|
||||
size_t offsetBTS2 = pKernel->getBindingTableOffset(rootDeviceIndex);
|
||||
EXPECT_GT(offsetBTS2, offsetBTS1);
|
||||
|
||||
void *pSS2 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
|
||||
@@ -2261,11 +2261,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
|
||||
surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
|
||||
EXPECT_FALSE(surfaceAdded);
|
||||
|
||||
size_t numBTS3 = pKernel->getNumberOfBindingTableStates();
|
||||
size_t numBTS3 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
|
||||
EXPECT_EQ(0u, numBTS3);
|
||||
size_t sizeSurfaceStates3 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
|
||||
EXPECT_EQ(0u, sizeSurfaceStates3);
|
||||
size_t offsetBTS3 = pKernel->getBindingTableOffset();
|
||||
size_t offsetBTS3 = pKernel->getBindingTableOffset(rootDeviceIndex);
|
||||
EXPECT_EQ(0u, offsetBTS3);
|
||||
void *pSS3 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
|
||||
EXPECT_EQ(nullptr, pSS3);
|
||||
|
||||
@@ -385,7 +385,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
|
||||
*pWalkerCmd = FamilyType::cmdInitGpgpuWalker;
|
||||
|
||||
auto expectedBindingTableCount = 3u;
|
||||
mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
|
||||
mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
|
||||
|
||||
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
|
||||
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
|
||||
@@ -431,7 +431,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
|
||||
*pWalkerCmd = FamilyType::cmdInitGpgpuWalker;
|
||||
|
||||
auto expectedBindingTableCount = 3u;
|
||||
mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
|
||||
mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
|
||||
auto isScheduler = const_cast<bool *>(&mockKernelWithInternal->mockKernel->isSchedulerKernel);
|
||||
*isScheduler = true;
|
||||
|
||||
@@ -475,7 +475,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
|
||||
*pWalkerCmd = FamilyType::cmdInitGpgpuWalker;
|
||||
|
||||
auto expectedBindingTableCount = 100u;
|
||||
mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
|
||||
mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
|
||||
|
||||
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
|
||||
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
|
||||
@@ -802,7 +802,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
|
||||
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
|
||||
|
||||
// Initialize binding table state pointers with pattern
|
||||
EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());
|
||||
EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates(rootDeviceIndex));
|
||||
|
||||
const size_t localWorkSizes[3]{256, 1, 1};
|
||||
|
||||
@@ -890,7 +890,7 @@ HWTEST_F(HardwareCommandsTest, GivenBuffersNotRequiringSshWhenSettingBindingTabl
|
||||
auto usedBefore = ssh.getUsed();
|
||||
|
||||
// Initialize binding table state pointers with pattern
|
||||
auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
|
||||
auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
|
||||
EXPECT_EQ(0u, numSurfaceStates);
|
||||
|
||||
// set binding table states
|
||||
@@ -933,7 +933,7 @@ HWTEST_F(HardwareCommandsTest, GivenZeroSurfaceStatesWhenSettingBindingTableStat
|
||||
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
|
||||
|
||||
// Initialize binding table state pointers with pattern
|
||||
auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
|
||||
auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
|
||||
EXPECT_EQ(0u, numSurfaceStates);
|
||||
|
||||
auto dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
|
||||
|
||||
@@ -47,6 +47,6 @@ struct HardwareCommandsTest : ClDeviceFixture,
|
||||
size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
|
||||
return EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState != nullptr) ? srcKernel.getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState->Count : 0,
|
||||
srcKernel.getSurfaceStateHeap(rootDeviceIndex), srcKernel.getSurfaceStateHeapSize(rootDeviceIndex),
|
||||
srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
|
||||
srcKernel.getNumberOfBindingTableStates(rootDeviceIndex), srcKernel.getBindingTableOffset(rootDeviceIndex));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -65,7 +65,7 @@ class KernelTests : public ProgramFromBinaryFixture {
|
||||
ASSERT_EQ(CL_SUCCESS, retVal);
|
||||
|
||||
// create a kernel
|
||||
pKernel = Kernel::create(
|
||||
pKernel = Kernel::create<MockKernel>(
|
||||
pProgram,
|
||||
pProgram->getKernelInfosForKernel(kernelName),
|
||||
&retVal);
|
||||
@@ -81,7 +81,7 @@ class KernelTests : public ProgramFromBinaryFixture {
|
||||
ProgramFromBinaryFixture::TearDown();
|
||||
}
|
||||
|
||||
Kernel *pKernel = nullptr;
|
||||
MockKernel *pKernel = nullptr;
|
||||
cl_int retVal = CL_SUCCESS;
|
||||
};
|
||||
|
||||
@@ -278,7 +278,7 @@ TEST_F(KernelTests, GivenKernelWorkGroupSizeWhenGettingWorkGroupInfoThenWorkGrou
|
||||
size_t paramValueSizeRet = 0;
|
||||
|
||||
auto kernelMaxWorkGroupSize = pDevice->getDeviceInfo().maxWorkGroupSize - 1;
|
||||
pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);
|
||||
pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);
|
||||
|
||||
retVal = pKernel->getWorkGroupInfo(
|
||||
*pClDevice,
|
||||
@@ -2305,10 +2305,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkOffsetIsCorr
|
||||
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetX);
|
||||
EXPECT_NE(nullptr, kernel.globalWorkOffsetY);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetY);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetZ);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSizeIsCorrect) {
|
||||
@@ -2318,10 +2318,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSizeIsCorrect
|
||||
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_NE(nullptr, kernel.localWorkSizeX);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.localWorkSizeX);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeY);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeZ);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSize2IsCorrect) {
|
||||
@@ -2331,10 +2331,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSize2IsCorrec
|
||||
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeX2);
|
||||
EXPECT_NE(nullptr, kernel.localWorkSizeY2);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.localWorkSizeY2);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeZ2);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkSizeIsCorrect) {
|
||||
@@ -2344,10 +2344,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkSizeIsCorrec
|
||||
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkSizeX);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkSizeY);
|
||||
EXPECT_NE(nullptr, kernel.globalWorkSizeZ);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.globalWorkSizeZ);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkDimIsCorrect) {
|
||||
@@ -2357,8 +2357,8 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkDimIsCorrect)
|
||||
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_NE(nullptr, kernel.workDim);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.workDim);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenNumWorkGroupsIsCorrect) {
|
||||
@@ -2370,12 +2370,12 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenNumWorkGroupsIsCorrect
|
||||
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_NE(nullptr, kernel.numWorkGroupsX);
|
||||
EXPECT_NE(nullptr, kernel.numWorkGroupsY);
|
||||
EXPECT_NE(nullptr, kernel.numWorkGroupsZ);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsX);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsY);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsZ);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedLocalWorkSizeIsCorrect) {
|
||||
@@ -2385,10 +2385,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedLocalWorkSizeI
|
||||
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_NE(nullptr, kernel.enqueuedLocalWorkSizeX);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeX);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeY);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeZ);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY);
|
||||
EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedMaxWorkGroupSizeIsCorrect) {
|
||||
@@ -2398,11 +2398,11 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedMaxWorkGroupSi
|
||||
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_NE(nullptr, kernel.maxWorkGroupSizeForCrossThreadData);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.maxWorkGroupSizeForCrossThreadData);
|
||||
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.maxWorkGroupSizeOffset), static_cast<void *>(kernel.maxWorkGroupSizeForCrossThreadData));
|
||||
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.maxWorkGroupSizeForCrossThreadData);
|
||||
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.maxKernelWorkGroupSize);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
|
||||
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.maxWorkGroupSizeOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData));
|
||||
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
|
||||
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenDataParameterSimdSizeIsCorrect) {
|
||||
@@ -2414,10 +2414,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenDataParameterSimdSizeI
|
||||
executionEnvironment.CompiledSIMD8 = true;
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_NE(nullptr, kernel.dataParameterSimdSize);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.dataParameterSimdSize);
|
||||
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.simdSizeOffset), static_cast<void *>(kernel.dataParameterSimdSize));
|
||||
EXPECT_EQ_VAL(pKernelInfo->getMaxSimdSize(), *kernel.dataParameterSimdSize);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
|
||||
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.simdSizeOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize));
|
||||
EXPECT_EQ_VAL(pKernelInfo->getMaxSimdSize(), *kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, GivenParentEventOffsetWhenKernelIsInitializedThenParentEventIsInitiatedWithInvalid) {
|
||||
@@ -2425,10 +2425,10 @@ TEST_F(KernelCrossThreadTests, GivenParentEventOffsetWhenKernelIsInitializedThen
|
||||
MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
EXPECT_NE(nullptr, kernel.parentEventOffset);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.parentEventOffset);
|
||||
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.parentEventOffset), static_cast<void *>(kernel.parentEventOffset));
|
||||
EXPECT_EQ(WorkloadInfo::invalidParentEvent, *kernel.parentEventOffset);
|
||||
EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
|
||||
EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
|
||||
EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.parentEventOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset));
|
||||
EXPECT_EQ(WorkloadInfo::invalidParentEvent, *kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
|
||||
}
|
||||
|
||||
TEST_F(KernelCrossThreadTests, WhenAddingKernelThenProgramRefCountIsIncremented) {
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "opencl/source/sharings/sharing.h"
|
||||
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_cl_device.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_kernel.h"
|
||||
|
||||
#include "d3d_sharing_functions.h"
|
||||
|
||||
@@ -128,6 +129,52 @@ void MockContext::initializeWithDevices(const ClDeviceVector &devices, bool noSp
|
||||
setupContextType();
|
||||
}
|
||||
|
||||
SchedulerKernel &MockContext::getSchedulerKernel() {
|
||||
if (schedulerBuiltIn->pKernel) {
|
||||
return *static_cast<SchedulerKernel *>(schedulerBuiltIn->pKernel);
|
||||
}
|
||||
|
||||
auto initializeSchedulerProgramAndKernel = [&] {
|
||||
cl_int retVal = CL_SUCCESS;
|
||||
auto clDevice = getDevice(0);
|
||||
auto src = SchedulerKernel::loadSchedulerKernel(&clDevice->getDevice());
|
||||
|
||||
auto program = Program::createBuiltInFromGenBinary(this,
|
||||
devices,
|
||||
src.resource.data(),
|
||||
src.resource.size(),
|
||||
&retVal);
|
||||
DEBUG_BREAK_IF(retVal != CL_SUCCESS);
|
||||
DEBUG_BREAK_IF(!program);
|
||||
|
||||
retVal = program->processGenBinary(*clDevice);
|
||||
DEBUG_BREAK_IF(retVal != CL_SUCCESS);
|
||||
|
||||
schedulerBuiltIn->pProgram = program;
|
||||
|
||||
KernelInfoContainer kernelInfos;
|
||||
kernelInfos.resize(getMaxRootDeviceIndex() + 1);
|
||||
for (auto rootDeviceIndex : rootDeviceIndices) {
|
||||
auto kernelInfo = schedulerBuiltIn->pProgram->getKernelInfo(SchedulerKernel::schedulerName, rootDeviceIndex);
|
||||
DEBUG_BREAK_IF(!kernelInfo);
|
||||
kernelInfos[rootDeviceIndex] = kernelInfo;
|
||||
}
|
||||
|
||||
schedulerBuiltIn->pKernel = Kernel::create<MockSchedulerKernel>(
|
||||
schedulerBuiltIn->pProgram,
|
||||
kernelInfos,
|
||||
&retVal);
|
||||
|
||||
UNRECOVERABLE_IF(schedulerBuiltIn->pKernel->getScratchSize(clDevice->getRootDeviceIndex()) != 0);
|
||||
|
||||
DEBUG_BREAK_IF(retVal != CL_SUCCESS);
|
||||
};
|
||||
std::call_once(schedulerBuiltIn->programIsInitialized, initializeSchedulerProgramAndKernel);
|
||||
|
||||
UNRECOVERABLE_IF(schedulerBuiltIn->pKernel == nullptr);
|
||||
return *static_cast<SchedulerKernel *>(schedulerBuiltIn->pKernel);
|
||||
}
|
||||
|
||||
MockDefaultContext::MockDefaultContext() : MockContext(nullptr, nullptr) {
|
||||
pRootDevice0 = ultClDeviceFactory.rootDevices[0];
|
||||
pRootDevice1 = ultClDeviceFactory.rootDevices[1];
|
||||
|
||||
@@ -47,6 +47,8 @@ class MockContext : public Context {
|
||||
std::unique_ptr<AsyncEventsHandler> &getAsyncEventsHandlerUniquePtr();
|
||||
void initializeWithDevices(const ClDeviceVector &devices, bool noSpecialQueue);
|
||||
|
||||
SchedulerKernel &getSchedulerKernel() override;
|
||||
|
||||
private:
|
||||
ClDevice *pDevice = nullptr;
|
||||
};
|
||||
|
||||
@@ -40,7 +40,6 @@ class MockKernel : public Kernel {
|
||||
using Kernel::kernelDeviceInfos;
|
||||
using Kernel::kernelSvmGfxAllocations;
|
||||
using Kernel::kernelUnifiedMemoryGfxAllocations;
|
||||
using Kernel::numberOfBindingTableStates;
|
||||
using Kernel::patchBufferOffset;
|
||||
using Kernel::patchWithImplicitSurface;
|
||||
using Kernel::svmAllocationsRequireCacheFlush;
|
||||
@@ -595,6 +594,7 @@ class MockParentKernel : public Kernel {
|
||||
|
||||
class MockSchedulerKernel : public SchedulerKernel {
|
||||
public:
|
||||
using SchedulerKernel::kernelDeviceInfos;
|
||||
MockSchedulerKernel(Program *programArg, const KernelInfoContainer &kernelInfoArg) : SchedulerKernel(programArg, kernelInfoArg){};
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user