OpenCL Queue Families extension 1/n

Basic implementation, some things will be tweaked in future commits

Related-To: NEO-5120
Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
This commit is contained in:
Maciej Dziuban
2020-11-16 11:43:03 +00:00
committed by Compute-Runtime-Automation
parent 2be1b36422
commit 14f92cc7a1
16 changed files with 621 additions and 109 deletions

View File

@@ -4882,6 +4882,8 @@ cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties(cl_context conte
tokenValue != CL_QUEUE_PRIORITY_KHR &&
tokenValue != CL_QUEUE_THROTTLE_KHR &&
tokenValue != CL_QUEUE_SLICE_COUNT_INTEL &&
tokenValue != CL_QUEUE_FAMILY_INTEL &&
tokenValue != CL_QUEUE_INDEX_INTEL &&
!isExtraToken(propertiesAddress)) {
err.set(CL_INVALID_VALUE);
TRACING_EXIT(clCreateCommandQueueWithProperties, &commandQueue);
@@ -4956,6 +4958,23 @@ cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties(cl_context conte
return commandQueue;
}
bool queueFamilySelected = false;
bool queueSelected = false;
const auto queueFamilyIndex = getCmdQueueProperties<cl_uint>(properties, CL_QUEUE_FAMILY_INTEL, &queueFamilySelected);
const auto queueIndex = getCmdQueueProperties<cl_uint>(properties, CL_QUEUE_INDEX_INTEL, &queueSelected);
if (queueFamilySelected != queueSelected) {
err.set(CL_INVALID_QUEUE_PROPERTIES);
TRACING_EXIT(clCreateCommandQueueWithProperties, &commandQueue);
return commandQueue;
}
if (queueFamilySelected &&
(queueFamilyIndex >= pDevice->getDeviceInfo().queueFamilyProperties.size() ||
queueIndex >= pDevice->getDeviceInfo().queueFamilyProperties[queueFamilyIndex].count)) {
err.set(CL_INVALID_QUEUE_PROPERTIES);
TRACING_EXIT(clCreateCommandQueueWithProperties, &commandQueue);
return commandQueue;
}
auto maskedFlags = commandQueueProperties & minimumCreateDeviceQueueFlags;
if (maskedFlags == minimumCreateDeviceQueueFlags) {

View File

@@ -360,6 +360,26 @@ void ClDevice::initializeCaps() {
}
}
const std::vector<std::vector<EngineControl>> &queueFamilies = this->getDevice().getEngineGroups();
if (queueFamilies.size() > 0) {
for (int queueFamilyIndex = 0; queueFamilyIndex < static_cast<int>(EngineGroupType::MaxEngineGroups); queueFamilyIndex++) {
const std::vector<EngineControl> &enginesInFamily = queueFamilies.at(queueFamilyIndex);
if (enginesInFamily.size() > 0) {
cl_queue_family_properties_intel properties;
properties.capabilities = CL_QUEUE_CAPABILITY_ALL_INTEL;
properties.count = static_cast<cl_uint>(enginesInFamily.size());
properties.properties = deviceInfo.queueOnHostProperties;
deviceInfo.queueFamilyProperties.push_back(properties);
}
}
} else {
cl_queue_family_properties_intel properties;
properties.capabilities = CL_QUEUE_CAPABILITY_ALL_INTEL;
properties.count = 1;
properties.properties = deviceInfo.queueOnHostProperties;
deviceInfo.queueFamilyProperties.push_back(properties);
}
deviceInfo.preemptionSupported = false;
deviceInfo.maxGlobalVariableSize = ocl21FeaturesEnabled ? 64 * KB : 0;
deviceInfo.globalVariablePreferredTotalSize = ocl21FeaturesEnabled ? static_cast<size_t>(sharedDeviceInfo.maxMemAllocSize) : 0;

View File

@@ -188,12 +188,21 @@ cl_int ClDevice::getDeviceInfo(cl_device_info paramName,
src = &param;
}
break;
case CL_DEVICE_NUM_QUEUE_FAMILIES_INTEL:
srcSize = retSize = sizeof(cl_uint);
param = static_cast<cl_uint>(deviceInfo.queueFamilyProperties.size());
src = &param;
break;
case CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL:
if (simultaneousInterops.size() > 1u) {
srcSize = retSize = sizeof(cl_uint) * simultaneousInterops.size();
src = &simultaneousInterops[0];
}
break;
case CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL:
src = deviceInfo.queueFamilyProperties.data();
retSize = srcSize = deviceInfo.queueFamilyProperties.size() * sizeof(cl_queue_family_properties_intel);
break;
case CL_DEVICE_REFERENCE_COUNT: {
cl_int ref = this->getReference();
DEBUG_BREAK_IF(ref != 1 && !deviceInfo.parentDevice);

View File

@@ -11,6 +11,8 @@
#include "opencl/extensions/public/cl_ext_private.h"
#include "engine_group_types.h"
#include <vector>
namespace NEO {
@@ -19,116 +21,117 @@ using OpenClCFeaturesContainer = StackVec<cl_name_version, 15>;
// clang-format off
// OpenCL-level device description: a plain aggregate of capability and
// property values. In this change set it is filled in by
// ClDevice::initializeCaps() and read back by ClDevice::getDeviceInfo().
//
// Every member is declared exactly once. The surrounding
// `clang-format off` / `clang-format on` markers keep the formatter from
// reflowing the member list.
struct ClDeviceInfo {
    cl_name_version ilsWithVersion[1];
    StackVec<cl_name_version, 3> builtInKernelsWithVersion;
    StackVec<cl_name_version, 5> openclCAllVersions;
    OpenClCFeaturesContainer openclCFeatures;
    std::vector<cl_name_version> extensionsWithVersion;
    cl_device_type deviceType;
    size_t maxSliceCount;
    size_t image3DMaxWidth;
    size_t image3DMaxHeight;
    size_t maxBufferSize;
    size_t maxArraySize;
    cl_device_fp_config singleFpConfig;
    cl_device_fp_config halfFpConfig;
    cl_device_fp_config doubleFpConfig;
    cl_ulong globalMemCacheSize;
    cl_ulong maxConstantBufferSize;
    size_t maxGlobalVariableSize;
    size_t globalVariablePreferredTotalSize;
    size_t preferredWorkGroupSizeMultiple;
    cl_device_exec_capabilities executionCapabilities;
    cl_command_queue_properties queueOnHostProperties;
    cl_command_queue_properties queueOnDeviceProperties;
    const char *builtInKernels;
    cl_platform_id platform;
    const char *name;
    const char *vendor;
    const char *driverVersion;
    const char *profile;
    const char *clVersion;
    const char *clCVersion;
    const char *spirVersions;
    const char *deviceExtensions;
    const char *latestConformanceVersionPassed;
    cl_device_id parentDevice;
    cl_device_affinity_domain partitionAffinityDomain;
    cl_uint partitionMaxSubDevices;
    cl_device_partition_property partitionProperties[2];
    cl_device_partition_property partitionType[3];
    cl_device_svm_capabilities svmCapabilities;
    // One entry per queue family exposed through
    // CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL; the number of entries is what
    // CL_DEVICE_NUM_QUEUE_FAMILIES_INTEL reports.
    StackVec<cl_queue_family_properties_intel, static_cast<size_t>(EngineGroupType::MaxEngineGroups)> queueFamilyProperties;
    double platformHostTimerResolution;
    size_t planarYuvMaxWidth;
    size_t planarYuvMaxHeight;
    cl_version numericClVersion;
    cl_uint maxComputUnits;
    cl_uint maxWorkItemDimensions;
    cl_uint maxNumOfSubGroups;
    cl_bool independentForwardProgress;
    cl_device_atomic_capabilities atomicMemoryCapabilities;
    cl_device_atomic_capabilities atomicFenceCapabilities;
    cl_bool nonUniformWorkGroupSupport;
    cl_bool workGroupCollectiveFunctionsSupport;
    cl_bool genericAddressSpaceSupport;
    cl_device_device_enqueue_capabilities deviceEnqueueSupport;
    cl_bool pipeSupport;
    cl_uint preferredVectorWidthChar;
    cl_uint preferredVectorWidthShort;
    cl_uint preferredVectorWidthInt;
    cl_uint preferredVectorWidthLong;
    cl_uint preferredVectorWidthFloat;
    cl_uint preferredVectorWidthDouble;
    cl_uint preferredVectorWidthHalf;
    cl_uint nativeVectorWidthChar;
    cl_uint nativeVectorWidthShort;
    cl_uint nativeVectorWidthInt;
    cl_uint nativeVectorWidthLong;
    cl_uint nativeVectorWidthFloat;
    cl_uint nativeVectorWidthDouble;
    cl_uint nativeVectorWidthHalf;
    cl_uint maxReadWriteImageArgs;
    cl_uint imagePitchAlignment;
    cl_uint imageBaseAddressAlignment;
    cl_uint maxPipeArgs;
    cl_uint pipeMaxActiveReservations;
    cl_uint pipeMaxPacketSize;
    cl_uint memBaseAddressAlign;
    cl_uint minDataTypeAlignSize;
    cl_device_mem_cache_type globalMemCacheType;
    cl_uint maxConstantArgs;
    cl_device_local_mem_type localMemType;
    cl_bool endianLittle;
    cl_bool deviceAvailable;
    cl_bool compilerAvailable;
    cl_bool linkerAvailable;
    cl_uint queueOnDevicePreferredSize;
    cl_uint queueOnDeviceMaxSize;
    cl_uint maxOnDeviceQueues;
    cl_uint maxOnDeviceEvents;
    cl_bool preferredInteropUserSync;
    cl_uint referenceCount;
    cl_uint preferredPlatformAtomicAlignment;
    cl_uint preferredGlobalAtomicAlignment;
    cl_uint preferredLocalAtomicAlignment;
    cl_bool hostUnifiedMemory;
    cl_bool vmeAvcSupportsTextureSampler;
    cl_uint vmeAvcVersion;
    cl_uint vmeVersion;
    cl_uint internalDriverVersion;
    cl_uint grfSize;
    bool preemptionSupported;
    /* Extensions supported */
    bool nv12Extension;
    bool vmeExtension;
    bool platformLP;
    bool packedYuvExtension;
    /* Unified Shared Memory Capabilities */
    cl_unified_shared_memory_capabilities_intel hostMemCapabilities;
    cl_unified_shared_memory_capabilities_intel deviceMemCapabilities;
    cl_unified_shared_memory_capabilities_intel singleDeviceSharedMemCapabilities;
    cl_unified_shared_memory_capabilities_intel crossDeviceSharedMemCapabilities;
    cl_unified_shared_memory_capabilities_intel sharedSystemMemCapabilities;
};
// clang-format on

View File

@@ -37,6 +37,7 @@
#include "CL/cl_ext.h"
#include <limits>
#include <map>
namespace NEO {
@@ -705,9 +706,46 @@ void CommandQueue::storeProperties(const cl_queue_properties *properties) {
}
void CommandQueue::processProperties(const cl_queue_properties *properties) {
if (properties != nullptr) {
bool specificEngineSelected = false;
cl_uint selectedQueueFamilyIndex = std::numeric_limits<uint32_t>::max();
cl_uint selectedQueueIndex = std::numeric_limits<uint32_t>::max();
for (auto currentProperties = properties; *currentProperties != 0; currentProperties += 2) {
switch (*currentProperties) {
case CL_QUEUE_FAMILY_INTEL:
selectedQueueFamilyIndex = static_cast<cl_uint>(*(currentProperties + 1));
specificEngineSelected = true;
break;
case CL_QUEUE_INDEX_INTEL:
selectedQueueIndex = static_cast<cl_uint>(*(currentProperties + 1));
specificEngineSelected = true;
break;
}
}
if (specificEngineSelected) {
if (getDevice().getNumAvailableDevices() == 1) {
auto queueFamily = getDevice().getNonEmptyEngineGroup(selectedQueueFamilyIndex);
auto engine = queueFamily->at(selectedQueueIndex);
auto engineType = engine.getEngineType();
this->overrideEngine(engineType);
}
}
}
processPropertiesExtra(properties);
}
// Re-targets this queue at the device engine of the given type.
// For ENGINE_BCS the engine is stored as the queue's BCS engine, a fresh
// timestamp packet container is created and the queue is flagged copy-only.
// Any other engine type replaces the GPGPU engine and touches nothing else.
void CommandQueue::overrideEngine(aub_stream::EngineType engineType) {
    const bool isBcs = (engineType == aub_stream::EngineType::ENGINE_BCS);

    if (!isBcs) {
        gpgpuEngine = &device->getEngine(engineType, false, false);
        return;
    }

    bcsEngine = &device->getEngine(engineType, false, false);
    timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
    isCopyOnly = true;
}
void CommandQueue::aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo) {
if (DebugManager.flags.AUBDumpSubCaptureMode.get()) {
auto status = getGpgpuCommandStreamReceiver().checkAndActivateAubSubCapture(multiDispatchInfo);

View File

@@ -340,6 +340,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
void storeProperties(const cl_queue_properties *properties);
void processProperties(const cl_queue_properties *properties);
void processPropertiesExtra(const cl_queue_properties *properties);
void overrideEngine(aub_stream::EngineType engineType);
bool bufferCpuCopyAllowed(Buffer *buffer, cl_command_type commandType, cl_bool blocking, size_t size, void *ptr,
cl_uint numEventsInWaitList, const cl_event *eventWaitList);
void providePerformanceHint(TransferProperties &transferProperties);

View File

@@ -136,16 +136,23 @@ void getQueueInfo(cl_command_queue commandQueue,
// Looks up one property in a zero-terminated list of (name, value) pairs.
//
// properties   - pointer to the first name of the list, or nullptr.
// propertyName - name to search for; defaults to CL_QUEUE_PROPERTIES.
// foundValue   - optional out-parameter; when non-null it is set to true if
//                the name was found and to false otherwise, which lets the
//                caller tell "present with value 0" apart from "absent".
//
// Returns the value stored next to the first matching name, converted to
// returnType, or 0 when the list is null or does not contain the name.
template <typename returnType>
returnType getCmdQueueProperties(const cl_queue_properties *properties,
                                 cl_queue_properties propertyName = CL_QUEUE_PROPERTIES,
                                 bool *foundValue = nullptr) {
    if (properties != nullptr) {
        while (*properties != 0) {
            if (*properties == propertyName) {
                if (foundValue) {
                    *foundValue = true;
                }
                return static_cast<returnType>(*(properties + 1));
            }
            // Skip over the value to reach the next name.
            properties += 2;
        }
    }
    if (foundValue) {
        *foundValue = false;
    }
    return 0;
}
bool isExtraToken(const cl_queue_properties *property);