mirror of https://github.com/intel/compute-runtime.git
Expose copy engines in parent device with implicit scaling

When using implicit scaling, expose the copy engines from sub-device 0 in the root device. This is to facilitate the programming models of the layers above.

Related-To: NEO-6815
Signed-off-by: Jaime Arteaga <jaime.a.arteaga.molina@intel.com>
Committed by: Compute-Runtime-Automation
Parent: fccda460d6
Commit: 6154fb8ea7
@@ -200,14 +200,59 @@ ze_result_t DeviceImp::createCommandQueue(const ze_command_queue_desc_t *desc,
     return returnValue;
 }
 
+uint32_t DeviceImp::getCopyQueueGroupsFromSubDevice(uint32_t numberOfSubDeviceCopyEngineGroupsRequested,
+                                                    ze_command_queue_group_properties_t *pCommandQueueGroupProperties) {
+    NEO::Device *activeDevice = getActiveDevice();
+
+    if (this->isImplicitScalingCapable() == false) {
+        return 0u;
+    }
+
+    uint32_t numCopyEngineGroupsInSubDevice = 0;
+    NEO::Device *activeSubDevice = activeDevice->getSubDevice(0u);
+    auto &subDeviceEngineGroups = activeSubDevice->getRegularEngineGroups();
+    uint32_t subDeviceNumEngineGroups = static_cast<uint32_t>(subDeviceEngineGroups.size());
+
+    const auto &hardwareInfo = activeSubDevice->getHardwareInfo();
+    auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+    auto &l0HwHelper = L0HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+
+    for (uint32_t subDeviceQueueGroupsIter = 0; subDeviceQueueGroupsIter < subDeviceNumEngineGroups; subDeviceQueueGroupsIter++) {
+        if (subDeviceEngineGroups[subDeviceQueueGroupsIter].engineGroupType == NEO::EngineGroupType::Copy ||
+            subDeviceEngineGroups[subDeviceQueueGroupsIter].engineGroupType == NEO::EngineGroupType::LinkedCopy) {
+
+            if (pCommandQueueGroupProperties) {
+                pCommandQueueGroupProperties[numCopyEngineGroupsInSubDevice].flags = ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY;
+                pCommandQueueGroupProperties[numCopyEngineGroupsInSubDevice].maxMemoryFillPatternSize =
+                    hwHelper.getMaxFillPaternSizeForCopyEngine();
+
+                l0HwHelper.setAdditionalGroupProperty(pCommandQueueGroupProperties[numCopyEngineGroupsInSubDevice], subDeviceEngineGroups[subDeviceQueueGroupsIter].engineGroupType);
+                pCommandQueueGroupProperties[numCopyEngineGroupsInSubDevice].numQueues =
+                    static_cast<uint32_t>(subDeviceEngineGroups[subDeviceQueueGroupsIter].engines.size());
+            }
+            numCopyEngineGroupsInSubDevice++;
+
+            if (numCopyEngineGroupsInSubDevice == numberOfSubDeviceCopyEngineGroupsRequested) {
+                break;
+            }
+        }
+    }
+
+    return numCopyEngineGroupsInSubDevice;
+}
+
 ze_result_t DeviceImp::getCommandQueueGroupProperties(uint32_t *pCount,
                                                       ze_command_queue_group_properties_t *pCommandQueueGroupProperties) {
     NEO::Device *activeDevice = getActiveDevice();
     auto &engineGroups = activeDevice->getRegularEngineGroups();
     uint32_t numEngineGroups = static_cast<uint32_t>(engineGroups.size());
 
+    uint32_t numSubDeviceCopyEngineGroups = getCopyQueueGroupsFromSubDevice(std::numeric_limits<uint32_t>::max(), nullptr);
+
+    uint32_t totalEngineGroups = numEngineGroups + numSubDeviceCopyEngineGroups;
+
     if (*pCount == 0) {
-        *pCount = numEngineGroups;
+        *pCount = totalEngineGroups;
         return ZE_RESULT_SUCCESS;
     }
@@ -215,8 +260,8 @@ ze_result_t DeviceImp::getCommandQueueGroupProperties(uint32_t *pCount,
     auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
     auto &l0HwHelper = L0HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
 
-    *pCount = std::min(numEngineGroups, *pCount);
-    for (uint32_t i = 0; i < *pCount; i++) {
+    *pCount = std::min(totalEngineGroups, *pCount);
+    for (uint32_t i = 0; i < std::min(numEngineGroups, *pCount); i++) {
         if (engineGroups[i].engineGroupType == NEO::EngineGroupType::RenderCompute) {
             pCommandQueueGroupProperties[i].flags = ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE |
                                                     ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY |
@@ -238,6 +283,11 @@ ze_result_t DeviceImp::getCommandQueueGroupProperties(uint32_t *pCount,
         pCommandQueueGroupProperties[i].numQueues = static_cast<uint32_t>(engineGroups[i].engines.size());
     }
 
+    if (*pCount > numEngineGroups) {
+        uint32_t remainingEngineGroups = *pCount - numEngineGroups;
+        getCopyQueueGroupsFromSubDevice(remainingEngineGroups, &pCommandQueueGroupProperties[numEngineGroups]);
+    }
+
     return ZE_RESULT_SUCCESS;
 }
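For context, the change above is observable from an application through the standard two-call pattern of zeDeviceGetCommandQueueGroupProperties. A minimal sketch (the helper name is illustrative; it assumes a root-device handle on a multi-tile part with implicit scaling enabled):

```cpp
#include <level_zero/ze_api.h>

#include <cstdio>
#include <vector>

// With implicit scaling enabled, the root device now also reports the
// copy-engine groups of sub-device 0, appended after its own engine groups.
void listQueueGroups(ze_device_handle_t hDevice) {
    uint32_t count = 0;
    // First call: query how many queue groups the device exposes.
    zeDeviceGetCommandQueueGroupProperties(hDevice, &count, nullptr);

    std::vector<ze_command_queue_group_properties_t> props(count);
    for (auto &p : props) {
        p.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
        p.pNext = nullptr;
    }
    // Second call: fill the properties array.
    zeDeviceGetCommandQueueGroupProperties(hDevice, &count, props.data());

    for (uint32_t i = 0; i < count; i++) {
        const bool copyOnly =
            (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) &&
            !(props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE);
        std::printf("group %u: numQueues=%u%s\n", i, props[i].numQueues,
                    copyOnly ? " (copy-only)" : "");
    }
}
```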
@@ -55,6 +55,8 @@ struct DeviceImp : public Device {
     ze_result_t setCacheAdvice(void *ptr, size_t regionSize, ze_cache_ext_region_t cacheRegion) override;
     ze_result_t imageGetProperties(const ze_image_desc_t *desc, ze_image_properties_t *pImageProperties) override;
     ze_result_t getDeviceImageProperties(ze_device_image_properties_t *pDeviceImageProperties) override;
+    uint32_t getCopyQueueGroupsFromSubDevice(uint32_t numberOfSubDeviceCopyEngineGroupsRequested,
+                                             ze_command_queue_group_properties_t *pCommandQueueGroupProperties);
     ze_result_t getCommandQueueGroupProperties(uint32_t *pCount,
                                                ze_command_queue_group_properties_t *pCommandQueueGroupProperties) override;
     ze_result_t getExternalMemoryProperties(ze_device_external_memory_properties_t *pExternalMemoryProperties) override;
@@ -200,5 +200,93 @@ struct MultipleDevicesWithCustomHwInfo {
     const uint32_t numSubDevices = 2u;
 };
 
+template <uint32_t copyEngineCount, uint32_t implicitScaling>
+struct SingleRootMultiSubDeviceFixtureWithImplicitScaling : public MultiDeviceFixture {
+    NEO::MockCompilerEnableGuard compilerMock = NEO::MockCompilerEnableGuard(true);
+
+    DebugManagerStateRestore restorer;
+    std::unique_ptr<Mock<L0::DriverHandleImp>> driverHandle;
+    std::vector<NEO::Device *> devices;
+    uint32_t numRootDevices = 1u;
+    uint32_t numSubDevices = 2u;
+    L0::ContextImp *context = nullptr;
+
+    L0::Device *device = nullptr;
+    NEO::Device *neoDevice = nullptr;
+    L0::DeviceImp *deviceImp = nullptr;
+
+    NEO::HardwareInfo hwInfo;
+    uint32_t expectedCopyEngineCount = copyEngineCount;
+    uint32_t expectedComputeEngineCount = 0;
+
+    uint32_t numEngineGroups = 0;
+    uint32_t subDeviceNumEngineGroups = 0;
+
+    void SetUp() { // NOLINT(readability-identifier-naming)
+        DebugManager.flags.EnableImplicitScaling.set(implicitScaling);
+        DebugManager.flags.CreateMultipleRootDevices.set(numRootDevices);
+        DebugManager.flags.CreateMultipleSubDevices.set(numSubDevices);
+
+        NEO::HardwareInfo hwInfo = *NEO::defaultHwInfo.get();
+        hwInfo.featureTable.flags.ftrRcsNode = false;
+        hwInfo.featureTable.flags.ftrCCSNode = true;
+        // hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled = 4;
+        if (expectedCopyEngineCount != 0) {
+            hwInfo.capabilityTable.blitterOperationsSupported = true;
+            hwInfo.featureTable.ftrBcsInfo = maxNBitValue(expectedCopyEngineCount);
+        } else {
+            hwInfo.capabilityTable.blitterOperationsSupported = false;
+        }
+
+        if (implicitScaling) {
+            expectedComputeEngineCount = 1u;
+        } else {
+            expectedComputeEngineCount = hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled;
+        }
+
+        MockDevice *mockDevice = MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo, 0);
+
+        NEO::DeviceVector devices;
+        devices.push_back(std::unique_ptr<NEO::Device>(mockDevice));
+
+        driverHandle = std::make_unique<Mock<L0::DriverHandleImp>>();
+        ze_result_t res = driverHandle->initialize(std::move(devices));
+        EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+
+        ze_context_handle_t hContext;
+        ze_context_desc_t desc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+        res = driverHandle->createContext(&desc, 0u, nullptr, &hContext);
+        EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+        context = static_cast<ContextImp *>(Context::fromHandle(hContext));
+
+        device = driverHandle->devices[0];
+        neoDevice = device->getNEODevice();
+        deviceImp = static_cast<L0::DeviceImp *>(device);
+
+        NEO::Device *activeDevice = deviceImp->getActiveDevice();
+        auto &engineGroups = activeDevice->getRegularEngineGroups();
+        numEngineGroups = static_cast<uint32_t>(engineGroups.size());
+
+        if (activeDevice->getSubDevices().size() > 0) {
+            NEO::Device *activeSubDevice = activeDevice->getSubDevice(0u);
+            auto &subDeviceEngineGroups = activeSubDevice->getRegularEngineGroups();
+
+            for (uint32_t i = 0; i < subDeviceEngineGroups.size(); i++) {
+                if (subDeviceEngineGroups[i].engineGroupType == NEO::EngineGroupType::Copy ||
+                    subDeviceEngineGroups[i].engineGroupType == NEO::EngineGroupType::LinkedCopy) {
+                    subDeviceNumEngineGroups += 1;
+                }
+            }
+        }
+    }
+
+    void TearDown() { // NOLINT(readability-identifier-naming)
+        context->destroy();
+    }
+};
+
 } // namespace ult
 } // namespace L0
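For reference, ftrBcsInfo used in the fixture above is a bitmask of enabled copy engines, and maxNBitValue(n) yields a mask with the n lowest bits set, so instantiating the fixture with copyEngineCount = 9 enables the main BCS plus eight link BCSs. A self-contained sketch of that mask arithmetic (the helper is re-declared here for illustration; in the driver it comes from the shared helpers):

```cpp
#include <bitset>
#include <cstdint>
#include <iostream>

// Illustrative re-declaration: a mask with the n lowest bits set.
constexpr uint64_t maxNBitValue(uint64_t n) {
    return (1ull << n) - 1;
}

int main() {
    // Nine copy engines -> bits 0..8 set (main BCS + 8 link BCSs).
    uint64_t ftrBcsInfo = maxNBitValue(9);
    std::cout << std::bitset<16>(ftrBcsInfo) << "\n"; // prints 0000000111111111
    return 0;
}
```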
@@ -124,6 +124,110 @@ HWTEST2_F(DeviceTestXeHpc, givenXeHpcBStepWhenCreatingMultiTileDeviceThenExpectI
     delete device;
 }
 
+using MultiDeviceCommandQueueGroupWithNineCopyEnginesTest = Test<SingleRootMultiSubDeviceFixtureWithImplicitScaling<9, 1>>;
+
+HWTEST2_F(MultiDeviceCommandQueueGroupWithNineCopyEnginesTest, givenMainAndLinkCopyEngineSupportAndCCSAndImplicitScalingThenExpectedQueueGroupsAreReturned, IsXeHpcCore) {
+    uint32_t count = 0;
+    ze_result_t res = deviceImp->getCommandQueueGroupProperties(&count, nullptr);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+    EXPECT_EQ(count, numEngineGroups + subDeviceNumEngineGroups);
+
+    std::vector<ze_command_queue_group_properties_t> properties(count);
+    res = deviceImp->getCommandQueueGroupProperties(&count, properties.data());
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+
+    uint32_t numCopyQueues = 0;
+    for (uint32_t i = 0; i < count; i++) {
+        if (properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+            EXPECT_EQ(properties[i].numQueues, 1u);
+        } else if ((properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) &&
+                   !(properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) {
+            numCopyQueues += properties[i].numQueues;
+        }
+    }
+    EXPECT_EQ(numCopyQueues, expectedCopyEngineCount);
+}
+
+HWTEST2_F(MultiDeviceCommandQueueGroupWithNineCopyEnginesTest, givenMainAndLinkCopyEngineSupportAndCCSAndImplicitScalingWhenRequestingFewerGroupsThenExpectedGroupsAreReturned, IsXeHpcCore) {
+    uint32_t count = 0;
+    ze_result_t res = deviceImp->getCommandQueueGroupProperties(&count, nullptr);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+    EXPECT_EQ(count, numEngineGroups + subDeviceNumEngineGroups);
+
+    count--;
+    std::vector<ze_command_queue_group_properties_t> properties(count);
+    res = deviceImp->getCommandQueueGroupProperties(&count, properties.data());
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+
+    uint32_t numCopyQueues = 0;
+    for (uint32_t i = 0; i < count; i++) {
+        if (properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+            EXPECT_EQ(properties[i].numQueues, 1u);
+        } else if ((properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) &&
+                   !(properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) {
+            numCopyQueues += properties[i].numQueues;
+        }
+    }
+    EXPECT_LE(numCopyQueues, expectedCopyEngineCount);
+}
+
+HWTEST2_F(MultiDeviceCommandQueueGroupWithNineCopyEnginesTest, givenMainAndLinkCopyEngineSupportAndCCSAndImplicitScalingWhenRequestingOnlyOneGroupThenOneQueueGroupIsReturned, IsXeHpcCore) {
+    uint32_t count = 0;
+    ze_result_t res = deviceImp->getCommandQueueGroupProperties(&count, nullptr);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+    EXPECT_EQ(count, numEngineGroups + subDeviceNumEngineGroups);
+
+    count = 1;
+    std::vector<ze_command_queue_group_properties_t> properties(count);
+    res = deviceImp->getCommandQueueGroupProperties(&count, properties.data());
+    EXPECT_EQ(count, 1u);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+
+    for (uint32_t i = 0; i < count; i++) {
+        if (properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+            EXPECT_EQ(properties[i].numQueues, expectedComputeEngineCount);
+        }
+    }
+}
+
+using MultiDeviceCommandQueueGroupWithNoCopyEnginesTest = Test<SingleRootMultiSubDeviceFixtureWithImplicitScaling<0, 1>>;
+HWTEST2_F(MultiDeviceCommandQueueGroupWithNoCopyEnginesTest,
+          givenNoCopyEngineSupportAndCCSAndImplicitScalingThenExpectedQueueGroupsAreReturned, IsXeHpcCore) {
+    uint32_t count = 0;
+    ze_result_t res = deviceImp->getCommandQueueGroupProperties(&count, nullptr);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+    EXPECT_EQ(count, numEngineGroups);
+
+    std::vector<ze_command_queue_group_properties_t> properties(count);
+    res = deviceImp->getCommandQueueGroupProperties(&count, properties.data());
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+
+    for (uint32_t i = 0; i < count; i++) {
+        if (properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+            EXPECT_EQ(properties[i].numQueues, expectedComputeEngineCount);
+        }
+    }
+}
+
+using MultiDeviceCommandQueueGroupWithNoCopyEnginesAndNoImplicitScalingTest = Test<SingleRootMultiSubDeviceFixtureWithImplicitScaling<0, 0>>;
+HWTEST2_F(MultiDeviceCommandQueueGroupWithNoCopyEnginesAndNoImplicitScalingTest,
+          givenNoCopyEngineSupportAndCCSAndNoImplicitScalingThenOnlyTheQueueGroupsFromSubDeviceAreReturned, IsXeHpcCore) {
+    uint32_t count = 0;
+    ze_result_t res = deviceImp->getCommandQueueGroupProperties(&count, nullptr);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+    EXPECT_EQ(count, numEngineGroups);
+
+    std::vector<ze_command_queue_group_properties_t> properties(count);
+    res = deviceImp->getCommandQueueGroupProperties(&count, properties.data());
+    EXPECT_EQ(ZE_RESULT_SUCCESS, res);
+
+    for (uint32_t i = 0; i < count; i++) {
+        if (properties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
+            EXPECT_EQ(properties[i].numQueues, expectedComputeEngineCount);
+        }
+    }
+}
+
 using CommandQueueGroupTest = Test<DeviceFixture>;
 
 HWTEST2_F(CommandQueueGroupTest, givenNoBlitterSupportAndNoCCSThenOneQueueGroupIsReturned, IsXeHpcCore) {
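The fewer-groups tests above exercise the capping path added to getCommandQueueGroupProperties: the root device's own groups are written first, and only the remaining slots are filled with sub-device copy groups. A minimal sketch of that request pattern through the public API (hypothetical helper; error handling omitted):

```cpp
#include <level_zero/ze_api.h>

// Ask for a single queue group; the driver caps *pCount to 1 and writes
// only the first group, which is ordered before the appended copy groups.
inline ze_command_queue_group_properties_t queryFirstQueueGroup(ze_device_handle_t hDevice) {
    uint32_t count = 1;
    ze_command_queue_group_properties_t prop{};
    prop.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
    zeDeviceGetCommandQueueGroupProperties(hDevice, &count, &prop);
    return prop; // with implicit scaling, typically the single compute group
}
```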
@@ -36,7 +36,12 @@ When doing allocations in implicit scaling mode, driver *colors* an allocation a
 
 When scheduling a kernel for execution, the driver distributes the kernel workgroups among the available tiles. The default mechanism is called *Static Partitioning*, where the workgroups are evenly distributed among tiles. For instance, in a 2-tile system, half of the workgroups go to tile 0, and the other half to tile 1.
 
-The number of CCSs, or compute engines, currently available with implicit scaing on the root device is one. This is because with implicit scaling the driver automatically uses all the EUs available in the device, so no other CCSs are exposed. Even though only one CCS is exposed, multiple kernels submitted to the root device using implicit scaling may execute concurrently on PVC, depending on EU availability. On XeHP_SDV, they may be serialized. See [Limitations](#Limitations) section below.
+The number of CCSs, or compute engines, currently available with implicit scaling on the root device is one. This is because with implicit scaling the driver automatically uses all the EUs available in the device, so no other CCSs are exposed. Even though only one CCS is exposed, multiple kernels submitted to the root device using implicit scaling may execute concurrently on PVC, depending on EU availability. On XeHP_SDV, they may be serialized. See the [Limitations](#Limitations) section below.
+
+No implicit scaling support is available for BCSs. Considering that, two models are followed for the discovery of copy engines:
+
+* In Level Zero, the copy engines from sub-device 0 are also exposed in the root device. This is to align the engine model across the implicit-scaling and non-implicit-scaling scenarios.
+* In OpenCL, copy engines are not exposed in the root device.
 
 Since implicit scaling is only done for EUs, which are associated only with kernels submitted to the CCS, BCSs are currently not being exposed, and access to them is done through sub-device handles.
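As a usage illustration of the Level Zero model above, an application can select a copy-only group ordinal on the root device and create a command queue on it, without going through sub-device handles. A minimal sketch (hypothetical helper; error handling omitted; hContext and hDevice are assumed to exist):

```cpp
#include <level_zero/ze_api.h>

#include <vector>

// Find a copy-only group on the root device and create a queue on it.
// With implicit scaling, these groups are the ones mirrored from sub-device 0.
ze_command_queue_handle_t createCopyQueue(ze_context_handle_t hContext,
                                          ze_device_handle_t hDevice) {
    uint32_t count = 0;
    zeDeviceGetCommandQueueGroupProperties(hDevice, &count, nullptr);
    std::vector<ze_command_queue_group_properties_t> props(count);
    for (auto &p : props) {
        p.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES;
    }
    zeDeviceGetCommandQueueGroupProperties(hDevice, &count, props.data());

    uint32_t ordinal = count; // invalid until a copy-only group is found
    for (uint32_t i = 0; i < count; i++) {
        if ((props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) &&
            !(props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) {
            ordinal = i;
            break;
        }
    }
    if (ordinal == count) {
        return nullptr; // no copy-only group (e.g., no blitter support)
    }

    ze_command_queue_desc_t desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    desc.ordinal = ordinal;
    desc.index = 0;
    desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;

    ze_command_queue_handle_t hQueue = nullptr;
    zeCommandQueueCreate(hContext, hDevice, &desc, &hQueue);
    return hQueue;
}
```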