Assign engine to command queue using round robin algorithm

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2021-10-28 07:52:24 +00:00
committed by Compute-Runtime-Automation
parent cc2ba84fc8
commit 1c68ac1cbc
20 changed files with 215 additions and 3 deletions

View File

@@ -75,6 +75,7 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
gpgpuEngine = &device->getDefaultEngine();
UNRECOVERABLE_IF(gpgpuEngine->getEngineType() >= aub_stream::EngineType::NUM_ENGINES);
bool bcsAllowed = hwInfoConfig->isBlitterFullySupported(hwInfo) &&

View File

@@ -248,6 +248,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
void allocateHeapMemory(IndirectHeap::Type heapType,
size_t minRequiredSize, IndirectHeap *&indirectHeap);
static bool isAssignEngineRoundRobinEnabled();
MOCKABLE_VIRTUAL void releaseIndirectHeap(IndirectHeap::Type heapType);
void releaseVirtualEvent() {

View File

@@ -63,6 +63,24 @@ class CommandQueueHw : public CommandQueue {
this->gpgpuEngine = &device->getInternalEngine();
}
auto &hwInfo = device->getDevice().getHardwareInfo();
auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
auto assignEngineRoundRobin =
!internalUsage &&
!this->queueFamilySelected &&
!(clPriority & static_cast<cl_queue_priority_khr>(CL_QUEUE_PRIORITY_LOW_KHR)) &&
hwHelper.isAssignEngineRoundRobinSupported() &&
this->isAssignEngineRoundRobinEnabled();
if (DebugManager.flags.EnableCmdQRoundRobindEngineAssign.get() != -1) {
assignEngineRoundRobin = DebugManager.flags.EnableCmdQRoundRobindEngineAssign.get();
}
if (assignEngineRoundRobin) {
this->gpgpuEngine = &device->getDevice().getNextEngineForCommandQueue();
}
if (getCmdQueueProperties<cl_queue_properties>(properties, CL_QUEUE_PROPERTIES) & static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)) {
getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);
if (DebugManager.flags.CsrDispatchMode.get() != 0) {
@@ -77,8 +95,6 @@ class CommandQueueHw : public CommandQueue {
auto &stateSaveAreaHeader = SipKernel::getSipKernel(device->getDevice()).getStateSaveAreaHeader();
if (stateSaveAreaHeader.size() > 0) {
auto &hwInfo = device->getDevice().getHardwareInfo();
auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
NEO::MemoryTransferHelper::transferMemoryToAllocation(hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo, *debugSurface),
device->getDevice(), debugSurface, 0, stateSaveAreaHeader.data(),
stateSaveAreaHeader.size());

View File

@@ -12,6 +12,7 @@ endif()
set(RUNTIME_SRCS_DLL_BASE
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/command_queue_dll.cpp
${NEO_SHARED_DIRECTORY}/dll/create_deferred_deleter.cpp
${NEO_SHARED_DIRECTORY}/dll/create_memory_manager_${DRIVER_MODEL}.cpp
${NEO_SHARED_DIRECTORY}/dll/create_tbx_sockets.cpp

View File

@@ -0,0 +1,14 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "opencl/source/command_queue/command_queue.h"
namespace NEO {
// Reports whether round-robin engine assignment is enabled for command queues.
// This definition enables it unconditionally.
bool CommandQueue::isAssignEngineRoundRobinEnabled() {
    constexpr bool roundRobinEnabled = true;
    return roundRobinEnabled;
}
} // namespace NEO

View File

@@ -19,6 +19,7 @@
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/test/unit_test/mocks/mock_cl_device.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
#include "opencl/test/unit_test/mocks/mock_platform.h"
#include "opencl/test/unit_test/mocks/mock_program.h"
@@ -934,6 +935,123 @@ HWTEST_F(EngineInstancedDeviceTests, givenEngineInstancedDeviceWhenCreatingProgr
EXPECT_EQ(clSubSubDevice1, associatedSubDevices[1]);
}
// With the ULT round-robin switch turned on, the N-th created command queue must use
// engine (N % groupSize) of the engine group resolved from the device's default engine.
HWTEST_F(EngineInstancedDeviceTests, whenCreateMultipleCommandQueuesThenEnginesAreAssignedUsingRoundRobin) {
    // Opt in for this test only. NOTE(review): the backup object presumably restores
    // ultHwConfig when it goes out of scope - confirm VariableBackup semantics.
    VariableBackup<UltHwConfig> backup(&ultHwConfig);
    ultHwConfig.useRoundRobindEngineAssign = true;

    constexpr uint32_t genericDevicesCount = 1;
    constexpr uint32_t ccsCount = 4;
    if (!createDevices(genericDevicesCount, ccsCount)) {
        GTEST_SKIP();
    }

    auto &hwInfo = rootDevice->getHardwareInfo();
    EXPECT_EQ(ccsCount, hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled);

    auto clRootDevice = std::make_unique<ClDevice>(*rootDevice, nullptr);
    cl_device_id deviceIds[] = {clRootDevice.get()};
    ClDeviceVector deviceVector{deviceIds, 1};
    MockContext context(deviceVector);

    // All queues are created up front, so a queue's position in the array is its creation order.
    constexpr size_t queueCount = 24;
    std::array<std::unique_ptr<MockCommandQueueHw<FamilyType>>, queueCount> cmdQs;
    for (size_t queueIndex = 0; queueIndex < queueCount; ++queueIndex) {
        cmdQs[queueIndex] = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, clRootDevice.get(), nullptr);
    }

    // Look up the engine group from the default engine's type and usage.
    const auto &defaultEngine = clRootDevice->getDefaultEngine();
    const auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    const auto engineGroupType = hwHelper.getEngineGroupType(defaultEngine.getEngineType(), defaultEngine.getEngineUsage(), hwInfo);
    auto groupIndex = clRootDevice->getDevice().getIndexOfNonEmptyEngineGroup(engineGroupType);
    auto engines = clRootDevice->getDevice().getEngineGroups()[groupIndex];

    size_t queueIndex = 0;
    for (auto &cmdQ : cmdQs) {
        auto expectedCsr = engines[queueIndex % engines.size()].commandStreamReceiver;
        auto csr = &cmdQ->getGpgpuCommandStreamReceiver();
        EXPECT_EQ(csr, expectedCsr);
        ++queueIndex;
    }
}
// With the EnableCmdQRoundRobindEngineAssign debug flag set to 1, the N-th created command
// queue must use engine (N % groupSize) of the engine group resolved from the default engine.
HWTEST_F(EngineInstancedDeviceTests, givenEnableCmdQRoundRobindEngineAssignEnabledWhenCreateMultipleCommandQueuesThenEnginesAreAssignedUsingRoundRobin) {
    // NOTE(review): the restorer presumably resets debug flags on scope exit - confirm.
    DebugManagerStateRestore restorer;
    DebugManager.flags.EnableCmdQRoundRobindEngineAssign.set(1);

    constexpr uint32_t genericDevicesCount = 1;
    constexpr uint32_t ccsCount = 4;
    if (!createDevices(genericDevicesCount, ccsCount)) {
        GTEST_SKIP();
    }

    auto &hwInfo = rootDevice->getHardwareInfo();
    EXPECT_EQ(ccsCount, hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled);

    auto clRootDevice = std::make_unique<ClDevice>(*rootDevice, nullptr);
    cl_device_id deviceIds[] = {clRootDevice.get()};
    ClDeviceVector deviceVector{deviceIds, 1};
    MockContext context(deviceVector);

    // All queues are created up front, so a queue's position in the array is its creation order.
    constexpr size_t queueCount = 24;
    std::array<std::unique_ptr<MockCommandQueueHw<FamilyType>>, queueCount> cmdQs;
    for (size_t queueIndex = 0; queueIndex < queueCount; ++queueIndex) {
        cmdQs[queueIndex] = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, clRootDevice.get(), nullptr);
    }

    // Look up the engine group from the default engine's type and usage.
    const auto &defaultEngine = clRootDevice->getDefaultEngine();
    const auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    const auto engineGroupType = hwHelper.getEngineGroupType(defaultEngine.getEngineType(), defaultEngine.getEngineUsage(), hwInfo);
    auto groupIndex = clRootDevice->getDevice().getIndexOfNonEmptyEngineGroup(engineGroupType);
    auto engines = clRootDevice->getDevice().getEngineGroups()[groupIndex];

    size_t queueIndex = 0;
    for (auto &cmdQ : cmdQs) {
        auto expectedCsr = engines[queueIndex % engines.size()].commandStreamReceiver;
        auto csr = &cmdQ->getGpgpuCommandStreamReceiver();
        EXPECT_EQ(csr, expectedCsr);
        ++queueIndex;
    }
}
// The debug flag set to 0 must win over the ULT round-robin switch: every created command
// queue is expected to use the default engine's command stream receiver.
HWTEST_F(EngineInstancedDeviceTests, givenEnableCmdQRoundRobindEngineAssignDisabledWenCreateMultipleCommandQueuesThenDefaultEngineAssigned) {
    // Declaration order of the two guards is kept as-is so they unwind in the same order.
    DebugManagerStateRestore restorer;
    DebugManager.flags.EnableCmdQRoundRobindEngineAssign.set(0);
    VariableBackup<UltHwConfig> backup(&ultHwConfig);
    ultHwConfig.useRoundRobindEngineAssign = true;

    constexpr uint32_t genericDevicesCount = 1;
    constexpr uint32_t ccsCount = 4;
    if (!createDevices(genericDevicesCount, ccsCount)) {
        GTEST_SKIP();
    }

    auto &hwInfo = rootDevice->getHardwareInfo();
    EXPECT_EQ(ccsCount, hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled);

    auto clRootDevice = std::make_unique<ClDevice>(*rootDevice, nullptr);
    cl_device_id deviceIds[] = {clRootDevice.get()};
    ClDeviceVector deviceVector{deviceIds, 1};
    MockContext context(deviceVector);

    constexpr size_t queueCount = 24;
    std::array<std::unique_ptr<MockCommandQueueHw<FamilyType>>, queueCount> cmdQs;
    for (size_t queueIndex = 0; queueIndex < queueCount; ++queueIndex) {
        cmdQs[queueIndex] = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, clRootDevice.get(), nullptr);
    }

    const auto &defaultEngine = clRootDevice->getDefaultEngine();
    for (size_t queueIndex = 0; queueIndex < queueCount; ++queueIndex) {
        auto expectedCsr = defaultEngine.commandStreamReceiver;
        auto csr = &cmdQs[queueIndex]->getGpgpuCommandStreamReceiver();
        EXPECT_EQ(csr, expectedCsr);
    }
}
TEST(SubDevicesTest, whenInitializeRootCsrThenDirectSubmissionIsNotInitialized) {
auto device = std::make_unique<MockDevice>();
device->initializeRootCommandStreamReceiver();

View File

@@ -84,6 +84,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HwHelperTestXeHPAndLater, givenXeHPAndLaterPlatform
EXPECT_TRUE(hwHelper.timestampPacketWriteSupported());
}
// The hardware helper for the tested family must report round-robin engine assignment as supported.
HWCMDTEST_F(IGFX_XE_HP_CORE, HwHelperTestXeHPAndLater, givenXeHPAndLaterPlatformWhenCheckAssignEngineRoundRobinSupportedThenReturnTrue) {
    EXPECT_TRUE(HwHelperHw<FamilyType>::get().isAssignEngineRoundRobinSupported());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, HwHelperTestXeHPAndLater, givenAllFlagsSetWhenGetGpgpuEnginesThenReturnThreeRcsEnginesFourCcsEnginesAndOneBcsEngine) {
HardwareInfo hwInfo = *defaultHwInfo;
hwInfo.featureTable.ftrCCSNode = true;

View File

@@ -34,6 +34,7 @@ add_library(igdrcl_libult OBJECT EXCLUDE_FROM_ALL
)
set(IGDRCL_SRCS_LIB_ULT_ENV
${CMAKE_CURRENT_SOURCE_DIR}/command_queue_ult.cpp
${NEO_SOURCE_DIR}/shared/test/common/helpers/custom_event_listener.h
${NEO_SOURCE_DIR}/opencl/test/unit_test/main.cpp
${NEO_SOURCE_DIR}/opencl/test/unit_test/command_queue/command_queue_fixture.cpp

View File

@@ -0,0 +1,16 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/test/common/helpers/ult_hw_config.h"
#include "opencl/source/command_queue/command_queue.h"
namespace NEO {
// Reports whether round-robin engine assignment is enabled for command queues.
// This definition defers to the global ultHwConfig, so each test can opt in through
// ultHwConfig.useRoundRobindEngineAssign.
bool CommandQueue::isAssignEngineRoundRobinEnabled() {
    const bool enabledByUltConfig = ultHwConfig.useRoundRobindEngineAssign;
    return enabledByUltConfig;
}
} // namespace NEO

View File

@@ -27,6 +27,7 @@ add_executable(igdrcl_${target_name}
${NEO_SHARED_DIRECTORY}/dll/linux/drm_neo_create.cpp
${NEO_SHARED_DIRECTORY}/dll/linux/options_linux.cpp
${NEO_SHARED_DIRECTORY}/dll/linux/os_interface.cpp
${NEO_SOURCE_DIR}/opencl/source/dll/command_queue_dll.cpp
${NEO_SOURCE_DIR}/opencl/source/os_interface/linux/platform_teardown_linux.cpp
${NEO_SOURCE_DIR}/opencl/test/unit_test/linux${BRANCH_DIR_SUFFIX}drm_other_requests.cpp
)

View File

@@ -22,6 +22,7 @@
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/os_interface/linux/device_command_stream_fixture.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/platform/platform.h"
#include "opencl/test/unit_test/linux/drm_wrap.h"
#include "opencl/test/unit_test/linux/mock_os_layer.h"
@@ -802,6 +803,10 @@ TEST(DirectSubmissionControllerTest, whenCheckDirectSubmissionControllerSupportT
EXPECT_TRUE(DirectSubmissionController::isSupported());
}
// The CommandQueue definition linked into this test binary must report round-robin
// engine assignment as enabled.
TEST(CommandQueueTest, whenCheckEngineRoundRobinAssignThenReturnsTrue) {
    const bool roundRobinEnabled = CommandQueue::isAssignEngineRoundRobinEnabled();
    EXPECT_TRUE(roundRobinEnabled);
}
TEST(PlatformsDestructor, whenGlobalPlatformsDestructorIsCalledThenGlobalPlatformsAreDestroyed) {
EXPECT_NE(nullptr, platformsImpl);
platformsDestructor();

View File

@@ -313,6 +313,7 @@ OverrideNotifyEnableForTagUpdatePostSync = -1
OverrideUseKmdWaitFunction = -1
EnableCacheFlushAfterWalkerForAllQueues = -1
Force32BitDriverSupport = -1
EnableCmdQRoundRobindEngineAssign = -1
OverrideCmdQueueSynchronousMode = -1
UseAtomicsForSelfCleanupSection = -1
HBMSizePerTileInGigabytes = 0

View File

@@ -307,6 +307,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableUserFenceForCompletionWait, -1, "-1: defau
DECLARE_DEBUG_VARIABLE(int32_t, EnableUserFenceUseCtxId, -1, "-1: default (disabled), 0: disable, 1: enable : Use Context Id in Wait User Fence when waiting for completion tag")
DECLARE_DEBUG_VARIABLE(int32_t, SetKmdWaitTimeout, -1, "-1: default (infinity), >0: amount of time units for wait function timeout")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideNotifyEnableForTagUpdatePostSync, -1, "-1: default (usage determined by user fence wait call), 0: disable use of NotifyEnable flag, 1: enable use NotifyEnable flag")
// Overrides round-robin engine assignment at command queue creation: -1 keeps the driver's own decision, any other value replaces it (0 turns it off, 1 turns it on).
DECLARE_DEBUG_VARIABLE(int32_t, EnableCmdQRoundRobindEngineAssign, -1, "-1: default, 0: disable, 1: enable")
DECLARE_DEBUG_VARIABLE(int32_t, Force32BitDriverSupport, -1, "-1: default, 0: disable, 1: enable, Forces the driver to support 32 bit.")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicPipelineSelect, -1, "set SYSTOLIC MODE ENABLE in PIPELINE_SELECT cmd, -1:default, 0:disable, 1:enable")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicInComputeWalker, -1, "set SYSTOLIC MODE ENABLE in COMPUTE_WALKER cmd, -1:default, 0:disable, 1:enable")

View File

@@ -565,6 +565,20 @@ EngineControl &Device::getInternalEngine() {
return this->getNearestGenericSubDevice(0)->getEngine(engineType, EngineUsage::Internal);
}
// Selects the engine for the next command queue created on this device.
//
// The engine group is resolved from the default engine's type and usage; engines of that
// group are then handed out in order, one per call, wrapping around at the group's size.
// Each call post-increments the per-device atomic counter, so the counter value read here
// is the number of calls made before this one.
//
// Returns a reference to the selected EngineControl inside the device's engine groups.
EngineControl &Device::getNextEngineForCommandQueue() {
    const auto &defaultEngine = this->getDefaultEngine();
    const auto &hardwareInfo = this->getHardwareInfo();
    const auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
    const auto engineGroupType = hwHelper.getEngineGroupType(defaultEngine.getEngineType(), defaultEngine.getEngineUsage(), hardwareInfo);

    const auto defaultEngineGroupIndex = this->getIndexOfNonEmptyEngineGroup(engineGroupType);
    // Bind by reference: a by-value `auto` here would copy the whole engine group on every
    // command queue creation just to read its size.
    auto &engines = this->getEngineGroups()[defaultEngineGroupIndex];

    // NOTE(review): the modulo assumes the selected group is non-empty (as the lookup's name
    // suggests); an empty group would divide by zero - confirm against getIndexOfNonEmptyEngineGroup.
    const auto engineIndex = this->regularCommandQueuesCreatedWithinDeviceCount++ % engines.size();
    return engines[engineIndex];
}
EngineControl *Device::getInternalCopyEngine() {
if (!getHardwareInfo().capabilityTable.blitterOperationsSupported) {
return nullptr;

View File

@@ -65,6 +65,7 @@ class Device : public ReferenceTrackedObject<Device> {
size_t getIndexOfNonEmptyEngineGroup(EngineGroupType engineGroupType) const;
EngineControl &getEngine(uint32_t index);
EngineControl &getDefaultEngine();
EngineControl &getNextEngineForCommandQueue();
EngineControl &getInternalEngine();
EngineControl *getInternalCopyEngine();
SelectorCopyEngine &getSelectorCopyEngine();
@@ -172,6 +173,7 @@ class Device : public ReferenceTrackedObject<Device> {
aub_stream::EngineType engineInstancedType = aub_stream::EngineType::NUM_ENGINES;
uint32_t defaultEngineIndex = 0;
uint32_t numSubDevices = 0;
std::atomic_uint32_t regularCommandQueuesCreatedWithinDeviceCount{0};
bool hasGenericSubDevices = false;
bool engineInstanced = false;
bool rootCsrCreated = false;

View File

@@ -108,6 +108,7 @@ class HwHelper {
virtual bool useOnlyGlobalTimestamps() const = 0;
virtual bool useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const = 0;
virtual bool packedFormatsSupported() const = 0;
virtual bool isAssignEngineRoundRobinSupported() const = 0;
virtual bool isRcsAvailable(const HardwareInfo &hwInfo) const = 0;
virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const HardwareInfo &hwInfo) const = 0;
virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
@@ -357,6 +358,8 @@ class HwHelperHw : public HwHelper {
bool additionalPipeControlArgsRequired() const override;
bool isAssignEngineRoundRobinSupported() const override;
bool isEngineTypeRemappingToHwSpecificRequired() const override;
bool isSipKernelAsHexadecimalArrayPreferred() const override;

View File

@@ -40,6 +40,11 @@ bool HwHelperHw<GfxFamily>::timestampPacketWriteSupported() const {
return false;
}
// Reports whether command queues may be assigned engines in round-robin order on this
// hardware family. This definition reports no support.
template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isAssignEngineRoundRobinSupported() const {
    constexpr bool roundRobinSupported = false;
    return roundRobinSupported;
}
template <typename GfxFamily>
const EngineInstancesContainer HwHelperHw<GfxFamily>::getGpgpuEngineInstances(const HardwareInfo &hwInfo) const {
return {

View File

@@ -133,6 +133,11 @@ uint32_t HwHelperHw<GfxFamily>::getPlanarYuvMaxHeight() const {
return planarYuvMaxHeight;
}
// Reports whether command queues may be assigned engines in round-robin order on this
// hardware family. This definition reports support.
template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isAssignEngineRoundRobinSupported() const {
    constexpr bool roundRobinSupported = true;
    return roundRobinSupported;
}
template <typename GfxFamily>
aub_stream::MMIOList HwHelperHw<GfxFamily>::getExtraMmioList(const HardwareInfo &hwInfo, const GmmHelper &gmmHelper) const {
aub_stream::MMIOList mmioList;

View File

@@ -12,6 +12,7 @@ struct UltHwConfig {
bool useHwCsr = false;
bool useMockedPrepareDeviceEnvironmentsFunc = true;
bool forceOsAgnosticMemoryManager = true;
bool useRoundRobindEngineAssign = false;
bool csrFailInitDirectSubmission = false;
bool csrBaseCallDirectSubmissionAvailable = false;

View File

@@ -32,7 +32,7 @@ void NEO::BaseUltConfigListener::OnTestEnd(const ::testing::TestInfo &) {
// Ensure that global state is restored
UltHwConfig expectedState{};
static_assert(sizeof(UltHwConfig) == 11 * sizeof(bool), ""); // Ensure that there is no internal padding
static_assert(sizeof(UltHwConfig) == 12 * sizeof(bool), ""); // Ensure that there is no internal padding
EXPECT_EQ(0, memcmp(&expectedState, &ultHwConfig, sizeof(UltHwConfig)));
EXPECT_EQ(0, memcmp(&referencedHwInfo, defaultHwInfo.get(), sizeof(HardwareInfo)));
}