feature: create copy offload queue under debug flag

Related-To: NEO-11376

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2024-05-17 08:33:08 +00:00
committed by Compute-Runtime-Automation
parent 4c5d567b40
commit cb9977b8f4
12 changed files with 199 additions and 0 deletions

View File

@@ -30,6 +30,9 @@ CommandList::~CommandList() {
if (cmdQImmediate) {
cmdQImmediate->destroy();
}
if (cmdQImmediateCopyOffload) {
cmdQImmediateCopyOffload->destroy();
}
removeDeallocationContainerData();
if (!isImmediateType() || !this->isFlushTaskSubmissionEnabled) {
removeHostPtrAllocations();

View File

@@ -412,6 +412,7 @@ struct CommandList : _ze_command_list_handle_t {
ze_context_handle_t hContext = nullptr;
CommandQueue *cmdQImmediate = nullptr;
CommandQueue *cmdQImmediateCopyOffload = nullptr;
NEO::CommandStreamReceiver *csr = nullptr;
Device *device = nullptr;

View File

@@ -255,12 +255,36 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
pNext = reinterpret_cast<const ze_base_desc_t *>(pNext->pNext);
}
if ((NEO::debugManager.flags.ForceCopyOperationOffloadForComputeCmdList.get() == 1) && !commandList->isCopyOnly() && commandList->isInOrderExecutionEnabled()) {
commandList->enableCopyOperationOffload(productFamily, device, desc);
}
return commandList;
}
return commandList;
}
// Creates an additional immediate command queue on the device's copy engine and
// attaches it to this command list as the copy-offload queue.
//
// productFamily - forwarded unchanged to CommandQueue::create.
// device        - device that supplies the copy-engine ordinal and the CSR.
// desc          - descriptor of the owning command list; only its mode and
//                 priority are mirrored into the offload queue descriptor.
//
// Aborts through UNRECOVERABLE_IF when no CSR is returned for the copy ordinal
// or when the queue cannot be created.
void CommandListImp::enableCopyOperationOffload(uint32_t productFamily, Device *device, const ze_command_queue_desc_t *desc) {
    uint32_t copyOrdinal = static_cast<DeviceImp *>(device)->getCopyEngineOrdinal();

    // Engine index 0 within the copy ordinal, at the same priority as the owning list.
    NEO::CommandStreamReceiver *offloadCsr = nullptr;
    device->getCsrForOrdinalAndIndexWithPriority(&offloadCsr, copyOrdinal, 0, desc->priority);
    UNRECOVERABLE_IF(offloadCsr == nullptr);

    ze_command_queue_desc_t offloadQueueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    offloadQueueDesc.priority = desc->priority;
    offloadQueueDesc.mode = desc->mode;
    offloadQueueDesc.ordinal = copyOrdinal;

    // NOTE(review): the three boolean arguments presumably select copy-only / non-internal /
    // immediate-command-list queue - confirm against the CommandQueue::create declaration.
    ze_result_t createStatus = ZE_RESULT_SUCCESS;
    auto copyOffloadQueue = CommandQueue::create(productFamily, device, offloadCsr, &offloadQueueDesc, true, false, true, createStatus);
    UNRECOVERABLE_IF(copyOffloadQueue == nullptr);

    this->cmdQImmediateCopyOffload = copyOffloadQueue;
    this->copyOperationOffloadEnabled = true;
}
void CommandListImp::setStreamPropertiesDefaultSettings(NEO::StreamProperties &streamProperties) {
if (this->stateComputeModeTracking) {
streamProperties.stateComputeMode.setPropertiesCoherencyDevicePreemption(cmdListDefaultCoherency, this->commandListPreemptionMode, true);

View File

@@ -46,6 +46,7 @@ struct CommandListImp : public CommandList {
virtual void patchInOrderCmds() = 0;
void enableSynchronizedDispatch(NEO::SynchronizedDispatchMode mode);
NEO::SynchronizedDispatchMode getSynchronizedDispatchMode() const { return synchronizedDispatchMode; }
void enableCopyOperationOffload(uint32_t productFamily, Device *device, const ze_command_queue_desc_t *desc);
protected:
std::shared_ptr<NEO::InOrderExecInfo> inOrderExecInfo;
@@ -61,6 +62,7 @@ struct CommandListImp : public CommandList {
static constexpr bool cmdListDefaultMediaSamplerClockGate = false;
static constexpr bool cmdListDefaultGlobalAtomics = false;
std::vector<Event *> mappedTsEventList{};
bool copyOperationOffloadEnabled = false;
};
} // namespace L0

View File

@@ -417,6 +417,20 @@ uint32_t DeviceImp::getCopyQueueGroupsFromSubDevice(uint32_t numberOfSubDeviceCo
return subDeviceQueueGroupsIter;
}
// Returns the ordinal of the copy engine group for this device.
//
// The regular engine groups of the NEO device are scanned in order and the index
// of the first group of type EngineGroupType::copy is returned. When none of them
// is a copy group, the returned value equals the number of regular engine groups;
// that path requires subDeviceCopyEngineGroups to be non-empty and aborts through
// UNRECOVERABLE_IF otherwise.
// NOTE(review): presumably sub-device copy groups are exposed right after the
// regular groups, which is why "one past the last regular group" is a valid ordinal - confirm.
uint32_t DeviceImp::getCopyEngineOrdinal() const {
    auto &regularGroups = neoDevice->getRegularEngineGroups();
    const auto regularGroupCount = static_cast<uint32_t>(regularGroups.size());

    uint32_t ordinal = 0;
    while (ordinal < regularGroupCount) {
        if (regularGroups[ordinal].engineGroupType == NEO::EngineGroupType::copy) {
            return ordinal;
        }
        ++ordinal;
    }

    UNRECOVERABLE_IF(this->subDeviceCopyEngineGroups.size() == 0);

    return regularGroupCount;
}
ze_result_t DeviceImp::getCommandQueueGroupProperties(uint32_t *pCount,
ze_command_queue_group_properties_t *pCommandQueueGroupProperties) {
NEO::Device *activeDevice = getActiveDevice();

View File

@@ -170,6 +170,7 @@ struct DeviceImp : public Device, NEO::NonCopyableOrMovableClass {
uint32_t getEventMaxKernelCount() const override;
uint32_t queryDeviceNodeMask();
NEO::EngineGroupType getInternalEngineGroupType();
uint32_t getCopyEngineOrdinal() const;
protected:
void adjustCommandQueueDesc(uint32_t &ordinal, uint32_t &index);

View File

@@ -50,6 +50,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass::cmdListHeapAddressModel;
using BaseClass::cmdListType;
using BaseClass::cmdQImmediate;
using BaseClass::cmdQImmediateCopyOffload;
using BaseClass::commandContainer;
using BaseClass::commandListPerThreadScratchSize;
using BaseClass::commandListPreemptionMode;
@@ -58,6 +59,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass::compactL3FlushEventPacket;
using BaseClass::containsAnyKernel;
using BaseClass::containsCooperativeKernelsFlag;
using BaseClass::copyOperationOffloadEnabled;
using BaseClass::csr;
using BaseClass::currentBindingTablePoolBaseAddress;
using BaseClass::currentDynamicStateBaseAddress;
@@ -182,10 +184,12 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::cmdListHeapAddressModel;
using BaseClass::cmdListType;
using BaseClass::cmdQImmediate;
using BaseClass::cmdQImmediateCopyOffload;
using BaseClass::commandContainer;
using BaseClass::commandsToPatch;
using BaseClass::compactL3FlushEvent;
using BaseClass::compactL3FlushEventPacket;
using BaseClass::copyOperationOffloadEnabled;
using BaseClass::csr;
using BaseClass::dcFlushSupport;
using BaseClass::device;
@@ -263,9 +267,11 @@ struct WhiteBox<::L0::CommandListImp> : public ::L0::CommandListImp {
using BaseClass::cmdListHeapAddressModel;
using BaseClass::cmdListType;
using BaseClass::cmdQImmediate;
using BaseClass::cmdQImmediateCopyOffload;
using BaseClass::commandContainer;
using BaseClass::commandListPreemptionMode;
using BaseClass::commandsToPatch;
using BaseClass::copyOperationOffloadEnabled;
using BaseClass::copyThroughLockedPtrEnabled;
using BaseClass::csr;
using BaseClass::currentBindingTablePoolBaseAddress;

View File

@@ -28,6 +28,7 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp {
using BaseClass::cmdListWithAssertExecuted;
using BaseClass::commandStream;
using BaseClass::csr;
using BaseClass::desc;
using BaseClass::device;
using BaseClass::preemptionCmdSyncProgramming;
using BaseClass::printfKernelContainer;

View File

@@ -6810,5 +6810,102 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppend
EXPECT_TRUE(verifyTokenCleanup());
}
// Fixture for copy-offload tests: forces blitter support and three copy engines
// in the default hardware info before the base in-order fixture is set up.
struct CopyOffloadInOrderTests : public InOrderCmdListTests {
    // Snapshot of the default hardware info taken before SetUp() modifies it.
    std::unique_ptr<VariableBackup<NEO::HardwareInfo>> backupHwInfo;

    void SetUp() override {
        auto *hwInfo = defaultHwInfo.get();

        // The backup must be taken before any field is changed.
        backupHwInfo = std::make_unique<VariableBackup<NEO::HardwareInfo>>(hwInfo);

        hwInfo->featureTable.ftrBcsInfo = 0b111;
        hwInfo->capabilityTable.blitterOperationsSupported = true;

        // The base fixture runs last so it sees the modified hardware info.
        InOrderCmdListTests::SetUp();
    }
};
// Verifies when an immediate command list gets a copy-offload queue:
//  - debug flag set, in-order compute list          -> offload queue created on a BCS engine,
//                                                      mirroring the requested mode and priority
//  - debug flag set, list without the in-order flag -> no offload queue
//  - debug flag set, list on the copy ordinal       -> no offload queue
//  - debug flag at its default (-1)                 -> no offload queue
//
// Preconditions that guard a dereference (successful creation, non-null offload queue)
// use ASSERT_* so a failure stops the test instead of touching an invalid pointer.
HWTEST2_F(CopyOffloadInOrderTests, givenDebugFlagSetWhenCreatingCmdListThenEnableCopyOffload, IsAtLeastXeHpCore) {
    NEO::debugManager.flags.ForceCopyOperationOffloadForComputeCmdList.set(1);

    ze_command_list_handle_t cmdListHandle = nullptr;

    ze_command_queue_desc_t cmdQueueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    cmdQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    cmdQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
    cmdQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;

    // In-order compute list, normal priority, asynchronous mode: offload enabled.
    {
        ASSERT_EQ(ZE_RESULT_SUCCESS, zeCommandListCreateImmediate(context, device, &cmdQueueDesc, &cmdListHandle));
        auto cmdList = static_cast<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>> *>(CommandList::fromHandle(cmdListHandle));
        EXPECT_TRUE(cmdList->copyOperationOffloadEnabled);
        ASSERT_NE(nullptr, cmdList->cmdQImmediateCopyOffload);

        auto queue = static_cast<WhiteBox<L0::CommandQueue> *>(cmdList->cmdQImmediateCopyOffload);
        EXPECT_EQ(cmdQueueDesc.priority, queue->desc.priority);
        EXPECT_EQ(cmdQueueDesc.mode, queue->desc.mode);
        EXPECT_TRUE(queue->peekIsCopyOnlyCommandQueue());
        EXPECT_TRUE(NEO::EngineHelpers::isBcs(queue->getCsr()->getOsContext().getEngineType()));

        zeCommandListDestroy(cmdListHandle);
    }

    // High priority, synchronous mode: both values are propagated to the offload queue.
    {
        cmdQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH;
        cmdQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;

        ASSERT_EQ(ZE_RESULT_SUCCESS, zeCommandListCreateImmediate(context, device, &cmdQueueDesc, &cmdListHandle));
        auto cmdList = static_cast<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>> *>(CommandList::fromHandle(cmdListHandle));
        EXPECT_TRUE(cmdList->copyOperationOffloadEnabled);
        ASSERT_NE(nullptr, cmdList->cmdQImmediateCopyOffload);

        auto queue = static_cast<WhiteBox<L0::CommandQueue> *>(cmdList->cmdQImmediateCopyOffload);
        EXPECT_EQ(cmdQueueDesc.priority, queue->desc.priority);
        EXPECT_EQ(cmdQueueDesc.mode, queue->desc.mode);
        EXPECT_TRUE(queue->peekIsCopyOnlyCommandQueue());
        EXPECT_TRUE(NEO::EngineHelpers::isBcs(queue->getCsr()->getOsContext().getEngineType()));

        zeCommandListDestroy(cmdListHandle);

        // Restore the descriptor for the following scenarios.
        cmdQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
        cmdQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
    }

    // In-order flag not requested: offload stays disabled.
    {
        cmdQueueDesc.flags = 0;

        ASSERT_EQ(ZE_RESULT_SUCCESS, zeCommandListCreateImmediate(context, device, &cmdQueueDesc, &cmdListHandle));
        auto cmdList = static_cast<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>> *>(CommandList::fromHandle(cmdListHandle));
        EXPECT_FALSE(cmdList->copyOperationOffloadEnabled);
        EXPECT_EQ(nullptr, cmdList->cmdQImmediateCopyOffload);

        zeCommandListDestroy(cmdListHandle);

        cmdQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER;
    }

    // List created on the copy-engine ordinal: offload stays disabled.
    {
        cmdQueueDesc.ordinal = static_cast<DeviceImp *>(device)->getCopyEngineOrdinal();

        ASSERT_EQ(ZE_RESULT_SUCCESS, zeCommandListCreateImmediate(context, device, &cmdQueueDesc, &cmdListHandle));
        auto cmdList = static_cast<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>> *>(CommandList::fromHandle(cmdListHandle));
        EXPECT_FALSE(cmdList->copyOperationOffloadEnabled);
        EXPECT_EQ(nullptr, cmdList->cmdQImmediateCopyOffload);

        zeCommandListDestroy(cmdListHandle);

        cmdQueueDesc.ordinal = 0;
    }

    // Debug flag back at its default value: offload stays disabled.
    {
        NEO::debugManager.flags.ForceCopyOperationOffloadForComputeCmdList.set(-1);

        ASSERT_EQ(ZE_RESULT_SUCCESS, zeCommandListCreateImmediate(context, device, &cmdQueueDesc, &cmdListHandle));
        auto cmdList = static_cast<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>> *>(CommandList::fromHandle(cmdListHandle));
        EXPECT_FALSE(cmdList->copyOperationOffloadEnabled);
        EXPECT_EQ(nullptr, cmdList->cmdQImmediateCopyOffload);

        zeCommandListDestroy(cmdListHandle);
    }
}
} // namespace ult
} // namespace L0

View File

@@ -175,6 +175,54 @@ TEST(L0DeviceTest, givenMultipleMaskedSubDevicesWhenCreatingL0DeviceThenDontAddD
EXPECT_EQ(0b100u, deviceImp->subDevices[1]->getNEODevice()->getDeviceBitfield().to_ulong());
}
using DeviceCopyEngineTests = ::testing::Test;
// Checks DeviceImp::getCopyEngineOrdinal() on a two-tile device with implicit scaling:
//  - on the root device the ordinal equals the number of regular engine groups
//  - on each sub-device the ordinal equals the index of that sub-device's copy engine group
HWTEST2_F(DeviceCopyEngineTests, givenRootOrSubDeviceWhenAskingForCopyOrdinalThenReturnCorrectId, IsAtLeastXeHpCore) {
    DebugManagerStateRestore restorer;
    debugManager.flags.CreateMultipleSubDevices.set(2);
    debugManager.flags.EnableImplicitScaling.set(1);

    // Local copy of the hardware info with blitter support and three copy engines enabled.
    auto hwInfo = *defaultHwInfo;
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    hwInfo.featureTable.ftrBcsInfo = 0b111;

    auto executionEnvironment = std::make_unique<NEO::ExecutionEnvironment>();
    executionEnvironment->prepareRootDeviceEnvironments(1);
    executionEnvironment->rootDeviceEnvironments[0]->setHwInfoAndInitHelpers(&hwInfo);
    executionEnvironment->rootDeviceEnvironments[0]->initGmm();
    executionEnvironment->parseAffinityMask();

    auto deviceFactory = std::make_unique<NEO::UltDeviceFactory>(1, 2, *executionEnvironment.release());
    auto rootDevice = deviceFactory->rootDevices[0];
    // Fatal check: rootDevice is dereferenced on the next line.
    ASSERT_NE(nullptr, rootDevice);
    EXPECT_EQ(2u, rootDevice->getNumSubDevices());

    auto driverHandle = std::make_unique<DriverHandleImp>();
    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto device = std::unique_ptr<L0::Device>(Device::create(driverHandle.get(), rootDevice, false, &returnValue));
    ASSERT_NE(nullptr, device);

    auto deviceImp = static_cast<DeviceImp *>(device.get());
    ASSERT_EQ(2u, deviceImp->numSubDevices);

    // Root device: expected ordinal is one past the last regular engine group.
    EXPECT_EQ(static_cast<uint32_t>(device->getNEODevice()->getRegularEngineGroups().size()), deviceImp->getCopyEngineOrdinal());

    // Sub-devices: find the copy group index by hand and compare it with the reported ordinal.
    auto &subDeviceEngines = deviceImp->subDevices[0]->getNEODevice()->getRegularEngineGroups();
    uint32_t subDeviceCopyEngineId = std::numeric_limits<uint32_t>::max();
    for (uint32_t i = 0; i < subDeviceEngines.size(); i++) {
        if (subDeviceEngines[i].engineGroupType == EngineGroupType::copy) {
            subDeviceCopyEngineId = i;
            break;
        }
    }

    EXPECT_NE(std::numeric_limits<uint32_t>::max(), subDeviceCopyEngineId);
    EXPECT_EQ(subDeviceCopyEngineId, static_cast<DeviceImp *>(deviceImp->subDevices[0])->getCopyEngineOrdinal());
    EXPECT_EQ(subDeviceCopyEngineId, static_cast<DeviceImp *>(deviceImp->subDevices[1])->getCopyEngineOrdinal());
}
TEST(L0DeviceTest, givenMidThreadPreemptionWhenCreatingDeviceThenSipKernelIsInitialized) {
ze_result_t returnValue = ZE_RESULT_SUCCESS;

View File

@@ -255,6 +255,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionNumber, -1, "Call exit(0) on X s
DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionMode, 0, "Exit on X submission mode. 0: Any context type, 1: Compute context only, 2: Copy context only ")
DECLARE_DEBUG_VARIABLE(int32_t, ForceInOrderImmediateCmdListExecution, -1, "-1: default, 0: disabled, 1: all Immediate Command Lists are switched to in-order execution")
DECLARE_DEBUG_VARIABLE(int32_t, ForceInOrderEvents, -1, "-1: default, 0: disabled, 1: Enable all Events as in-order, to rely on command list counter value")
DECLARE_DEBUG_VARIABLE(int32_t, ForceCopyOperationOffloadForComputeCmdList, -1, "-1: default, 0: disabled, 1: Enabled. If set, all immediate compute in-order cmdlist will try to offload copy operations to copy engine")
DECLARE_DEBUG_VARIABLE(int32_t, EnableImplicitConvertionToCounterBasedEvents, -1, "-1: default, 0: Disable, 1: Enable. If enabled, try to convert Regular Events used on Immediate CL to CounterBased")
DECLARE_DEBUG_VARIABLE(int32_t, ForceTlbFlush, -1, "-1: default, 0: Tlb flush disabled, 1: Tlb Flush enabled")
DECLARE_DEBUG_VARIABLE(int32_t, AllowDcFlush, -1, "-1: default, 0: DC flush disabled, 1: DC flush enabled")

View File

@@ -602,4 +602,5 @@ DirectSubmissionControllerAdjustOnThrottleAndAcLineStatus = -1
ReadOnlyAllocationsTypeMask = 0
EnableLogLevel = 6
EnableReusingGpuTimestamps = 0
ForceCopyOperationOffloadForComputeCmdList = -1
# Please don't edit below this line