refactor: Add flag to control BCS split for pageable memory
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
parent
dc0796c2a1
commit
83bd33befc
|
@ -2838,6 +2838,15 @@ inline size_t CommandListCoreFamily<gfxCoreFamily>::getTotalSizeForCopyRegion(co
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Picks the memory pool that the BCS-split decision should use for one side of a copy.
//
// allocFound - whether an SVM allocation record was located for the pointer.
// allocData  - the located record; dereferenced only when allocFound is true.
//
// Returns the pool of the record's default graphics allocation when a record exists.
// Without a record the result depends on the SplitBcsCopyHostptr debug flag: any
// non-zero value (including the -1 default) yields System4KBPages, while 0 yields
// MemoryNull.
inline NEO::MemoryPool getMemoryPoolFromAllocDataForSplit(bool allocFound, const NEO::SvmAllocationData *allocData) {
    if (!allocFound) {
        // The flag is consulted only for pointers that have no allocation record.
        const bool hostptrSplitEnabled = NEO::DebugManager.flags.SplitBcsCopyHostptr.get() != 0;
        return hostptrSplitEnabled ? NEO::MemoryPool::System4KBPages : NEO::MemoryPool::MemoryNull;
    }

    const auto *defaultAllocation = allocData->gpuAllocations.getDefaultGraphicsAllocation();
    return defaultAllocation->getMemoryPool();
}
|
||||||
|
|
||||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||||
bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size, NEO::TransferDirection &directionOut) {
|
bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size, NEO::TransferDirection &directionOut) {
|
||||||
if (size < minimalSizeForBcsSplit) {
|
if (size < minimalSizeForBcsSplit) {
|
||||||
|
@ -2849,8 +2858,14 @@ bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(void *dstPtr, con
|
||||||
bool srcAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcPtr), size, &srcAllocData);
|
bool srcAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcPtr), size, &srcAllocData);
|
||||||
bool dstAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(dstPtr, size, &dstAllocData);
|
bool dstAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(dstPtr, size, &dstAllocData);
|
||||||
|
|
||||||
auto srcMemoryPool = srcAllocFound ? srcAllocData->gpuAllocations.getDefaultGraphicsAllocation()->getMemoryPool() : NEO::MemoryPool::System4KBPages;
|
auto srcMemoryPool = getMemoryPoolFromAllocDataForSplit(srcAllocFound, srcAllocData);
|
||||||
auto dstMemoryPool = dstAllocFound ? dstAllocData->gpuAllocations.getDefaultGraphicsAllocation()->getMemoryPool() : NEO::MemoryPool::System4KBPages;
|
auto dstMemoryPool = getMemoryPoolFromAllocDataForSplit(dstAllocFound, dstAllocData);
|
||||||
|
for (const auto memoryPool : {srcMemoryPool, dstMemoryPool}) {
|
||||||
|
if (memoryPool == NEO::MemoryPool::MemoryNull) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return this->isAppendSplitNeeded(dstMemoryPool, srcMemoryPool, size, directionOut);
|
return this->isAppendSplitNeeded(dstMemoryPool, srcMemoryPool, size, directionOut);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -437,6 +437,98 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
|
||||||
context->freeMem(dstPtr);
|
context->freeMem(dstPtr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that, with SplitBcsCopyHostptr set to 0, appending a copy whose source is an
// address that was not allocated through the context leaves the task count of every
// BCS split queue at 0.
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyHostptrDisabledAndImmediateCommandListWhenAppendingMemoryCopyFromNonUsmHostToHostThenDoNotSplit, IsXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);
    DebugManager.flags.SplitBcsCopyHostptr.set(0);
    DebugManager.flags.EnableFlushTaskSubmission.set(0);

    ze_result_t returnValue;
    auto hwInfo = *NEO::defaultHwInfo;
    hwInfo.featureTable.ftrBcsInfo = 0b111111111;
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
    auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));

    ze_command_queue_desc_t desc = {};
    desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));

    std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
                                                                               testL0Device.get(),
                                                                               &desc,
                                                                               false,
                                                                               NEO::EngineGroupType::Copy,
                                                                               returnValue));
    ASSERT_NE(nullptr, commandList0);

    // Iterate the container instead of indexing [0..3] so a size mismatch is reported by
    // the expectation below rather than turning into an out-of-range access.
    const auto &splitCmdQs = static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs;
    EXPECT_EQ(splitCmdQs.size(), 4u);
    for (const auto &splitCmdQ : splitCmdQs) {
        EXPECT_EQ(static_cast<CommandQueueImp *>(splitCmdQ)->getTaskCount(), 0u);
    }

    constexpr size_t alignment = 4096u;
    constexpr size_t size = 8 * MemoryConstants::megaByte;
    // Source is an arbitrary address with no allocation behind it.
    void *srcPtr = reinterpret_cast<void *>(0x1234);
    void *dstPtr = nullptr;
    ze_host_mem_alloc_desc_t hostDesc = {};
    // Stop here if the allocation fails; otherwise dstPtr would be used without ever being set.
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, size, alignment, &dstPtr));

    auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr, false, false);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    // No split queue may have been used for the copy.
    for (const auto &splitCmdQ : splitCmdQs) {
        EXPECT_EQ(static_cast<CommandQueueImp *>(splitCmdQ)->getTaskCount(), 0u);
    }

    context->freeMem(dstPtr);
}
|
||||||
|
|
||||||
|
// Checks that, with SplitBcsCopyHostptr set to 0, appending a copy whose destination is
// an address that was not allocated through the context leaves the task count of every
// BCS split queue at 0.
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyHostptrDisabledAndImmediateCommandListWhenAppendingMemoryCopyFromHostToNonUsmHostThenDoNotSplit, IsXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);
    DebugManager.flags.SplitBcsCopyHostptr.set(0);
    DebugManager.flags.EnableFlushTaskSubmission.set(0);

    ze_result_t returnValue;
    auto hwInfo = *NEO::defaultHwInfo;
    hwInfo.featureTable.ftrBcsInfo = 0b111111111;
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
    auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));

    ze_command_queue_desc_t desc = {};
    desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));

    std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
                                                                               testL0Device.get(),
                                                                               &desc,
                                                                               false,
                                                                               NEO::EngineGroupType::Copy,
                                                                               returnValue));
    ASSERT_NE(nullptr, commandList0);

    // Iterate the container instead of indexing [0..3] so a size mismatch is reported by
    // the expectation below rather than turning into an out-of-range access.
    const auto &splitCmdQs = static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs;
    EXPECT_EQ(splitCmdQs.size(), 4u);
    for (const auto &splitCmdQ : splitCmdQs) {
        EXPECT_EQ(static_cast<CommandQueueImp *>(splitCmdQ)->getTaskCount(), 0u);
    }

    constexpr size_t alignment = 4096u;
    constexpr size_t size = 8 * MemoryConstants::megaByte;
    void *srcPtr = nullptr;
    // Destination is an arbitrary address with no allocation behind it.
    void *dstPtr = reinterpret_cast<void *>(0x1234);
    ze_host_mem_alloc_desc_t hostDesc = {};
    // Stop here if the allocation fails; otherwise srcPtr would be used without ever being set.
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, size, alignment, &srcPtr));

    auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr, false, false);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    // No split queue may have been used for the copy.
    for (const auto &splitCmdQ : splitCmdQs) {
        EXPECT_EQ(static_cast<CommandQueueImp *>(splitCmdQ)->getTaskCount(), 0u);
    }

    context->freeMem(srcPtr);
}
|
||||||
|
|
||||||
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyFromNonUsmHostToHostThenDoSplit, IsXeHpcCore) {
|
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyFromNonUsmHostToHostThenDoSplit, IsXeHpcCore) {
|
||||||
DebugManagerStateRestore restorer;
|
DebugManagerStateRestore restorer;
|
||||||
DebugManager.flags.SplitBcsCopy.set(1);
|
DebugManager.flags.SplitBcsCopy.set(1);
|
||||||
|
|
|
@ -328,6 +328,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQGpgpuInitialization, -1, "-1: default,
|
||||||
DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQBcsInitialization, -1, "-1: default, 0:disabled, 1: enabled.")
|
DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQBcsInitialization, -1, "-1: default, 0:disabled, 1: enabled.")
|
||||||
DECLARE_DEBUG_VARIABLE(int32_t, PreferInternalBcsEngine, -1, "-1: default, 0:disabled, 1: enabled. When enabled use internal BCS engine for internal transfers, when disabled use regular engine")
|
DECLARE_DEBUG_VARIABLE(int32_t, PreferInternalBcsEngine, -1, "-1: default, 0:disabled, 1: enabled. When enabled use internal BCS engine for internal transfers, when disabled use regular engine")
|
||||||
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsCopy, -1, "-1: default, 0:disabled, 1: enabled. When enqueues copy to main copy engine then split between even linked copy engines")
|
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsCopy, -1, "-1: default, 0:disabled, 1: enabled. When enqueues copy to main copy engine then split between even linked copy engines")
|
||||||
|
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsCopyHostptr, -1, "-1: default, 0:disabled, 1: enabled. Enable split for hostptr allocations")
|
||||||
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsSize, -1, "-1: default, >=0: Size to apply BCS split from")
|
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsSize, -1, "-1: default, >=0: Size to apply BCS split from")
|
||||||
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMask, 0, "0: default, >0: bitmask: indicates bcs engines for split")
|
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMask, 0, "0: default, >0: bitmask: indicates bcs engines for split")
|
||||||
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskH2D, 0, "0: default, >0: bitmask: indicates bcs engines for H2D split")
|
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskH2D, 0, "0: default, >0: bitmask: indicates bcs engines for H2D split")
|
||||||
|
|
|
@ -420,6 +420,7 @@ AssignBCSAtEnqueue = -1
|
||||||
DeferCmdQGpgpuInitialization = -1
|
DeferCmdQGpgpuInitialization = -1
|
||||||
DeferCmdQBcsInitialization = -1
|
DeferCmdQBcsInitialization = -1
|
||||||
SplitBcsCopy = -1
|
SplitBcsCopy = -1
|
||||||
|
SplitBcsCopyHostptr = -1
|
||||||
SplitBcsSize = -1
|
SplitBcsSize = -1
|
||||||
SplitBcsMask = 0
|
SplitBcsMask = 0
|
||||||
SplitBcsMaskH2D = 0
|
SplitBcsMaskH2D = 0
|
||||||
|
|
Loading…
Reference in New Issue