Split transfers of 4MB or more between host and device memory in L0

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2022-09-13 12:43:39 +00:00
committed by Compute-Runtime-Automation
parent 4d00a7ee8c
commit 88fe22fc49
4 changed files with 198 additions and 7 deletions

View File

@ -243,6 +243,8 @@ struct CommandListCoreFamily : CommandListImp {
void updateStreamProperties(Kernel &kernel, bool isCooperative);
void clearCommandsToPatch();
// Returns true when a copy of `size` bytes from srcPtr to dstPtr should go through the
// BCS split path: isBcsSplitNeeded is set, size is at least 4MB, and exactly one of the
// two allocations resides in a system memory pool.
bool isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size);
void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes,
const void **pRanges);

View File

@ -2385,6 +2385,21 @@ void CommandListCoreFamily<gfxCoreFamily>::clearCommandsToPatch() {
commandsToPatch.clear();
}
// Decides whether a memory copy should be dispatched through the BCS split path.
//
// dstPtr / srcPtr - destination and source pointers of the copy.
// size            - number of bytes to copy.
//
// Returns true only when all of the following hold:
//   * BCS split is enabled for this command list (isBcsSplitNeeded),
//   * the transfer is at least 4MB,
//   * both pointers resolve to an allocation,
//   * exactly one of the two allocations lives in a system memory pool
//     (i.e. the copy crosses the system/non-system memory boundary).
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size) {
    constexpr size_t minimalSizeForBcsSplit = 4 * MemoryConstants::megaByte;

    // Evaluate the cheap conditions first so that small copies and command lists
    // without BCS split never pay for the two allocation lookups below.
    if (!this->isBcsSplitNeeded || size < minimalSizeForBcsSplit) {
        return false;
    }

    auto dstAllocationStruct = getAlignedAllocation(this->device, dstPtr, size, false);
    auto srcAllocationStruct = getAlignedAllocation(this->device, srcPtr, size, true);

    // Without a backing allocation the memory pool cannot be queried; fall back to
    // the regular (non-split) path instead of dereferencing a null pointer.
    if (dstAllocationStruct.alloc == nullptr || srcAllocationStruct.alloc == nullptr) {
        return false;
    }

    const bool dstInSystemMemory = NEO::MemoryPoolHelper::isSystemMemoryPool(dstAllocationStruct.alloc->getMemoryPool());
    const bool srcInSystemMemory = NEO::MemoryPoolHelper::isSystemMemoryPool(srcAllocationStruct.alloc->getMemoryPool());

    // Split only when exactly one side is in system memory.
    return dstInSystemMemory != srcInSystemMemory;
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]) {
NEO::EncodeIndirectParams<GfxFamily>::setGlobalWorkSizeIndirect(commandContainer, offsets, crossThreadAddress, lws);

View File

@ -219,7 +219,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
ze_result_t ret;
if (this->isBcsSplitNeeded) {
if (this->isAppendSplitNeeded(dstptr, srcptr, size)) {
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents);
});

View File

@ -264,6 +264,158 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopySetZeroWhenCreateImmediate
EXPECT_EQ(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs.size(), 0u);
}
// With SplitBcsCopy enabled, a copy of one byte less than 4MB between a device
// allocation (source) and a host allocation (destination) must not be submitted
// to any of the BCS split queues: their task counts stay at zero.
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyWithSizeLessThanFourMBThenDoNotSplit, IsXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);

    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto hwInfo = *NEO::defaultHwInfo;
    hwInfo.featureTable.ftrBcsInfo = 0b111111111;
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
    auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));

    ze_command_queue_desc_t desc = {};
    desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));

    std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
                                                                               testL0Device.get(),
                                                                               &desc,
                                                                               false,
                                                                               NEO::EngineGroupType::Copy,
                                                                               returnValue));
    ASSERT_NE(nullptr, commandList0);

    auto &splitQueues = static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs;
    // Fatal assertion: the checks below are meaningless without the split queues.
    ASSERT_EQ(splitQueues.size(), 4u);

    const auto expectNoSplitSubmissions = [&splitQueues]() {
        for (auto *splitQueue : splitQueues) {
            EXPECT_EQ(static_cast<CommandQueueImp *>(splitQueue)->getTaskCount(), 0u);
        }
    };
    expectNoSplitSubmissions();

    constexpr size_t alignment = 4096u;
    constexpr size_t size = 4 * MemoryConstants::megaByte - 1;

    void *srcPtr = nullptr;
    void *dstPtr = nullptr;
    // NOTE(review): memory is allocated through the fixture's `device`, not
    // `testL0Device` - kept as in the original, confirm this is intended.
    ze_device_mem_alloc_desc_t deviceDesc = {};
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocDeviceMem(device->toHandle(),
                                                         &deviceDesc,
                                                         size, alignment, &srcPtr));
    ze_host_mem_alloc_desc_t hostDesc = {};
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, size, alignment, &dstPtr));

    auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    expectNoSplitSubmissions();

    context->freeMem(srcPtr);
    context->freeMem(dstPtr);
}
// With SplitBcsCopy enabled, an 8MB copy between two host allocations must not be
// submitted to any of the BCS split queues: their task counts stay at zero.
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyFromHostToHostThenDoNotSplit, IsXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);

    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto hwInfo = *NEO::defaultHwInfo;
    hwInfo.featureTable.ftrBcsInfo = 0b111111111;
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
    auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));

    ze_command_queue_desc_t desc = {};
    desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));

    std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
                                                                               testL0Device.get(),
                                                                               &desc,
                                                                               false,
                                                                               NEO::EngineGroupType::Copy,
                                                                               returnValue));
    ASSERT_NE(nullptr, commandList0);

    auto &splitQueues = static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs;
    // Fatal assertion: the checks below are meaningless without the split queues.
    ASSERT_EQ(splitQueues.size(), 4u);

    const auto expectNoSplitSubmissions = [&splitQueues]() {
        for (auto *splitQueue : splitQueues) {
            EXPECT_EQ(static_cast<CommandQueueImp *>(splitQueue)->getTaskCount(), 0u);
        }
    };
    expectNoSplitSubmissions();

    constexpr size_t alignment = 4096u;
    constexpr size_t size = 8 * MemoryConstants::megaByte;

    void *srcPtr = nullptr;
    void *dstPtr = nullptr;
    // Both ends of the copy are host allocations.
    ze_host_mem_alloc_desc_t hostDesc = {};
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc,
                                                       size, alignment, &srcPtr));
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, size, alignment, &dstPtr));

    auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    expectNoSplitSubmissions();

    context->freeMem(srcPtr);
    context->freeMem(dstPtr);
}
// With SplitBcsCopy enabled, an 8MB copy between two device allocations must not be
// submitted to any of the BCS split queues: their task counts stay at zero.
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyFromDeviceToDeviceThenDoNotSplit, IsXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);

    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto hwInfo = *NEO::defaultHwInfo;
    hwInfo.featureTable.ftrBcsInfo = 0b111111111;
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
    auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));

    ze_command_queue_desc_t desc = {};
    desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));

    std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
                                                                               testL0Device.get(),
                                                                               &desc,
                                                                               false,
                                                                               NEO::EngineGroupType::Copy,
                                                                               returnValue));
    ASSERT_NE(nullptr, commandList0);

    auto &splitQueues = static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs;
    // Fatal assertion: the checks below are meaningless without the split queues.
    ASSERT_EQ(splitQueues.size(), 4u);

    const auto expectNoSplitSubmissions = [&splitQueues]() {
        for (auto *splitQueue : splitQueues) {
            EXPECT_EQ(static_cast<CommandQueueImp *>(splitQueue)->getTaskCount(), 0u);
        }
    };
    expectNoSplitSubmissions();

    constexpr size_t alignment = 4096u;
    constexpr size_t size = 8 * MemoryConstants::megaByte;

    void *srcPtr = nullptr;
    void *dstPtr = nullptr;
    // Both ends of the copy are device allocations.
    // NOTE(review): memory is allocated through the fixture's `device`, not
    // `testL0Device` - kept as in the original, confirm this is intended.
    ze_device_mem_alloc_desc_t deviceDesc = {};
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocDeviceMem(device->toHandle(),
                                                         &deviceDesc,
                                                         size, alignment, &srcPtr));
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocDeviceMem(device->toHandle(),
                                                         &deviceDesc,
                                                         size, alignment, &dstPtr));

    auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    expectNoSplitSubmissions();

    context->freeMem(srcPtr);
    context->freeMem(dstPtr);
}
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyThenSuccessIsReturned, IsXeHpcCore) {
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
@ -293,15 +445,26 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u);
void *srcPtr = reinterpret_cast<void *>(0x1234);
void *dstPtr = reinterpret_cast<void *>(0x2345);
constexpr size_t alignment = 4096u;
constexpr size_t size = 8 * MemoryConstants::megaByte;
void *srcPtr;
void *dstPtr;
ze_device_mem_alloc_desc_t deviceDesc = {};
context->allocDeviceMem(device->toHandle(),
&deviceDesc,
size, alignment, &srcPtr);
ze_host_mem_alloc_desc_t hostDesc = {};
context->allocHostMem(&hostDesc, size, alignment, &dstPtr);
auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, 8, nullptr, 0, nullptr);
auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u);
context->freeMem(srcPtr);
context->freeMem(dstPtr);
}
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyWithEventThenSuccessIsReturnedAndMiFlushProgrammed, IsXeHpcCore) {
@ -345,10 +508,18 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
std::unique_ptr<EventPool> eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
std::unique_ptr<Event> event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
void *srcPtr = reinterpret_cast<void *>(0x1234);
void *dstPtr = reinterpret_cast<void *>(0x2345);
constexpr size_t alignment = 4096u;
constexpr size_t size = 8 * MemoryConstants::megaByte;
void *srcPtr;
void *dstPtr;
ze_device_mem_alloc_desc_t deviceDesc = {};
context->allocDeviceMem(device->toHandle(),
&deviceDesc,
size, alignment, &srcPtr);
ze_host_mem_alloc_desc_t hostDesc = {};
context->allocHostMem(&hostDesc, size, alignment, &dstPtr);
auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, 8, event->toHandle(), 0, nullptr);
auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, event->toHandle(), 0, nullptr);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u);
@ -360,6 +531,9 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
auto itor = find<MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
EXPECT_NE(cmdList.end(), itor);
context->freeMem(srcPtr);
context->freeMem(dstPtr);
}
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAllocateNewEventsForSplitThenEventsAreManagedProperly, IsXeHpcCore) {