Add bcs split implementation for memory copy region

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2022-09-14 14:47:54 +00:00
committed by Compute-Runtime-Automation
parent 6dacab1c02
commit 3d4b4b5746
5 changed files with 91 additions and 22 deletions

View File

@@ -243,6 +243,7 @@ struct CommandListCoreFamily : CommandListImp {
void updateStreamProperties(Kernel &kernel, bool isCooperative);
void clearCommandsToPatch();
// Returns width * height (* depth when depth > 1) of the region plus the offset of its
// origin: originX + originY * pitch (+ originZ * slicePitch when depth > 1).
size_t getTotalSizeForCopyRegion(const ze_copy_region_t *region, uint32_t pitch, uint32_t slicePitch);
// Queried by the immediate command list to choose between bcsSplit.appendSplitCall and the
// regular append path for a copy of the given size.
// NOTE(review): the definition is only partially visible from here (it declares a 4 MB
// minimalSizeForBcsSplit constant); confirm the exact criteria against the implementation.
bool isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size);
void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes,

View File

@@ -1269,20 +1269,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(void *d
callId = neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount;
}
size_t dstSize = 0;
size_t srcSize = 0;
if (srcRegion->depth > 1) {
uint32_t hostPtrDstOffset = dstRegion->originX + ((dstRegion->originY) * dstPitch) + ((dstRegion->originZ) * dstSlicePitch);
uint32_t hostPtrSrcOffset = srcRegion->originX + ((srcRegion->originY) * srcPitch) + ((srcRegion->originZ) * srcSlicePitch);
dstSize = (dstRegion->width * dstRegion->height * dstRegion->depth) + hostPtrDstOffset;
srcSize = (srcRegion->width * srcRegion->height * srcRegion->depth) + hostPtrSrcOffset;
} else {
uint32_t hostPtrDstOffset = dstRegion->originX + ((dstRegion->originY) * dstPitch);
uint32_t hostPtrSrcOffset = srcRegion->originX + ((srcRegion->originY) * srcPitch);
dstSize = (dstRegion->width * dstRegion->height) + hostPtrDstOffset;
srcSize = (srcRegion->width * srcRegion->height) + hostPtrSrcOffset;
}
size_t dstSize = this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch);
size_t srcSize = this->getTotalSizeForCopyRegion(srcRegion, srcPitch, srcSlicePitch);
auto dstAllocationStruct = getAlignedAllocation(this->device, dstPtr, dstSize, false);
auto srcAllocationStruct = getAlignedAllocation(this->device, srcPtr, srcSize, true);
@@ -2386,6 +2374,17 @@ void CommandListCoreFamily<gfxCoreFamily>::clearCommandsToPatch() {
commandsToPatch.clear();
}
// Computes the size associated with a copy region: the element count of the region
// (width * height, additionally * depth when depth > 1) plus the offset of the region
// origin (originX + originY * pitch, additionally + originZ * slicePitch when depth > 1).
//
// region     - region description; must not be null (it is dereferenced unconditionally).
// pitch      - row pitch applied to originY.
// slicePitch - slice pitch applied to originZ; only used when region->depth > 1.
//
// Returns the sum of the region extent and the origin offset as size_t.
template <GFXCORE_FAMILY gfxCoreFamily>
inline size_t CommandListCoreFamily<gfxCoreFamily>::getTotalSizeForCopyRegion(const ze_copy_region_t *region, uint32_t pitch, uint32_t slicePitch) {
    // Widen to size_t before multiplying: the region fields and pitches are 32-bit, so
    // products such as width * height * depth would otherwise wrap at 4 GiB before the
    // result is converted to the size_t return type.
    size_t originOffset = static_cast<size_t>(region->originX) + static_cast<size_t>(region->originY) * pitch;
    size_t extent = static_cast<size_t>(region->width) * region->height;

    // The Z dimension only contributes for true 3D regions; depth of 0 or 1 is treated as 2D.
    if (region->depth > 1) {
        originOffset += static_cast<size_t>(region->originZ) * slicePitch;
        extent *= region->depth;
    }

    return extent + originOffset;
}
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size) {
constexpr size_t minimalSizeForBcsSplit = 4 * MemoryConstants::megaByte;

View File

@@ -220,7 +220,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
ze_result_t ret;
if (this->isAppendSplitNeeded(dstptr, srcptr, size)) {
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents);
});
} else {
@@ -248,9 +248,28 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
if (this->isFlushTaskSubmissionEnabled) {
checkAvailableSpace();
}
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, dstRegion, dstPitch, dstSlicePitch,
srcPtr, srcRegion, srcPitch, srcSlicePitch,
hSignalEvent, numWaitEvents, phWaitEvents);
ze_result_t ret;
if (this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch))) {
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
ze_copy_region_t dstRegionLocal = {};
ze_copy_region_t srcRegionLocal = {};
memcpy(&dstRegionLocal, dstRegion, sizeof(ze_copy_region_t));
memcpy(&srcRegionLocal, srcRegion, sizeof(ze_copy_region_t));
dstRegionLocal.originX = dstOriginXParam;
dstRegionLocal.width = static_cast<uint32_t>(sizeParam);
srcRegionLocal.originX = srcOriginXParam;
srcRegionLocal.width = static_cast<uint32_t>(sizeParam);
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, &dstRegionLocal, dstPitch, dstSlicePitch,
srcPtr, &srcRegionLocal, srcPitch, srcSlicePitch,
hSignalEventParam, numWaitEvents, phWaitEvents);
});
} else {
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, dstRegion, dstPitch, dstSlicePitch,
srcPtr, srcRegion, srcPitch, srcSlicePitch,
hSignalEvent, numWaitEvents, phWaitEvents);
}
return flushImmediate(ret, true);
}

View File

@@ -47,13 +47,13 @@ struct BcsSplit {
std::vector<CommandQueue *> cmdQs;
NEO::BcsInfoMask engines = NEO::EngineHelpers::oddLinkedCopyEnginesMask;
template <GFXCORE_FAMILY gfxCoreFamily>
template <GFXCORE_FAMILY gfxCoreFamily, typename T, typename K>
ze_result_t appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,
void *dstptr,
const void *srcptr,
T dstptr,
K srcptr,
size_t size,
ze_event_handle_t hSignalEvent,
std::function<ze_result_t(void *, const void *, size_t, ze_event_handle_t)> appendCall) {
std::function<ze_result_t(T, K, size_t, ze_event_handle_t)> appendCall) {
ze_result_t result = ZE_RESULT_SUCCESS;
if (hSignalEvent) {

View File

@@ -467,6 +467,56 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
context->freeMem(dstPtr);
}
// With the SplitBcsCopy debug flag enabled, appends a region copy (device -> host memory)
// to an immediate copy command list and expects the call to succeed and each of the four
// bcsSplit command queues to report a task count of 1 afterwards (0 before the append).
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyRegionThenSuccessIsReturned, IsXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);

    ze_result_t returnValue;
    auto hwInfo = *NEO::defaultHwInfo;
    hwInfo.featureTable.ftrBcsInfo = 0b111111111;
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
    auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));

    ze_command_queue_desc_t desc = {};
    desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));

    std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
                                                                               testL0Device.get(),
                                                                               &desc,
                                                                               false,
                                                                               NEO::EngineGroupType::Copy,
                                                                               returnValue));
    ASSERT_NE(nullptr, commandList0);

    auto &splitCmdQs = static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs;
    // Fatal check: the loops below would otherwise silently verify fewer (or more) queues
    // than the four this test is about.
    ASSERT_EQ(splitCmdQs.size(), 4u);
    for (auto *cmdQ : splitCmdQs) {
        EXPECT_EQ(static_cast<CommandQueueImp *>(cmdQ)->getTaskCount(), 0u);
    }

    constexpr size_t alignment = 4096u;
    constexpr size_t size = 8 * MemoryConstants::megaByte;

    // Initialize the out-pointers and verify the allocations so that a failed allocation
    // stops the test instead of feeding indeterminate pointers into the copy.
    void *srcPtr = nullptr;
    void *dstPtr = nullptr;
    ze_device_mem_alloc_desc_t deviceDesc = {};
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocDeviceMem(device->toHandle(),
                                                         &deviceDesc,
                                                         size, alignment, &srcPtr));
    ze_host_mem_alloc_desc_t hostDesc = {};
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, size, alignment, &dstPtr));

    // originX = 2, originY = 1, originZ = 1, width = 4 MB, height = 1, depth = 1;
    // the same region is used for source and destination and all pitches are passed as 0.
    ze_copy_region_t region = {2, 1, 1, 4 * MemoryConstants::megaByte, 1, 1};
    auto result = commandList0->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, nullptr, 0, nullptr);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    for (auto *cmdQ : splitCmdQs) {
        EXPECT_EQ(static_cast<CommandQueueImp *>(cmdQ)->getTaskCount(), 1u);
    }

    context->freeMem(srcPtr);
    context->freeMem(dstPtr);
}
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyWithEventThenSuccessIsReturnedAndMiFlushProgrammed, IsXeHpcCore) {
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;