Add initial BCS split implementation for L0

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2022-09-09 09:43:47 +00:00
committed by Compute-Runtime-Automation
parent fad4bee432
commit 63e72965a1
7 changed files with 163 additions and 13 deletions

View File

@@ -293,6 +293,7 @@ struct CommandList : _ze_command_list_handle_t {
bool isTbxMode = false;
bool commandListSLMEnabled = false;
bool requiresQueueUncachedMocs = false;
bool isBcsSplitNeeded = false;
protected:
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);

View File

@@ -156,6 +156,7 @@ struct CommandListCoreFamily : CommandListImp {
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
void appendMultiPartitionPrologue(uint32_t partitionDataSize) override;
void appendMultiPartitionEpilogue() override;
void appendEventForProfilingAllWalkers(Event *event, bool beforeWalker);
ze_result_t reserveSpace(size_t size, void **ptr) override;
ze_result_t reset() override;
@@ -249,7 +250,6 @@ struct CommandListCoreFamily : CommandListImp {
void appendWriteKernelTimestamp(Event *event, bool beforeWalker, bool maskLsb, bool workloadPartition);
void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask, bool workloadPartition);
void appendEventForProfiling(Event *event, bool beforeWalker, bool workloadPartition);
void appendEventForProfilingAllWalkers(Event *event, bool beforeWalker);
void appendEventForProfilingCopyCommand(Event *event, bool beforeWalker);
void appendSignalEventPostWalker(Event *event, bool workloadPartition);
virtual void programStateBaseAddress(NEO::CommandContainer &container, bool genericMediaStateClearRequired);

View File

@@ -15,6 +15,7 @@
#include "shared/source/memory_manager/prefetch_manager.h"
#include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h"
#include "level_zero/core/source/device/bcs_split.h"
namespace L0 {
@@ -215,8 +216,18 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
if (this->isFlushTaskSubmissionEnabled) {
checkAvailableSpace();
}
auto ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent,
numWaitEvents, phWaitEvents);
ze_result_t ret;
if (this->isBcsSplitNeeded) {
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents);
});
} else {
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent,
numWaitEvents, phWaitEvents);
}
return flushImmediate(ret, true);
}

View File

@@ -148,7 +148,7 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
commandList->isTbxMode = (csr->getType() == NEO::CommandStreamReceiverType::CSR_TBX) || (csr->getType() == NEO::CommandStreamReceiverType::CSR_TBX_WITH_AUB);
commandList->commandListPreemptionMode = device->getDevicePreemptionMode();
deviceImp->bcsSplit.setupDevice(productFamily, internalUsage, desc, csr);
commandList->isBcsSplitNeeded = deviceImp->bcsSplit.setupDevice(productFamily, internalUsage, desc, csr);
return commandList;
}

View File

@@ -15,17 +15,20 @@
namespace L0 {
void BcsSplit::setupDevice(uint32_t productFamily, bool internalUsage, const ze_command_queue_desc_t *desc, NEO::CommandStreamReceiver *csr) {
bool BcsSplit::setupDevice(uint32_t productFamily, bool internalUsage, const ze_command_queue_desc_t *desc, NEO::CommandStreamReceiver *csr) {
auto initializeBcsSplit = this->device.getNEODevice()->isBcsSplitSupported() &&
csr->getOsContext().getEngineType() == aub_stream::EngineType::ENGINE_BCS &&
!internalUsage;
if (!initializeBcsSplit) {
return false;
}
static std::mutex bcsSplitInitMutex;
std::lock_guard<std::mutex> lock(bcsSplitInitMutex);
auto initializeBcsSplit = this->device.getNEODevice()->isBcsSplitSupported() &&
csr->getOsContext().getEngineType() == aub_stream::EngineType::ENGINE_BCS &&
!internalUsage &&
this->cmdQs.empty();
if (!initializeBcsSplit) {
return;
if (!this->cmdQs.empty()) {
return true;
}
if (NEO::DebugManager.flags.SplitBcsMask.get() > 0) {
@@ -48,6 +51,8 @@ void BcsSplit::setupDevice(uint32_t productFamily, bool internalUsage, const ze_
this->cmdQs.push_back(commandQueue);
}
}
return true;
}
void BcsSplit::releaseResources() {

View File

@@ -10,8 +10,11 @@
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/sku_info/sku_info_base.h"
#include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h"
#include "level_zero/core/source/cmdqueue/cmdqueue.h"
#include "level_zero/core/source/event/event.h"
#include <functional>
#include <vector>
namespace NEO {
@@ -28,7 +31,39 @@ struct BcsSplit {
std::vector<CommandQueue *> cmdQs;
NEO::BcsInfoMask engines = NEO::EngineHelpers::oddLinkedCopyEnginesMask;
void setupDevice(uint32_t productFamily, bool internalUsage, const ze_command_queue_desc_t *desc, NEO::CommandStreamReceiver *csr);
// Splits one memory copy of `size` bytes into consecutive chunks, one chunk per
// queue stored in `cmdQs`, and submits every chunk to its queue through the
// given immediate command list.
//
// cmdList      - immediate command list used to record and submit each chunk.
// dstptr       - start of the destination range.
// srcptr       - start of the source range.
// size         - total number of bytes to copy.
// hSignalEvent - optional event; when non-null, profiling commands for it are
//                appended before the first chunk and after the last chunk.
// appendCall   - callback that appends the copy of a single chunk; it receives
//                (destination, source, chunk size, event handle).
//
// Returns ZE_RESULT_SUCCESS when every chunk was appended, otherwise the first
// error reported by `appendCall`. On such an error no further chunk is appended
// or submitted and the trailing profiling commands are not appended.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,
                            void *dstptr,
                            const void *srcptr,
                            size_t size,
                            ze_event_handle_t hSignalEvent,
                            std::function<ze_result_t(void *, const void *, size_t, ze_event_handle_t)> appendCall) {
    const auto queueCount = this->cmdQs.size();
    if (queueCount == 0u) {
        // Without split queues the loop below would not run and the copy would be
        // silently dropped; forward the whole copy (including the event) instead.
        return appendCall(dstptr, srcptr, size, hSignalEvent);
    }

    if (hSignalEvent) {
        cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true);
    }

    auto remainingSize = size;
    auto remainingQueues = queueCount;

    for (auto *cmdQ : this->cmdQs) {
        // Dividing what is left by the queues that are left makes the last queue
        // take all remaining bytes, so the chunk sizes always add up to `size`.
        const auto chunkSize = remainingSize / remainingQueues;
        const auto chunkOffset = size - remainingSize;
        auto localDstPtr = ptrOffset(dstptr, chunkOffset);
        auto localSrcPtr = ptrOffset(srcptr, chunkOffset);

        // Chunks are appended without an event; the event is handled once for the
        // whole split by the profiling calls around this loop.
        const auto chunkResult = appendCall(localDstPtr, localSrcPtr, chunkSize, nullptr);
        if (chunkResult != ZE_RESULT_SUCCESS) {
            return chunkResult;
        }
        cmdList->executeCommandListImmediateImpl(true, cmdQ);

        remainingSize -= chunkSize;
        remainingQueues--;
    }

    if (hSignalEvent) {
        cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), false);
    }

    return ZE_RESULT_SUCCESS;
}
bool setupDevice(uint32_t productFamily, bool internalUsage, const ze_command_queue_desc_t *desc, NEO::CommandStreamReceiver *csr);
void releaseResources();
BcsSplit(DeviceImp &device) : device(device){};

View File

@@ -264,5 +264,103 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopySetZeroWhenCreateImmediate
EXPECT_EQ(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs.size(), 0u);
}
// With SplitBcsCopy enabled, creating an immediate copy command list is expected
// to populate four split queues, and a single appendMemoryCopy is expected to
// raise the task count of each of them from 0 to 1.
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyThenSuccessIsReturned, IsXeHpcCore) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);

    ze_result_t returnValue;
    auto hwInfo = *NEO::defaultHwInfo;
    hwInfo.featureTable.ftrBcsInfo = 0b111111111;
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
    auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));

    ze_command_queue_desc_t desc = {};
    desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));

    std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
                                                                               testL0Device.get(),
                                                                               &desc,
                                                                               false,
                                                                               NEO::EngineGroupType::Copy,
                                                                               returnValue));
    ASSERT_NE(nullptr, commandList0);

    auto &bcsSplit = static_cast<DeviceImp *>(testL0Device.get())->bcsSplit;
    // Hard assertion: the checks below visit every queue, so stop here on a wrong count.
    ASSERT_EQ(bcsSplit.cmdQs.size(), 4u);

    // Expects the same task count on every split queue.
    auto expectTaskCountOnAllQueues = [&bcsSplit](unsigned int expectedTaskCount) {
        for (auto *splitQueue : bcsSplit.cmdQs) {
            EXPECT_EQ(static_cast<CommandQueueImp *>(splitQueue)->getTaskCount(), expectedTaskCount);
        }
    };
    expectTaskCountOnAllQueues(0u);

    void *srcPtr = reinterpret_cast<void *>(0x1234);
    void *dstPtr = reinterpret_cast<void *>(0x2345);
    auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, 8, nullptr, 0, nullptr);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    expectTaskCountOnAllQueues(1u);
}
// Same scenario as the event-less split copy test, but the copy signals an event:
// every split queue is expected to go from task count 0 to 1, and the immediate
// command list's stream is expected to contain an MI_FLUSH_DW command.
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyWithEventThenSuccessIsReturnedAndMiFlushProgrammed, IsXeHpcCore) {
    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;

    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);

    ze_result_t returnValue;
    auto hwInfo = *NEO::defaultHwInfo;
    hwInfo.featureTable.ftrBcsInfo = 0b111111111;
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
    auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));

    ze_command_queue_desc_t desc = {};
    desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));

    std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
                                                                               testL0Device.get(),
                                                                               &desc,
                                                                               false,
                                                                               NEO::EngineGroupType::Copy,
                                                                               returnValue));
    ASSERT_NE(nullptr, commandList0);

    auto &bcsSplit = static_cast<DeviceImp *>(testL0Device.get())->bcsSplit;
    // Hard assertion: the checks below visit every queue, so stop here on a wrong count.
    ASSERT_EQ(bcsSplit.cmdQs.size(), 4u);

    // Expects the same task count on every split queue.
    auto expectTaskCountOnAllQueues = [&bcsSplit](unsigned int expectedTaskCount) {
        for (auto *splitQueue : bcsSplit.cmdQs) {
            EXPECT_EQ(static_cast<CommandQueueImp *>(splitQueue)->getTaskCount(), expectedTaskCount);
        }
    };
    expectTaskCountOnAllQueues(0u);

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
    eventPoolDesc.count = 1;

    ze_event_desc_t eventDesc = {};
    eventDesc.index = 0;
    eventDesc.wait = 0;
    eventDesc.signal = 0;

    std::unique_ptr<EventPool> eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
    // Both objects are dereferenced below, so fail cleanly if creation did not succeed.
    ASSERT_NE(nullptr, eventPool);
    std::unique_ptr<Event> event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
    ASSERT_NE(nullptr, event);

    void *srcPtr = reinterpret_cast<void *>(0x1234);
    void *dstPtr = reinterpret_cast<void *>(0x2345);
    auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, 8, event->toHandle(), 0, nullptr);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    expectTaskCountOnAllQueues(1u);

    // Parse what was recorded into the command list's stream and look for MI_FLUSH_DW.
    auto *commandStream = commandList0->commandContainer.getCommandStream();
    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, commandStream->getCpuBase(), commandStream->getUsed()));

    auto itor = find<MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
    EXPECT_NE(cmdList.end(), itor);
}
} // namespace ult
} // namespace L0