test: aggregated event bcs split aub

Related-To: NEO-14557

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2025-10-16 09:12:48 +00:00
committed by Compute-Runtime-Automation
parent ce5a17bcc5
commit ee89d0ccd6
8 changed files with 250 additions and 6 deletions

View File

@@ -1,11 +1,11 @@
#
# Copyright (C) 2021-2024 Intel Corporation
# Copyright (C) 2021-2025 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
target_sources(ze_intel_gpu_aub_tests PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/append_kernel_indirect_aub_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cmdlist_aub_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/multitile_aub_tests.cpp
)

View File

@@ -5,16 +5,22 @@
*
*/
#include "shared/source/command_stream/transfer_direction.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/file_io.h"
#include "shared/test/common/helpers/test_files.h"
#include "shared/test/common/test_macros/test.h"
#include "shared/test/common/test_macros/hw_test.h"
#include "level_zero/core/source/cmdqueue/cmdqueue.h"
#include "level_zero/core/source/context/context_imp.h"
#include "level_zero/core/source/device/bcs_split.h"
#include "level_zero/core/source/driver/driver_handle_imp.h"
#include "level_zero/core/source/event/event.h"
#include "level_zero/core/test/aub_tests/fixtures/aub_fixture.h"
#include "level_zero/core/test/aub_tests/fixtures/multicontext_l0_aub_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
#include "level_zero/core/test/unit_tests/sources/helper/ze_object_utils.h"
#include "level_zero/driver_experimental/zex_api.h"
namespace L0 {
namespace ult {
@@ -249,5 +255,223 @@ TEST_F(AUBAppendKernelIndirectL0, whenAppendKernelIndirectThenWorkDimIsProperlyP
EXPECT_EQ(ZE_RESULT_SUCCESS, zeModuleDestroy(moduleHandle));
}
template <uint32_t TileCount>
struct BcsSplitAubFixture : public MulticontextL0AubFixture {
virtual void setUp() {
debugManager.flags.BlitterEnableMaskOverride.set(0b1110);
debugManager.flags.SplitBcsCopy.set(1);
debugManager.flags.SplitBcsRequiredTileCount.set(1);
debugManager.flags.SplitBcsRequiredEnginesCount.set(2);
debugManager.flags.SplitBcsMask.set(0b110);
debugManager.flags.SplitBcsTransferDirectionMask.set(~(1 << static_cast<int32_t>(TransferDirection::localToLocal)));
this->dispatchMode = DispatchMode::immediateDispatch;
MulticontextL0AubFixture::setUp(TileCount, EnabledCommandStreamers::single, (TileCount > 1));
if (skipped || rootDevice->getHwInfo().featureTable.ftrBcsInfo.count() < 2) {
GTEST_SKIP();
}
ze_context_handle_t hContext;
ze_context_desc_t desc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
driverHandle->createContext(&desc, 0u, nullptr, &hContext);
ASSERT_NE(nullptr, hContext);
context.reset(static_cast<ContextImp *>(Context::fromHandle(hContext)));
ze_result_t returnValue;
commandList.reset(ult::CommandList::whiteboxCast(CommandList::create(rootDevice->getHwInfo().platform.eProductFamily, rootDevice, NEO::EngineGroupType::compute, 0u, returnValue, false)));
ASSERT_NE(nullptr, commandList.get());
ze_command_queue_desc_t queueDesc = {
.ordinal = queryCopyOrdinal(),
.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER,
};
commandList.reset(CommandList::createImmediate(rootDevice->getHwInfo().platform.eProductFamily,
rootDevice,
&queueDesc,
false,
NEO::EngineGroupType::copy,
returnValue));
ASSERT_NE(nullptr, commandList.get());
}
uint32_t queryCopyOrdinal() {
uint32_t count = 0;
rootDevice->getCommandQueueGroupProperties(&count, nullptr);
std::vector<ze_command_queue_group_properties_t> groups;
groups.resize(count);
rootDevice->getCommandQueueGroupProperties(&count, groups.data());
for (uint32_t i = 0; i < count; i++) {
if (groups[i].flags == ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) {
return i;
}
}
EXPECT_TRUE(false);
return 0;
}
DestroyableZeUniquePtr<Event> createAggregatedEvent(uint64_t incValue, uint64_t completionValue) {
ze_device_mem_alloc_desc_t deviceDesc = {};
void *devAddress = nullptr;
context->allocDeviceMem(rootDevice->toHandle(), &deviceDesc, sizeof(uint64_t), 4096u, &devAddress);
ze_event_handle_t outEvent = nullptr;
zex_counter_based_event_external_storage_properties_t externalStorageAllocProperties = {
.stype = ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_STORAGE_ALLOC_PROPERTIES,
.deviceAddress = reinterpret_cast<uint64_t *>(devAddress),
.incrementValue = incValue,
.completionValue = completionValue,
};
zex_counter_based_event_desc_t counterBasedDesc = {
.stype = ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC,
.pNext = &externalStorageAllocProperties,
.flags = ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE,
};
EXPECT_EQ(ZE_RESULT_SUCCESS, zexCounterBasedEventCreate2(context.get(), rootDevice, &counterBasedDesc, &outEvent));
return DestroyableZeUniquePtr<Event>(Event::fromHandle(outEvent));
}
DestroyableZeUniquePtr<ContextImp> context;
DestroyableZeUniquePtr<L0::CommandList> commandList;
};
using BcsSplitAubTests = Test<BcsSplitAubFixture<1>>;
HWTEST2_F(BcsSplitAubTests, whenAppendingCopyWithAggregatedEventThenEventIsSignaledAndDataIsCorrect, IsAtLeastXeHpcCore) {
auto whiteboxCmdList = static_cast<ult::WhiteBox<L0::CommandListImp> *>(commandList.get());
if (!whiteboxCmdList->isBcsSplitNeeded) {
GTEST_SKIP();
}
size_t bufferSize = whiteboxCmdList->minimalSizeForBcsSplit;
NEO::SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::hostUnifiedMemory,
1,
context->rootDeviceIndices,
context->deviceBitfields);
auto srcBuffer = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
auto dstBuffer1 = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
auto dstBuffer2 = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
std::fill(static_cast<uint8_t *>(srcBuffer), ptrOffset(static_cast<uint8_t *>(srcBuffer), bufferSize), 1);
memset(dstBuffer1, 0, bufferSize);
memset(dstBuffer2, 0, bufferSize);
uint32_t incValue = 0;
zexDeviceGetAggregatedCopyOffloadIncrementValue(rootDevice->toHandle(), &incValue);
auto event = createAggregatedEvent(incValue, incValue * 2);
auto eventStorage = reinterpret_cast<uint64_t *>(event->getInOrderExecInfo()->getBaseDeviceAddress());
*event->getInOrderExecInfo()->getBaseHostAddress() = 0;
auto csr = getRootSimulatedCsr<FamilyType>();
csr->writeMemory(*event->getInOrderExecInfo()->getDeviceCounterAllocation());
zeCommandListAppendMemoryCopy(commandList->toHandle(), dstBuffer1, srcBuffer, bufferSize, event->toHandle(), 0, nullptr);
zeCommandListHostSynchronize(commandList->toHandle(), std::numeric_limits<uint64_t>::max());
auto bcsSplit = static_cast<DeviceImp *>(rootDevice)->bcsSplit.get();
ASSERT_NE(nullptr, bcsSplit);
auto whiteboxSplitCmdList = static_cast<ult::WhiteBox<L0::CommandListImp> *>(bcsSplit->cmdLists[0]);
auto taskCount = whiteboxSplitCmdList->cmdQImmediate->getTaskCount();
EXPECT_TRUE(taskCount >= 1);
auto expectedIncValue = static_cast<uint64_t>(incValue);
EXPECT_TRUE(csr->expectMemory(dstBuffer1, srcBuffer, bufferSize, aub_stream::CompareOperationValues::CompareEqual));
EXPECT_TRUE(csr->expectMemory(eventStorage, &expectedIncValue, sizeof(uint64_t), aub_stream::CompareOperationValues::CompareEqual));
zeCommandListAppendMemoryCopy(commandList->toHandle(), dstBuffer2, srcBuffer, bufferSize, event->toHandle(), 0, nullptr);
zeCommandListHostSynchronize(commandList->toHandle(), std::numeric_limits<uint64_t>::max());
EXPECT_TRUE(whiteboxSplitCmdList->cmdQImmediate->getTaskCount() > taskCount);
expectedIncValue = incValue * 2;
EXPECT_TRUE(csr->expectMemory(dstBuffer2, srcBuffer, bufferSize, aub_stream::CompareOperationValues::CompareEqual));
EXPECT_TRUE(csr->expectMemory(eventStorage, &expectedIncValue, sizeof(uint64_t), aub_stream::CompareOperationValues::CompareEqual));
driverHandle->svmAllocsManager->freeSVMAlloc(srcBuffer);
driverHandle->svmAllocsManager->freeSVMAlloc(dstBuffer1);
driverHandle->svmAllocsManager->freeSVMAlloc(dstBuffer2);
context->freeMem(eventStorage);
}
using BcsSplitMultitileAubTests = Test<BcsSplitAubFixture<2>>;
HWTEST2_F(BcsSplitMultitileAubTests, whenAppendingCopyWithAggregatedEventThenEventIsSignaledAndDataIsCorrect, IsAtLeastXeHpcCore) {
if (!rootDevice->isImplicitScalingCapable()) {
GTEST_SKIP();
}
ze_result_t returnValue;
ze_command_queue_desc_t queueDesc = {.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER | ZE_COMMAND_QUEUE_FLAG_COPY_OFFLOAD_HINT};
commandList.reset(CommandList::createImmediate(rootDevice->getHwInfo().platform.eProductFamily, rootDevice, &queueDesc, false, NEO::EngineGroupType::compute, returnValue));
ASSERT_NE(nullptr, commandList.get());
auto whiteboxCmdList = static_cast<ult::WhiteBox<L0::CommandListImp> *>(commandList.get());
EXPECT_TRUE(whiteboxCmdList->isCopyOffloadEnabled());
EXPECT_TRUE(whiteboxCmdList->isBcsSplitNeeded);
size_t bufferSize = whiteboxCmdList->minimalSizeForBcsSplit;
NEO::SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::hostUnifiedMemory,
1,
context->rootDeviceIndices,
context->deviceBitfields);
auto srcBuffer = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
auto dstBuffer1 = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
auto dstBuffer2 = driverHandle->svmAllocsManager->createHostUnifiedMemoryAllocation(bufferSize, unifiedMemoryProperties);
std::fill(static_cast<uint8_t *>(srcBuffer), ptrOffset(static_cast<uint8_t *>(srcBuffer), bufferSize), 1);
memset(dstBuffer1, 0, bufferSize);
memset(dstBuffer2, 0, bufferSize);
uint32_t incValue = 0;
zexDeviceGetAggregatedCopyOffloadIncrementValue(rootDevice->toHandle(), &incValue);
auto event = createAggregatedEvent(incValue, incValue * 2);
auto eventStorage = reinterpret_cast<uint64_t *>(event->getInOrderExecInfo()->getBaseDeviceAddress());
*event->getInOrderExecInfo()->getBaseHostAddress() = 0;
auto csr = getRootSimulatedCsr<FamilyType>();
csr->writeMemory(*event->getInOrderExecInfo()->getDeviceCounterAllocation());
zeCommandListAppendMemoryCopy(commandList->toHandle(), dstBuffer1, srcBuffer, bufferSize, event->toHandle(), 0, nullptr);
zeCommandListHostSynchronize(commandList->toHandle(), std::numeric_limits<uint64_t>::max());
auto bcsSplit = static_cast<DeviceImp *>(rootDevice)->bcsSplit.get();
ASSERT_NE(nullptr, bcsSplit);
auto whiteboxSplitCmdList = static_cast<ult::WhiteBox<L0::CommandListImp> *>(bcsSplit->cmdLists[0]);
auto taskCount = whiteboxSplitCmdList->cmdQImmediate->getTaskCount();
EXPECT_TRUE(taskCount >= 1);
auto expectedIncValue = static_cast<uint64_t>(incValue);
EXPECT_TRUE(csr->expectMemory(dstBuffer1, srcBuffer, bufferSize, aub_stream::CompareOperationValues::CompareEqual));
EXPECT_TRUE(csr->expectMemory(eventStorage, &expectedIncValue, sizeof(uint64_t), aub_stream::CompareOperationValues::CompareEqual));
zeCommandListAppendMemoryCopy(commandList->toHandle(), dstBuffer2, srcBuffer, bufferSize, event->toHandle(), 0, nullptr);
zeCommandListHostSynchronize(commandList->toHandle(), std::numeric_limits<uint64_t>::max());
EXPECT_TRUE(whiteboxSplitCmdList->cmdQImmediate->getTaskCount() > taskCount);
expectedIncValue = incValue * 2;
EXPECT_TRUE(csr->expectMemory(dstBuffer2, srcBuffer, bufferSize, aub_stream::CompareOperationValues::CompareEqual));
EXPECT_TRUE(csr->expectMemory(eventStorage, &expectedIncValue, sizeof(uint64_t), aub_stream::CompareOperationValues::CompareEqual));
driverHandle->svmAllocsManager->freeSVMAlloc(srcBuffer);
driverHandle->svmAllocsManager->freeSVMAlloc(dstBuffer1);
driverHandle->svmAllocsManager->freeSVMAlloc(dstBuffer2);
context->freeMem(eventStorage);
}
} // namespace ult
} // namespace L0

View File

@@ -31,6 +31,10 @@ CommandStreamReceiver *MulticontextL0AubFixture::getGpgpuCsr(uint32_t tile, uint
return subDevices[tile]->getNEODevice()->getEngine(engine).commandStreamReceiver;
}
CommandStreamReceiver *MulticontextL0AubFixture::getRootCsr() {
return rootDevice->getNEODevice()->getDefaultEngine().commandStreamReceiver;
}
void MulticontextL0AubFixture::createDevices(const HardwareInfo &hwInfo, uint32_t numTiles) {
VariableBackup<UltHwConfig> backup(&ultHwConfig);
ultHwConfig.useHwCsr = true;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2024 Intel Corporation
* Copyright (C) 2024-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -32,6 +32,7 @@ struct MulticontextL0AubFixture : public NEO::MulticontextAubFixture {
void setUp(uint32_t numberOfTiles, EnabledCommandStreamers enabledCommandStreamers, bool implicitScaling);
CommandStreamReceiver *getGpgpuCsr(uint32_t tile, uint32_t engine) override;
CommandStreamReceiver *getRootCsr() override;
void createDevices(const NEO::HardwareInfo &hwInfo, uint32_t numTiles) override;
std::unique_ptr<L0::ult::Mock<L0::DriverHandleImp>> driverHandle;

View File

@@ -329,6 +329,7 @@ struct WhiteBox<::L0::CommandListImp> : public ::L0::CommandListImp {
using BaseClass::initialize;
using BaseClass::inOrderExecInfo;
using BaseClass::interruptEvents;
using BaseClass::isBcsSplitNeeded;
using BaseClass::isSyncModeQueue;
using BaseClass::isTbxMode;
using BaseClass::l3FlushAfterPostSyncEnabled;

View File

@@ -96,6 +96,10 @@ CommandStreamReceiver *MulticontextOclAubFixture::getGpgpuCsr(uint32_t tile, uin
return &(commandQueues[tile][engine]->getGpgpuCommandStreamReceiver());
}
CommandStreamReceiver *MulticontextOclAubFixture::getRootCsr() {
return rootDevice->getDefaultEngine().commandStreamReceiver;
}
void MulticontextOclAubFixture::createDevices(const HardwareInfo &hwInfo, uint32_t numTiles) {
VariableBackup<UltHwConfig> backup(&ultHwConfig);
ultHwConfig.useHwCsr = true;

View File

@@ -24,6 +24,7 @@ struct MulticontextOclAubFixture : public MulticontextAubFixture {
void setUp(uint32_t numberOfTiles, EnabledCommandStreamers enabledCommandStreamers, bool enableCompression);
CommandStreamReceiver *getGpgpuCsr(uint32_t tile, uint32_t engine) override;
CommandStreamReceiver *getRootCsr() override;
void createDevices(const HardwareInfo &hwInfo, uint32_t numTiles) override;
std::vector<ClDevice *> tileDevices;

View File

@@ -38,17 +38,26 @@ struct MulticontextAubFixture {
void tearDown() {}
virtual CommandStreamReceiver *getGpgpuCsr(uint32_t tile, uint32_t engine) = 0;
virtual CommandStreamReceiver *getRootCsr() = 0;
bool isMemoryCompressed(CommandStreamReceiver *csr, void *gfxAddress);
template <typename FamilyType>
CommandStreamReceiverSimulatedCommonHw<FamilyType> *getRootSimulatedCsr() {
return castToSimulatedCsr<FamilyType>(getRootCsr());
}
template <typename FamilyType>
CommandStreamReceiverSimulatedCommonHw<FamilyType> *getSimulatedCsr(uint32_t tile, uint32_t engine) {
return castToSimulatedCsr<FamilyType>(getGpgpuCsr(tile, engine));
}
template <typename FamilyType>
CommandStreamReceiverSimulatedCommonHw<FamilyType> *castToSimulatedCsr(CommandStreamReceiver *csr) {
using CsrWithAubDump = CommandStreamReceiverWithAUBDump<TbxCommandStreamReceiverHw<FamilyType>>;
using SimulatedCsr = CommandStreamReceiverSimulatedCommonHw<FamilyType>;
SimulatedCsr *simulatedCsr = nullptr;
auto csr = getGpgpuCsr(tile, engine);
if (testMode == TestMode::aubTestsWithTbx) {
auto csrWithAubDump = static_cast<CsrWithAubDump *>(csr);
simulatedCsr = static_cast<SimulatedCsr *>(csrWithAubDump);