mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-01 04:23:00 +08:00
[feat, perf] add primary batch buffer support to multi-tile barrier
Implicit Scaling barrier have the same requirements as kernel. It must dispach bb start command with the same level as the command list is dispatched. Related-To: NEO-7807 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
3524085624
commit
62ea1b1a58
@@ -442,7 +442,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendMultiTileBarrier(NEO::Device &n
|
||||
0,
|
||||
0,
|
||||
!(cmdListType == CommandListType::TYPE_IMMEDIATE),
|
||||
!this->isFlushTaskSubmissionEnabled);
|
||||
!(this->isFlushTaskSubmissionEnabled || this->dispatchCmdListBatchBufferAsPrimary));
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
||||
@@ -57,7 +57,9 @@ void MultiTileCommandListFixtureInit::setUp() {
|
||||
SingleRootMultiSubDeviceFixture::setUp();
|
||||
}
|
||||
|
||||
void MultiTileCommandListFixtureInit::setUpParams(bool createImmediate, bool createInternal, bool createCopy) {
|
||||
void MultiTileCommandListFixtureInit::setUpParams(bool createImmediate, bool createInternal, bool createCopy, int32_t primaryBuffer) {
|
||||
DebugManager.flags.DispatchCmdlistCmdBufferPrimary.set(primaryBuffer);
|
||||
|
||||
ze_result_t returnValue;
|
||||
|
||||
NEO::EngineGroupType cmdListEngineType = createCopy ? NEO::EngineGroupType::Copy : NEO::EngineGroupType::RenderCompute;
|
||||
|
||||
@@ -34,7 +34,7 @@ class CommandListFixture : public DeviceFixture {
|
||||
|
||||
struct MultiTileCommandListFixtureInit : public SingleRootMultiSubDeviceFixture {
|
||||
void setUp();
|
||||
void setUpParams(bool createImmediate, bool createInternal, bool createCopy);
|
||||
void setUpParams(bool createImmediate, bool createInternal, bool createCopy, int32_t primaryBuffer);
|
||||
inline void tearDown() {
|
||||
SingleRootMultiSubDeviceFixture::tearDown();
|
||||
}
|
||||
@@ -46,11 +46,11 @@ struct MultiTileCommandListFixtureInit : public SingleRootMultiSubDeviceFixture
|
||||
std::unique_ptr<VariableBackup<bool>> osLocalMemoryBackup;
|
||||
};
|
||||
|
||||
template <bool createImmediate, bool createInternal, bool createCopy>
|
||||
template <bool createImmediate, bool createInternal, bool createCopy, int32_t primaryBuffer>
|
||||
struct MultiTileCommandListFixture : public MultiTileCommandListFixtureInit {
|
||||
void setUp() {
|
||||
MultiTileCommandListFixtureInit::setUp();
|
||||
MultiTileCommandListFixtureInit::setUpParams(createImmediate, createInternal, createCopy);
|
||||
MultiTileCommandListFixtureInit::setUpParams(createImmediate, createInternal, createCopy, primaryBuffer);
|
||||
}
|
||||
|
||||
void tearDown() {
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
|
||||
using MultiTileImmediateCommandListTest = Test<MultiTileCommandListFixture<true, false, false>>;
|
||||
using MultiTileImmediateCommandListTest = Test<MultiTileCommandListFixture<true, false, false, -1>>;
|
||||
|
||||
HWTEST2_F(MultiTileImmediateCommandListTest, GivenMultiTileDeviceWhenCreatingImmediateCommandListThenExpectPartitionCountMatchTileCount, IsWithinXeGfxFamily) {
|
||||
EXPECT_EQ(2u, device->getNEODevice()->getDeviceBitfield().count());
|
||||
@@ -38,7 +38,7 @@ HWTEST2_F(MultiTileImmediateCommandListTest, GivenMultiTileDeviceWhenCreatingImm
|
||||
EXPECT_EQ(2u, commandList->partitionCount);
|
||||
}
|
||||
|
||||
using MultiTileImmediateInternalCommandListTest = Test<MultiTileCommandListFixture<true, true, false>>;
|
||||
using MultiTileImmediateInternalCommandListTest = Test<MultiTileCommandListFixture<true, true, false, -1>>;
|
||||
|
||||
HWTEST2_F(MultiTileImmediateInternalCommandListTest, GivenMultiTileDeviceWhenCreatingInternalImmediateCommandListThenExpectPartitionCountEqualOne, IsWithinXeGfxFamily) {
|
||||
EXPECT_EQ(2u, device->getNEODevice()->getDeviceBitfield().count());
|
||||
@@ -49,7 +49,7 @@ HWTEST2_F(MultiTileImmediateInternalCommandListTest, GivenMultiTileDeviceWhenCre
|
||||
EXPECT_EQ(1u, commandList->partitionCount);
|
||||
}
|
||||
|
||||
using MultiTileCopyEngineCommandListTest = Test<MultiTileCommandListFixture<false, false, true>>;
|
||||
using MultiTileCopyEngineCommandListTest = Test<MultiTileCommandListFixture<false, false, true, -1>>;
|
||||
|
||||
HWTEST2_F(MultiTileCopyEngineCommandListTest, GivenMultiTileDeviceWhenCreatingCopyEngineCommandListThenExpectPartitionCountEqualOne, IsWithinXeGfxFamily) {
|
||||
EXPECT_EQ(2u, device->getNEODevice()->getDeviceBitfield().count());
|
||||
|
||||
@@ -191,9 +191,226 @@ void validateMultiTileBarrier(void *cmdBuffer, size_t &parsedOffset,
|
||||
}
|
||||
}
|
||||
|
||||
using MultiTileCommandListAppendBarrier = Test<MultiTileCommandListFixture<false, false, false>>;
|
||||
template <bool usePrimaryBuffer>
|
||||
struct MultiTileCommandListAppendBarrierFixture : public MultiTileCommandListFixture<false, false, false, static_cast<int32_t>(usePrimaryBuffer)> {
|
||||
using BaseClass = MultiTileCommandListFixture<false, false, false, static_cast<int32_t>(usePrimaryBuffer)>;
|
||||
|
||||
HWTEST2_F(MultiTileCommandListAppendBarrier, WhenAppendingBarrierThenPipeControlIsGenerated, IsWithinXeGfxFamily) {
|
||||
using BaseClass::commandList;
|
||||
using BaseClass::context;
|
||||
using BaseClass::device;
|
||||
using BaseClass::driverHandle;
|
||||
using BaseClass::event;
|
||||
|
||||
void setUp() {
|
||||
BaseClass::setUp();
|
||||
}
|
||||
|
||||
void tearDown() {
|
||||
BaseClass::tearDown();
|
||||
}
|
||||
|
||||
template <typename FamilyType>
|
||||
void testBodyNonTimestampEventSignal() {
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
|
||||
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
|
||||
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
|
||||
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
|
||||
|
||||
uint64_t eventGpuAddress = event->getCompletionFieldGpuAddress(device);
|
||||
ze_event_handle_t eventHandle = event->toHandle();
|
||||
|
||||
EXPECT_EQ(2u, device->getNEODevice()->getDeviceBitfield().count());
|
||||
EXPECT_EQ(2u, commandList->partitionCount);
|
||||
|
||||
LinearStream *cmdListStream = commandList->getCmdContainer().getCommandStream();
|
||||
|
||||
size_t beforeControlSectionOffset = sizeof(MI_STORE_DATA_IMM) +
|
||||
sizeof(PIPE_CONTROL) +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
|
||||
sizeof(MI_BATCH_BUFFER_START);
|
||||
|
||||
size_t bbStartOffset = beforeControlSectionOffset +
|
||||
(2 * sizeof(uint32_t));
|
||||
|
||||
size_t multiTileBarrierSize = bbStartOffset +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
|
||||
sizeof(MI_STORE_DATA_IMM) +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait();
|
||||
|
||||
size_t postSyncSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment(), false);
|
||||
|
||||
auto useSizeBefore = cmdListStream->getUsed();
|
||||
auto result = commandList->appendBarrier(eventHandle, 0, nullptr);
|
||||
auto useSizeAfter = cmdListStream->getUsed();
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
EXPECT_EQ(2u, event->getPacketsInUse());
|
||||
|
||||
size_t totaSizedBarrierWithNonTimestampEvent = multiTileBarrierSize + postSyncSize;
|
||||
|
||||
EXPECT_EQ(totaSizedBarrierWithNonTimestampEvent, (useSizeAfter - useSizeBefore));
|
||||
|
||||
auto gpuBaseAddress = cmdListStream->getGraphicsAllocation()->getGpuAddress() + useSizeBefore;
|
||||
|
||||
auto gpuCrossTileSyncAddress = gpuBaseAddress +
|
||||
beforeControlSectionOffset;
|
||||
|
||||
auto gpuFinalSyncAddress = gpuCrossTileSyncAddress +
|
||||
sizeof(uint32_t);
|
||||
|
||||
auto gpuStartAddress = gpuBaseAddress +
|
||||
bbStartOffset;
|
||||
|
||||
void *cmdBuffer = ptrOffset(cmdListStream->getCpuBase(), useSizeBefore);
|
||||
size_t parsedOffset = 0;
|
||||
|
||||
validateMultiTileBarrier<FamilyType>(cmdBuffer, parsedOffset, gpuFinalSyncAddress, gpuCrossTileSyncAddress, gpuStartAddress, true, !usePrimaryBuffer);
|
||||
EXPECT_EQ(multiTileBarrierSize, parsedOffset);
|
||||
|
||||
cmdBuffer = ptrOffset(cmdBuffer, parsedOffset);
|
||||
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
|
||||
cmdBuffer,
|
||||
postSyncSize));
|
||||
|
||||
auto itorPC = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
|
||||
ASSERT_NE(0u, itorPC.size());
|
||||
uint32_t postSyncFound = 0;
|
||||
for (auto it : itorPC) {
|
||||
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
|
||||
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
|
||||
EXPECT_EQ(cmd->getImmediateData(), Event::STATE_SIGNALED);
|
||||
EXPECT_TRUE(cmd->getCommandStreamerStallEnable());
|
||||
EXPECT_EQ(eventGpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
|
||||
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
|
||||
postSyncFound++;
|
||||
}
|
||||
}
|
||||
EXPECT_EQ(1u, postSyncFound);
|
||||
}
|
||||
|
||||
template <typename FamilyType>
|
||||
void testBodyTimestampEventSignal() {
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
|
||||
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
|
||||
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
|
||||
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
|
||||
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
|
||||
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
|
||||
using MI_MATH = typename FamilyType::MI_MATH;
|
||||
using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
|
||||
|
||||
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
|
||||
eventPoolDesc.count = 2;
|
||||
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.index = 0;
|
||||
eventDesc.wait = 0;
|
||||
eventDesc.signal = 0;
|
||||
|
||||
ze_result_t returnValue;
|
||||
auto eventPoolTimeStamp = std::unique_ptr<L0::EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
auto eventTimeStamp = std::unique_ptr<L0::Event>(Event::create<typename FamilyType::TimestampPacketType>(eventPoolTimeStamp.get(), &eventDesc, device));
|
||||
|
||||
uint64_t eventGpuAddress = eventTimeStamp->getGpuAddress(device);
|
||||
uint64_t contextStartAddress = eventGpuAddress + event->getContextStartOffset();
|
||||
uint64_t globalStartAddress = eventGpuAddress + event->getGlobalStartOffset();
|
||||
uint64_t contextEndAddress = eventGpuAddress + event->getContextEndOffset();
|
||||
uint64_t globalEndAddress = eventGpuAddress + event->getGlobalEndOffset();
|
||||
|
||||
ze_event_handle_t eventHandle = eventTimeStamp->toHandle();
|
||||
|
||||
EXPECT_EQ(2u, device->getNEODevice()->getDeviceBitfield().count());
|
||||
EXPECT_EQ(2u, commandList->partitionCount);
|
||||
|
||||
LinearStream *cmdListStream = commandList->getCmdContainer().getCommandStream();
|
||||
|
||||
size_t beforeControlSectionOffset = sizeof(MI_STORE_DATA_IMM) +
|
||||
sizeof(PIPE_CONTROL) +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
|
||||
sizeof(MI_BATCH_BUFFER_START);
|
||||
|
||||
size_t bbStartOffset = beforeControlSectionOffset +
|
||||
(2 * sizeof(uint32_t));
|
||||
|
||||
size_t multiTileBarrierSize = bbStartOffset +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
|
||||
sizeof(MI_STORE_DATA_IMM) +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait();
|
||||
|
||||
size_t timestampRegisters = 2 * (sizeof(MI_LOAD_REGISTER_REG) + sizeof(MI_LOAD_REGISTER_IMM) +
|
||||
NEO::EncodeMath<FamilyType>::streamCommandSize + sizeof(MI_STORE_REGISTER_MEM));
|
||||
if (NEO::UnitTestHelper<FamilyType>::timestampRegisterHighAddress()) {
|
||||
timestampRegisters *= 2;
|
||||
}
|
||||
|
||||
size_t postBarrierSynchronization = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForSingleBarrier(false) +
|
||||
NEO::MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(device->getNEODevice()->getRootDeviceEnvironment());
|
||||
size_t stopRegisters = timestampRegisters + postBarrierSynchronization;
|
||||
|
||||
auto useSizeBefore = cmdListStream->getUsed();
|
||||
auto result = commandList->appendBarrier(eventHandle, 0, nullptr);
|
||||
auto useSizeAfter = cmdListStream->getUsed();
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
EXPECT_EQ(2u, eventTimeStamp->getPacketsInUse());
|
||||
|
||||
size_t totaSizedBarrierWithTimestampEvent = multiTileBarrierSize + timestampRegisters + stopRegisters;
|
||||
EXPECT_EQ(totaSizedBarrierWithTimestampEvent, (useSizeAfter - useSizeBefore));
|
||||
|
||||
void *cmdBuffer = ptrOffset(cmdListStream->getCpuBase(), useSizeBefore);
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
|
||||
cmdBuffer,
|
||||
timestampRegisters));
|
||||
auto begin = cmdList.begin();
|
||||
validateTimestampRegisters<FamilyType>(cmdList,
|
||||
begin,
|
||||
REG_GLOBAL_TIMESTAMP_LDW, globalStartAddress,
|
||||
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddress,
|
||||
true);
|
||||
|
||||
auto gpuBaseAddress = cmdListStream->getGraphicsAllocation()->getGpuAddress() + useSizeBefore + timestampRegisters;
|
||||
|
||||
auto gpuCrossTileSyncAddress = gpuBaseAddress +
|
||||
beforeControlSectionOffset;
|
||||
|
||||
auto gpuFinalSyncAddress = gpuCrossTileSyncAddress +
|
||||
sizeof(uint32_t);
|
||||
|
||||
auto gpuStartAddress = gpuBaseAddress +
|
||||
bbStartOffset;
|
||||
|
||||
cmdBuffer = ptrOffset(cmdBuffer, timestampRegisters);
|
||||
size_t parsedOffset = 0;
|
||||
|
||||
validateMultiTileBarrier<FamilyType>(cmdBuffer, parsedOffset, gpuFinalSyncAddress, gpuCrossTileSyncAddress, gpuStartAddress, true, !usePrimaryBuffer);
|
||||
EXPECT_EQ(multiTileBarrierSize, parsedOffset);
|
||||
|
||||
cmdBuffer = ptrOffset(cmdBuffer, (parsedOffset + postBarrierSynchronization));
|
||||
cmdList.clear();
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
|
||||
cmdBuffer,
|
||||
timestampRegisters));
|
||||
begin = cmdList.begin();
|
||||
validateTimestampRegisters<FamilyType>(cmdList,
|
||||
begin,
|
||||
REG_GLOBAL_TIMESTAMP_LDW, globalEndAddress,
|
||||
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddress,
|
||||
true);
|
||||
}
|
||||
};
|
||||
|
||||
using MultiTileCommandListAppendBarrier = Test<MultiTileCommandListAppendBarrierFixture<false>>;
|
||||
|
||||
HWTEST2_F(MultiTileCommandListAppendBarrier, WhenAppendingBarrierThenPipeControlIsGenerated, IsAtLeastXeHpCore) {
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
@@ -246,7 +463,7 @@ HWTEST2_F(MultiTileCommandListAppendBarrier, WhenAppendingBarrierThenPipeControl
|
||||
}
|
||||
|
||||
HWTEST2_F(MultiTileCommandListAppendBarrier,
|
||||
GivenCurrentCommandBufferExhaustedWhenAppendingMultiTileBarrierThenPipeControlAndCrossTileSyncIsGeneratedInNewBuffer, IsWithinXeGfxFamily) {
|
||||
GivenCurrentCommandBufferExhaustedWhenAppendingMultiTileBarrierThenPipeControlAndCrossTileSyncIsGeneratedInNewBuffer, IsAtLeastXeHpCore) {
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
@@ -308,204 +525,31 @@ HWTEST2_F(MultiTileCommandListAppendBarrier,
|
||||
}
|
||||
|
||||
HWTEST2_F(MultiTileCommandListAppendBarrier,
|
||||
GivenNonTimestampEventSignalWhenAppendingMultTileBarrierThenExpectMultiTileBarrierAndPostSyncOperation, IsWithinXeGfxFamily) {
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
|
||||
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
|
||||
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
|
||||
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
|
||||
|
||||
uint64_t eventGpuAddress = event->getCompletionFieldGpuAddress(device);
|
||||
ze_event_handle_t eventHandle = event->toHandle();
|
||||
|
||||
EXPECT_EQ(2u, device->getNEODevice()->getDeviceBitfield().count());
|
||||
EXPECT_EQ(2u, commandList->partitionCount);
|
||||
|
||||
LinearStream *cmdListStream = commandList->getCmdContainer().getCommandStream();
|
||||
|
||||
size_t beforeControlSectionOffset = sizeof(MI_STORE_DATA_IMM) +
|
||||
sizeof(PIPE_CONTROL) +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
|
||||
sizeof(MI_BATCH_BUFFER_START);
|
||||
|
||||
size_t bbStartOffset = beforeControlSectionOffset +
|
||||
(2 * sizeof(uint32_t));
|
||||
|
||||
size_t multiTileBarrierSize = bbStartOffset +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
|
||||
sizeof(MI_STORE_DATA_IMM) +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait();
|
||||
|
||||
size_t postSyncSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment(), false);
|
||||
|
||||
auto useSizeBefore = cmdListStream->getUsed();
|
||||
auto result = commandList->appendBarrier(eventHandle, 0, nullptr);
|
||||
auto useSizeAfter = cmdListStream->getUsed();
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
EXPECT_EQ(2u, event->getPacketsInUse());
|
||||
|
||||
size_t totaSizedBarrierWithNonTimestampEvent = multiTileBarrierSize + postSyncSize;
|
||||
|
||||
EXPECT_EQ(totaSizedBarrierWithNonTimestampEvent, (useSizeAfter - useSizeBefore));
|
||||
|
||||
auto gpuBaseAddress = cmdListStream->getGraphicsAllocation()->getGpuAddress() + useSizeBefore;
|
||||
|
||||
auto gpuCrossTileSyncAddress = gpuBaseAddress +
|
||||
beforeControlSectionOffset;
|
||||
|
||||
auto gpuFinalSyncAddress = gpuCrossTileSyncAddress +
|
||||
sizeof(uint32_t);
|
||||
|
||||
auto gpuStartAddress = gpuBaseAddress +
|
||||
bbStartOffset;
|
||||
|
||||
void *cmdBuffer = ptrOffset(cmdListStream->getCpuBase(), useSizeBefore);
|
||||
size_t parsedOffset = 0;
|
||||
|
||||
validateMultiTileBarrier<FamilyType>(cmdBuffer, parsedOffset, gpuFinalSyncAddress, gpuCrossTileSyncAddress, gpuStartAddress, true, true);
|
||||
EXPECT_EQ(multiTileBarrierSize, parsedOffset);
|
||||
|
||||
cmdBuffer = ptrOffset(cmdBuffer, parsedOffset);
|
||||
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
|
||||
cmdBuffer,
|
||||
postSyncSize));
|
||||
|
||||
auto itorPC = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
|
||||
ASSERT_NE(0u, itorPC.size());
|
||||
uint32_t postSyncFound = 0;
|
||||
for (auto it : itorPC) {
|
||||
auto cmd = genCmdCast<PIPE_CONTROL *>(*it);
|
||||
if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
|
||||
EXPECT_EQ(cmd->getImmediateData(), Event::STATE_SIGNALED);
|
||||
EXPECT_TRUE(cmd->getCommandStreamerStallEnable());
|
||||
EXPECT_EQ(eventGpuAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*cmd));
|
||||
EXPECT_TRUE(cmd->getWorkloadPartitionIdOffsetEnable());
|
||||
postSyncFound++;
|
||||
}
|
||||
}
|
||||
EXPECT_EQ(1u, postSyncFound);
|
||||
GivenNonTimestampEventSignalWhenAppendingMultTileBarrierThenExpectMultiTileBarrierAndPostSyncOperation, IsAtLeastXeHpCore) {
|
||||
testBodyNonTimestampEventSignal<FamilyType>();
|
||||
}
|
||||
|
||||
HWTEST2_F(MultiTileCommandListAppendBarrier,
|
||||
GivenTimestampEventSignalWhenAppendingMultTileBarrierThenExpectMultiTileBarrierAndTimestampOperations, IsWithinXeGfxFamily) {
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
|
||||
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
|
||||
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
|
||||
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
|
||||
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
|
||||
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
|
||||
using MI_MATH = typename FamilyType::MI_MATH;
|
||||
using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
|
||||
|
||||
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
|
||||
eventPoolDesc.count = 2;
|
||||
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.index = 0;
|
||||
eventDesc.wait = 0;
|
||||
eventDesc.signal = 0;
|
||||
|
||||
ze_result_t returnValue;
|
||||
auto eventPoolTimeStamp = std::unique_ptr<L0::EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
auto eventTimeStamp = std::unique_ptr<L0::Event>(Event::create<typename FamilyType::TimestampPacketType>(eventPoolTimeStamp.get(), &eventDesc, device));
|
||||
|
||||
uint64_t eventGpuAddress = eventTimeStamp->getGpuAddress(device);
|
||||
uint64_t contextStartAddress = eventGpuAddress + event->getContextStartOffset();
|
||||
uint64_t globalStartAddress = eventGpuAddress + event->getGlobalStartOffset();
|
||||
uint64_t contextEndAddress = eventGpuAddress + event->getContextEndOffset();
|
||||
uint64_t globalEndAddress = eventGpuAddress + event->getGlobalEndOffset();
|
||||
|
||||
ze_event_handle_t eventHandle = eventTimeStamp->toHandle();
|
||||
|
||||
EXPECT_EQ(2u, device->getNEODevice()->getDeviceBitfield().count());
|
||||
EXPECT_EQ(2u, commandList->partitionCount);
|
||||
|
||||
LinearStream *cmdListStream = commandList->getCmdContainer().getCommandStream();
|
||||
|
||||
size_t beforeControlSectionOffset = sizeof(MI_STORE_DATA_IMM) +
|
||||
sizeof(PIPE_CONTROL) +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
|
||||
sizeof(MI_BATCH_BUFFER_START);
|
||||
|
||||
size_t bbStartOffset = beforeControlSectionOffset +
|
||||
(2 * sizeof(uint32_t));
|
||||
|
||||
size_t multiTileBarrierSize = bbStartOffset +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
|
||||
sizeof(MI_STORE_DATA_IMM) +
|
||||
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait();
|
||||
|
||||
size_t timestampRegisters = 2 * (sizeof(MI_LOAD_REGISTER_REG) + sizeof(MI_LOAD_REGISTER_IMM) +
|
||||
NEO::EncodeMath<FamilyType>::streamCommandSize + sizeof(MI_STORE_REGISTER_MEM));
|
||||
|
||||
size_t postBarrierSynchronization = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForSingleBarrier(false) +
|
||||
NEO::MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(device->getNEODevice()->getRootDeviceEnvironment());
|
||||
size_t stopRegisters = timestampRegisters + postBarrierSynchronization;
|
||||
|
||||
auto useSizeBefore = cmdListStream->getUsed();
|
||||
auto result = commandList->appendBarrier(eventHandle, 0, nullptr);
|
||||
auto useSizeAfter = cmdListStream->getUsed();
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
EXPECT_EQ(2u, eventTimeStamp->getPacketsInUse());
|
||||
|
||||
size_t totaSizedBarrierWithTimestampEvent = multiTileBarrierSize + timestampRegisters + stopRegisters;
|
||||
EXPECT_EQ(totaSizedBarrierWithTimestampEvent, (useSizeAfter - useSizeBefore));
|
||||
|
||||
void *cmdBuffer = ptrOffset(cmdListStream->getCpuBase(), useSizeBefore);
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
|
||||
cmdBuffer,
|
||||
timestampRegisters));
|
||||
auto begin = cmdList.begin();
|
||||
validateTimestampRegisters<FamilyType>(cmdList,
|
||||
begin,
|
||||
REG_GLOBAL_TIMESTAMP_LDW, globalStartAddress,
|
||||
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddress,
|
||||
true);
|
||||
|
||||
auto gpuBaseAddress = cmdListStream->getGraphicsAllocation()->getGpuAddress() + useSizeBefore + timestampRegisters;
|
||||
|
||||
auto gpuCrossTileSyncAddress = gpuBaseAddress +
|
||||
beforeControlSectionOffset;
|
||||
|
||||
auto gpuFinalSyncAddress = gpuCrossTileSyncAddress +
|
||||
sizeof(uint32_t);
|
||||
|
||||
auto gpuStartAddress = gpuBaseAddress +
|
||||
bbStartOffset;
|
||||
|
||||
cmdBuffer = ptrOffset(cmdBuffer, timestampRegisters);
|
||||
size_t parsedOffset = 0;
|
||||
|
||||
validateMultiTileBarrier<FamilyType>(cmdBuffer, parsedOffset, gpuFinalSyncAddress, gpuCrossTileSyncAddress, gpuStartAddress, true, true);
|
||||
EXPECT_EQ(multiTileBarrierSize, parsedOffset);
|
||||
|
||||
cmdBuffer = ptrOffset(cmdBuffer, (parsedOffset + postBarrierSynchronization));
|
||||
cmdList.clear();
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
|
||||
cmdBuffer,
|
||||
timestampRegisters));
|
||||
begin = cmdList.begin();
|
||||
validateTimestampRegisters<FamilyType>(cmdList,
|
||||
begin,
|
||||
REG_GLOBAL_TIMESTAMP_LDW, globalEndAddress,
|
||||
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddress,
|
||||
true);
|
||||
GivenTimestampEventSignalWhenAppendingMultTileBarrierThenExpectMultiTileBarrierAndTimestampOperations, IsAtLeastXeHpCore) {
|
||||
testBodyTimestampEventSignal<FamilyType>();
|
||||
}
|
||||
|
||||
using MultiTileImmediateCommandListAppendBarrier = Test<MultiTileCommandListFixture<true, false, false>>;
|
||||
using MultiTilePrimaryBatchBufferCommandListAppendBarrier = Test<MultiTileCommandListAppendBarrierFixture<true>>;
|
||||
|
||||
HWTEST2_F(MultiTilePrimaryBatchBufferCommandListAppendBarrier,
|
||||
GivenNonTimestampEventSignalWhenAppendingMultTileBarrierThenExpectMultiTileBarrierAndPostSyncOperation, IsAtLeastXeHpCore) {
|
||||
testBodyNonTimestampEventSignal<FamilyType>();
|
||||
}
|
||||
|
||||
HWTEST2_F(MultiTilePrimaryBatchBufferCommandListAppendBarrier,
|
||||
GivenTimestampEventSignalWhenAppendingMultTileBarrierThenExpectMultiTileBarrierAndTimestampOperations, IsAtLeastXeHpCore) {
|
||||
testBodyTimestampEventSignal<FamilyType>();
|
||||
}
|
||||
|
||||
using MultiTileImmediateCommandListAppendBarrier = Test<MultiTileCommandListFixture<true, false, false, 0>>;
|
||||
|
||||
HWTEST2_F(MultiTileImmediateCommandListAppendBarrier,
|
||||
givenMultiTileImmediateCommandListWhenAppendingBarrierThenExpectCrossTileSyncAndNoCleanupSection, IsWithinXeGfxFamily) {
|
||||
givenMultiTileImmediateCommandListWhenAppendingBarrierThenExpectCrossTileSyncAndNoCleanupSection, IsAtLeastXeHpCore) {
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
|
||||
@@ -606,7 +650,7 @@ HWTEST2_F(MultiTileImmediateCommandListAppendBarrier,
|
||||
}
|
||||
|
||||
HWTEST2_F(MultiTileImmediateCommandListAppendBarrier,
|
||||
givenMultiTileImmediateCommandListNotUsingFlushTaskWhenAppendingBarrierThenExpectSecondaryBufferStart, IsWithinXeGfxFamily) {
|
||||
givenMultiTileImmediateCommandListNotUsingFlushTaskWhenAppendingBarrierThenExpectSecondaryBufferStart, IsAtLeastXeHpCore) {
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
|
||||
|
||||
|
||||
@@ -595,7 +595,7 @@ HWTEST_F(CommandListAppendWaitOnEvent, givenCommandBufferIsEmptyWhenAppendingWai
|
||||
EXPECT_EQ(1u, semaphoreWaitsFound);
|
||||
}
|
||||
|
||||
using MultTileCommandListAppendWaitOnEvent = Test<MultiTileCommandListFixture<false, false, false>>;
|
||||
using MultTileCommandListAppendWaitOnEvent = Test<MultiTileCommandListFixture<false, false, false, -1>>;
|
||||
HWTEST2_F(MultTileCommandListAppendWaitOnEvent,
|
||||
GivenMultiTileCmdListWhenPartitionedEventUsedToWaitThenExpectProperGpuAddressAndSemaphoreCount, IsAtLeastXeHpCore) {
|
||||
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
|
||||
|
||||
@@ -158,7 +158,7 @@ HWTEST2_F(CommandListTests, whenCommandListIsCreatedAndProgramExtendedPipeContro
|
||||
EXPECT_TRUE(cmdSba->getDisableSupportForMultiGpuAtomicsForStatelessAccesses());
|
||||
}
|
||||
|
||||
using MultiTileCommandListTests = Test<MultiTileCommandListFixture<false, false, false>>;
|
||||
using MultiTileCommandListTests = Test<MultiTileCommandListFixture<false, false, false, -1>>;
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, MultiTileCommandListTests, givenPartitionedCommandListWhenCommandListIsCreatedThenStateBaseAddressCmdWithMultiPartialAndAtomicsCorrectlyProgrammed) {
|
||||
using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user