From 3e1023fa1a249c84e56e084a1fdcc73046aacd6b Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Tue, 30 Nov 2021 14:41:26 +0000 Subject: [PATCH] Unify memory layout for all multi tile post sync operations Related-To: NEO-6262 Signed-off-by: Zbigniew Zdanowicz --- .../cmdlist/cmdlist_hw_xehp_and_later.inl | 4 ++-- level_zero/core/source/fence/fence.cpp | 8 +++---- ...cmdlist_append_multipartition_prologue.cpp | 2 +- .../sources/cmdqueue/test_cmdqueue.cpp | 21 ++++++++++++------- .../unit_tests/sources/fence/test_fence.cpp | 11 +++++----- .../command_queue/command_queue_hw_tests.cpp | 8 +++++-- ...ceiver_flush_task_tests_xehp_and_later.cpp | 2 +- .../hw_helper_tests_xehp_and_later.cpp | 2 +- .../helpers/timestamp_packet_1_tests.cpp | 11 ++++++++-- .../internal_allocation_storage_tests.cpp | 8 +++++-- .../command_container/implicit_scaling.h | 2 ++ .../implicit_scaling_xehp_and_later.inl | 6 ++++++ .../aub_command_stream_receiver_hw_base.inl | 4 ++-- .../command_stream_receiver.cpp | 8 +++---- .../command_stream/command_stream_receiver.h | 9 ++++++-- .../command_stream_receiver_hw.h | 1 + .../command_stream_receiver_hw_base.inl | 1 + ...mmand_stream_receiver_hw_bdw_and_later.inl | 4 ++++ ...mand_stream_receiver_hw_xehp_and_later.inl | 7 ++++++- .../tbx_command_stream_receiver_hw.inl | 4 ++-- .../direct_submission_bdw_and_later.inl | 4 ++++ .../direct_submission/direct_submission_hw.h | 2 ++ .../direct_submission_hw.inl | 1 + ...direct_submission_xe_hp_core_and_later.inl | 7 ++++++- .../linux/drm_direct_submission.inl | 2 +- shared/source/helpers/constants.h | 2 -- shared/source/helpers/hw_helper.h | 1 + shared/source/helpers/hw_helper_base.inl | 5 +++++ .../memory_manager/allocations_list.cpp | 4 +++- .../drm_command_stream_xehp_and_later.inl | 4 ++-- .../libult/ult_command_stream_receiver.h | 1 + .../linux/mock_drm_command_stream_receiver.h | 1 + .../mocks/mock_command_stream_receiver.h | 4 +++- .../common/mocks/mock_direct_submission_hw.h | 1 + .../command_stream_receiver_tests.cpp | 19 ++++++++++------- .../direct_submission_tests_2.cpp | 12 +++++++---- .../linux/drm_direct_submission_tests.cpp | 7 +++++-- .../deferrable_allocation_deletion_tests.cpp | 8 ++++--- 38 files changed, 145 insertions(+), 63 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index f3889b8bd1..511aba9843 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -125,7 +125,7 @@ void programEventL3Flush(ze_event_handle_t hEvent, if (partitionCount > 1) { NEO::ImplicitScalingDispatch::dispatchOffsetRegister(cmdListStream, - CommonConstants::partitionAddressOffset); + NEO::ImplicitScalingDispatch::getPostSyncOffset()); } } @@ -321,7 +321,7 @@ void CommandListCoreFamily::appendMultiPartitionEpilogue() { const size_t estimatedSizeRequired = NEO::ImplicitScalingDispatch::getOffsetRegisterSize(); increaseCommandStreamSpace(estimatedSizeRequired); NEO::ImplicitScalingDispatch::dispatchOffsetRegister(*commandContainer.getCommandStream(), - CommonConstants::partitionAddressOffset); + NEO::ImplicitScalingDispatch::getPostSyncOffset()); } template diff --git a/level_zero/core/source/fence/fence.cpp b/level_zero/core/source/fence/fence.cpp index 75a06aa9a2..16672677bf 100644 --- a/level_zero/core/source/fence/fence.cpp +++ b/level_zero/core/source/fence/fence.cpp @@ -31,9 +31,7 @@ FenceImp::~FenceImp() { ze_result_t FenceImp::queryStatus() { auto csr = cmdQueue->getCsr(); - if (csr) { - csr->downloadAllocations(); - } + csr->downloadAllocations(); volatile uint32_t *hostAddr = static_cast(allocation->getUnderlyingBuffer()); uint32_t queryVal = Fence::STATE_CLEARED; @@ -42,7 +40,7 @@ ze_result_t FenceImp::queryStatus() { if (queryVal == Fence::STATE_CLEARED) { break; } - hostAddr = ptrOffset(hostAddr, CommonConstants::partitionAddressOffset); + hostAddr = ptrOffset(hostAddr, csr->getPostSyncWriteOffset()); } return queryVal == Fence::STATE_CLEARED ? ZE_RESULT_NOT_READY : ZE_RESULT_SUCCESS; } @@ -63,7 +61,7 @@ ze_result_t FenceImp::reset() { for (uint32_t i = 0; i < maxPartitionCount; i++) { *hostAddress = Fence::STATE_CLEARED; NEO::CpuIntrinsics::clFlush(const_cast(hostAddress)); - hostAddress = ptrOffset(hostAddress, CommonConstants::partitionAddressOffset); + hostAddress = ptrOffset(hostAddress, cmdQueue->getCsr()->getPostSyncWriteOffset()); } partitionCount = 1; return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_multipartition_prologue.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_multipartition_prologue.cpp index f4a90e4d34..6572b9c325 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_multipartition_prologue.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_multipartition_prologue.cpp @@ -99,7 +99,7 @@ HWTEST2_F(MultiPartitionEpilogueTest, whenAppendMultiPartitionEpilogueIsCalledTh auto lriCmd = genCmdCast(*itorLri); EXPECT_EQ(NEO::PartitionRegisters::addressOffsetCCSOffset, static_cast(lriCmd->getRegisterOffset())); - EXPECT_EQ(CommonConstants::partitionAddressOffset, static_cast(lriCmd->getDataDword())); + EXPECT_EQ(NEO::ImplicitScalingDispatch::getPostSyncOffset(), static_cast(lriCmd->getDataDword())); EXPECT_EQ(true, lriCmd->getMmioRemapEnable()); auto result = commandList->close(); diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp index bd11b11a48..61dfacc3b4 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp @@ -1613,7 +1613,7 @@ struct SynchronizeCsr : public NEO::UltCommandStreamReceiver { NEO::UltCommandStreamReceiver::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, quickKmdSleep, forcePowerSavingMode); } - static constexpr size_t tagSize = 64; + static constexpr size_t tagSize = 128; static volatile uint32_t tagAddressData[tagSize]; uint32_t waitForComplitionCalledTimes = 0; uint32_t waitForTaskCountWithKmdNotifyFallbackCalled = 0; @@ -1697,16 +1697,21 @@ HWTEST_F(CommandQueueSynchronizeTest, givenDebugOverrideEnabledWhenCallToSynchro L0::CommandQueue::fromHandle(commandQueue)->destroy(); } -HWTEST2_F(MultiTileCommandQueueSynchronizeTest, givenMultiplePartitionCountWhenCallingSynchronizeThenExpectTheSameNumberCsrSynchronizeCalls, IsWithinXeGfxFamily) { +HWTEST2_F(MultiTileCommandQueueSynchronizeTest, givenMultiplePartitionCountWhenCallingSynchronizeThenExpectTheSameNumberCsrSynchronizeCalls, IsAtLeastXeHpCore) { const ze_command_queue_desc_t desc{}; ze_result_t returnValue; auto csr = reinterpret_cast *>(neoDevice->getDefaultEngine().commandStreamReceiver); + if (device->getNEODevice()->getPreemptionMode() == PreemptionMode::MidThread || device->getNEODevice()->isDebuggerActive()) { + csr->createPreemptionAllocation(); + } + EXPECT_NE(0u, csr->getPostSyncWriteOffset()); volatile uint32_t *tagAddress = csr->getTagAddress(); for (uint32_t i = 0; i < 2; i++) { *tagAddress = 0xFF; - tagAddress = ptrOffset(tagAddress, 8); + tagAddress = ptrOffset(tagAddress, csr->getPostSyncWriteOffset()); } + csr->activePartitions = 2u; auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, device, neoDevice->getDefaultEngine().commandStreamReceiver, @@ -1729,20 +1734,22 @@ HWTEST2_F(MultiTileCommandQueueSynchronizeTest, givenMultiplePartitionCountWhenC uint64_t timeout = std::numeric_limits::max(); commandQueue->synchronize(timeout); - EXPECT_EQ(2u, csr->activePartitions); - L0::CommandQueue::fromHandle(commandQueue)->destroy(); } -HWTEST2_F(MultiTileCommandQueueSynchronizeTest, givenCsrHasMultipleActivePartitionWhenExecutingCmdListOnNewCmdQueueThenExpectCmdPartitionCountMatchCsrActivePartitions, IsWithinXeGfxFamily) { +HWTEST2_F(MultiTileCommandQueueSynchronizeTest, givenCsrHasMultipleActivePartitionWhenExecutingCmdListOnNewCmdQueueThenExpectCmdPartitionCountMatchCsrActivePartitions, IsAtLeastXeHpCore) { const ze_command_queue_desc_t desc{}; ze_result_t returnValue; auto csr = reinterpret_cast *>(neoDevice->getDefaultEngine().commandStreamReceiver); + if (device->getNEODevice()->getPreemptionMode() == PreemptionMode::MidThread || device->getNEODevice()->isDebuggerActive()) { + csr->createPreemptionAllocation(); + } + EXPECT_NE(0u, csr->getPostSyncWriteOffset()); volatile uint32_t *tagAddress = csr->getTagAddress(); for (uint32_t i = 0; i < 2; i++) { *tagAddress = 0xFF; - tagAddress = ptrOffset(tagAddress, 8); + tagAddress = ptrOffset(tagAddress, csr->getPostSyncWriteOffset()); } csr->activePartitions = 2u; auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, diff --git a/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp b/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp index 3bceca19ca..564a136670 100644 --- a/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp +++ b/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp @@ -43,7 +43,8 @@ TEST_F(FenceTest, whenQueryingStatusThenCsrAllocationsAreDownloaded) { } TEST_F(FenceTest, whenQueryingStatusWithoutCsrAndFenceUnsignaledThenReturnsNotReady) { - Mock cmdQueue(device, nullptr); + auto csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + Mock cmdQueue(device, csr.get()); auto fence = Fence::create(&cmdQueue, nullptr); EXPECT_NE(nullptr, fence); @@ -126,7 +127,7 @@ TEST_F(FenceSynchronizeTest, givenCallToFenceHostSynchronizeWithTimeoutNonZeroAn TEST_F(FenceSynchronizeTest, givenMultiplePartitionsWhenFenceIsResetThenAllPartitionFenceStatesAreReset) { std::unique_ptr csr = nullptr; csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); - + csr->postSyncWriteOffset = 16; Mock cmdQueue(device, csr.get()); auto fence = whitebox_cast(Fence::create(&cmdQueue, nullptr)); @@ -136,13 +137,13 @@ TEST_F(FenceSynchronizeTest, givenMultiplePartitionsWhenFenceIsResetThenAllParti for (uint32_t i = 0; i < 16; i++) { EXPECT_EQ(Fence::STATE_CLEARED, *hostAddr); - hostAddr = ptrOffset(hostAddr, 8); + hostAddr = ptrOffset(hostAddr, 16); } hostAddr = static_cast(alloc->getUnderlyingBuffer()); fence->partitionCount = 2; *hostAddr = Fence::STATE_SIGNALED; - hostAddr = ptrOffset(hostAddr, 8); + hostAddr = ptrOffset(hostAddr, 16); *hostAddr = Fence::STATE_SIGNALED; ze_result_t result = fence->reset(); @@ -151,7 +152,7 @@ TEST_F(FenceSynchronizeTest, givenMultiplePartitionsWhenFenceIsResetThenAllParti hostAddr = static_cast(alloc->getUnderlyingBuffer()); for (uint32_t i = 0; i < 16; i++) { EXPECT_EQ(Fence::STATE_CLEARED, *hostAddr); - hostAddr = ptrOffset(hostAddr, 8); + hostAddr = ptrOffset(hostAddr, 16); } EXPECT_EQ(1u, fence->partitionCount); diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp index b31f7ed849..b30e6ce780 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp @@ -7,6 +7,7 @@ #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/unit_test_helper.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_allocation_properties.h" #include "shared/test/common/mocks/mock_builtins.h" #include "shared/test/common/mocks/mock_csr.h" @@ -862,8 +863,11 @@ HWTEST_F(CommandQueueHwTest, GivenMultiTileQueueWhenEventNotCompletedAndFinishIs auto &csr = this->pCmdQ->getGpgpuCommandStreamReceiver(); csr.setActivePartitions(2u); + auto ultCsr = reinterpret_cast *>(&csr); + ultCsr->postSyncWriteOffset = 32; + auto tagAddress = csr.getTagAddress(); - *ptrOffset(tagAddress, 8) = *tagAddress; + *ptrOffset(tagAddress, 32) = *tagAddress; struct ClbFuncTempStruct { static void CL_CALLBACK ClbFuncT(cl_event e, cl_int execStatus, void *valueForUpdate) { @@ -877,7 +881,7 @@ HWTEST_F(CommandQueueHwTest, GivenMultiTileQueueWhenEventNotCompletedAndFinishIs EXPECT_GT(3u, csr.peekTaskCount()); *tagAddress = CompletionStamp::notReady + 1; - tagAddress = ptrOffset(tagAddress, 8); + tagAddress = ptrOffset(tagAddress, 32); *tagAddress = CompletionStamp::notReady + 1; cl_int ret = clFinish(this->pCmdQ); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp index f66079af0f..012df86e58 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp @@ -843,7 +843,7 @@ struct CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests : public Command expectedWparidData = commandStreamReceiver.getWorkPartitionAllocationGpuAddress(); } uint32_t expectedWparidRegister = 0x221C; - uint32_t expectedAddressOffsetData = 8; + uint32_t expectedAddressOffsetData = commandStreamReceiver.getPostSyncWriteOffset(); uint32_t expectedAddressOffsetRegister = 0x23B4; bool wparidConfiguration = false; diff --git a/opencl/test/unit_test/helpers/hw_helper_tests_xehp_and_later.cpp b/opencl/test/unit_test/helpers/hw_helper_tests_xehp_and_later.cpp index 5cdb49a7a0..9fd6c8ec39 100644 --- a/opencl/test/unit_test/helpers/hw_helper_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/helpers/hw_helper_tests_xehp_and_later.cpp @@ -435,7 +435,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, PipeControlHelperTestsXeHPAndLater, givenPostSyncPi for (size_t i = 0; i < pipeControls.size(); i++) { auto pipeControl = reinterpret_cast(*pipeControls[i]); if (pipeControl->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { - EXPECT_EQ(static_cast(gpuAddress), pipeControl->getAddress()); + EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl)); EXPECT_EQ(data, pipeControl->getImmediateData()); EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable()); foundPostSyncPipeControl = true; diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index cae64d037b..cd6e01d7f7 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -197,15 +197,22 @@ HWTEST_F(TimestampPacketTests, givenDebugFlagSetWhenCreatingAllocatorThenUseCorr } { - DebugManager.flags.OverrideTimestampPacketSize.set(12); - + DebugManager.flags.OverrideTimestampPacketSize.set(-1); CommandStreamReceiverHw csr(*executionEnvironment, 0, osContext.getDeviceBitfield()); csr.setupContext(osContext); + DebugManager.flags.OverrideTimestampPacketSize.set(12); EXPECT_ANY_THROW(csr.getTimestampPacketAllocator()); } } +HWCMDTEST_F(IGFX_XE_HP_CORE, TimestampPacketTests, givenInvalidDebugFlagSetWhenCreatingCsrThenExceptionIsThrown) { + OsContext &osContext = *executionEnvironment->memoryManager->getRegisteredEngines()[0].osContext; + DebugManager.flags.OverrideTimestampPacketSize.set(12); + + EXPECT_ANY_THROW(CommandStreamReceiverHw csr(*executionEnvironment, 0, osContext.getDeviceBitfield())); +} + HWTEST_F(TimestampPacketTests, givenTagAlignmentWhenCreatingAllocatorThenGpuAddressIsAligned) { auto csr = executionEnvironment->memoryManager->getRegisteredEngines()[0].commandStreamReceiver; diff --git a/opencl/test/unit_test/memory_manager/internal_allocation_storage_tests.cpp b/opencl/test/unit_test/memory_manager/internal_allocation_storage_tests.cpp index 07b93c7616..aaf6033506 100644 --- a/opencl/test/unit_test/memory_manager/internal_allocation_storage_tests.cpp +++ b/opencl/test/unit_test/memory_manager/internal_allocation_storage_tests.cpp @@ -8,6 +8,7 @@ #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/source/os_interface/os_context.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_allocation_properties.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/unit_test/utilities/containers_tests_helpers.h" @@ -271,11 +272,14 @@ TEST_F(InternalAllocationStorageTest, givenAllocationListWhenTwoThreadsCleanConc EXPECT_TRUE(csr->getTemporaryAllocations().peekIsEmpty()); } -TEST_F(InternalAllocationStorageTest, givenMultipleActivePartitionsWhenDetachingReusableAllocationThenCheckTaskCountFinishedOnAllTiles) { +HWTEST_F(InternalAllocationStorageTest, givenMultipleActivePartitionsWhenDetachingReusableAllocationThenCheckTaskCountFinishedOnAllTiles) { + auto ultCsr = reinterpret_cast *>(csr); csr->setActivePartitions(2u); + ultCsr->postSyncWriteOffset = 32; + auto tagAddress = csr->getTagAddress(); *tagAddress = 0xFF; - tagAddress = ptrOffset(tagAddress, 8); + tagAddress = ptrOffset(tagAddress, 32); *tagAddress = 0x0; auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); diff --git a/shared/source/command_container/implicit_scaling.h b/shared/source/command_container/implicit_scaling.h index 23355a4b2e..c11c0fe54e 100644 --- a/shared/source/command_container/implicit_scaling.h +++ b/shared/source/command_container/implicit_scaling.h @@ -76,6 +76,8 @@ struct ImplicitScalingDispatch { static void dispatchOffsetRegister(LinearStream &commandStream, uint32_t addressOffset); + static uint32_t getPostSyncOffset(); + private: static bool pipeControlStallRequired; }; diff --git a/shared/source/command_container/implicit_scaling_xehp_and_later.inl b/shared/source/command_container/implicit_scaling_xehp_and_later.inl index 8ba4857c42..e5b7fab833 100644 --- a/shared/source/command_container/implicit_scaling_xehp_and_later.inl +++ b/shared/source/command_container/implicit_scaling_xehp_and_later.inl @@ -9,6 +9,7 @@ #include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_container/walker_partition_xehp_and_later.h" #include "shared/source/command_stream/linear_stream.h" +#include "shared/source/helpers/hw_helper.h" namespace NEO { @@ -203,4 +204,9 @@ inline void ImplicitScalingDispatch::dispatchOffsetRegister(LinearStr true); } +template +inline uint32_t ImplicitScalingDispatch::getPostSyncOffset() { + return static_cast(HwHelperHw::getSingleTimestampPacketSizeHw()); +} + } // namespace NEO diff --git a/shared/source/command_stream/aub_command_stream_receiver_hw_base.inl b/shared/source/command_stream/aub_command_stream_receiver_hw_base.inl index 2cc0d50dd9..0428bd374e 100644 --- a/shared/source/command_stream/aub_command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/aub_command_stream_receiver_hw_base.inl @@ -307,7 +307,7 @@ bool AUBCommandStreamReceiverHw::flush(BatchBuffer &batchBuffer, Resi volatile uint32_t *pollAddress = this->tagAddress; for (uint32_t i = 0; i < this->activePartitions; i++) { *pollAddress = this->peekLatestSentTaskCount(); - pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset); + pollAddress = ptrOffset(pollAddress, this->postSyncWriteOffset); } } return true; @@ -348,7 +348,7 @@ bool AUBCommandStreamReceiverHw::flush(BatchBuffer &batchBuffer, Resi volatile uint32_t *pollAddress = this->tagAddress; for (uint32_t i = 0; i < this->activePartitions; i++) { *pollAddress = this->peekLatestSentTaskCount(); - pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset); + pollAddress = ptrOffset(pollAddress, this->postSyncWriteOffset); } } diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 289f75131e..7b57b3ced2 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -313,7 +313,7 @@ bool CommandStreamReceiver::baseWaitFunction(volatile uint32_t *pollAddress, boo } } - partitionAddress = ptrOffset(partitionAddress, CommonConstants::partitionAddressOffset); + partitionAddress = ptrOffset(partitionAddress, this->postSyncWriteOffset); } return testTaskCountReady(pollAddress, taskCountToWait); @@ -560,7 +560,7 @@ bool CommandStreamReceiver::initializeTagAllocation() { uint32_t subDevices = static_cast(this->deviceBitfield.count()); for (uint32_t i = 0; i < subDevices; i++) { *tagAddress = initValue; - tagAddress = ptrOffset(tagAddress, CommonConstants::partitionAddressOffset); + tagAddress = ptrOffset(tagAddress, this->postSyncWriteOffset); } *this->debugPauseStateAddress = DebugManager.flags.EnableNullHardware.get() ? DebugPauseState::disabled : DebugPauseState::waitingForFirstSemaphore; @@ -675,7 +675,7 @@ void CommandStreamReceiver::updateTagFromCpu(uint32_t taskCount) { auto partitionAddress = getTagAddress(); for (uint32_t i = 0; i < activePartitions; i++) { *partitionAddress = taskCount; - partitionAddress = ptrOffset(partitionAddress, CommonConstants::partitionAddressOffset); + partitionAddress = ptrOffset(partitionAddress, this->postSyncWriteOffset); } } @@ -759,7 +759,7 @@ bool CommandStreamReceiver::testTaskCountReady(volatile uint32_t *pollAddress, u return false; } - pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset); + pollAddress = ptrOffset(pollAddress, this->postSyncWriteOffset); } return true; } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 90ca884fab..6aba3f7151 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -298,6 +298,10 @@ class CommandStreamReceiver { std::unique_ptr pageTableManager; + inline uint32_t getPostSyncWriteOffset() const { + return postSyncWriteOffset; + } + protected: void cleanupResources(); void printDeviceIndex(); @@ -326,8 +330,8 @@ class CommandStreamReceiver { LinearStream commandStream; StreamProperties streamProperties{}; - // offset for debug state must be 64 bytes, tag writes can use multiple dwords for multiple partitions - const uint64_t debugPauseStateAddressOffset = MemoryConstants::cacheLineSize; + // offset for debug state is 1kbyte, tag writes can use multiple offsets for multiple partitions and each offset can vary per platform + const uint64_t debugPauseStateAddressOffset = MemoryConstants::kiloByte; uint64_t totalMemoryUsed = 0u; volatile uint32_t *tagAddress = nullptr; @@ -374,6 +378,7 @@ class CommandStreamReceiver { MemoryCompressionState lastMemoryCompressionState = MemoryCompressionState::NotApplicable; uint32_t activePartitions = 1; uint32_t activePartitionsConfig = 1; + uint32_t postSyncWriteOffset = 0; const uint32_t rootDeviceIndex; const DeviceBitfield deviceBitfield; diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index c2da359902..eeef4ac23d 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -172,6 +172,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { bool detectInitProgrammingFlagsRequired(const DispatchFlags &dispatchFlags) const; bool checkPlatformSupportsNewResourceImplicitFlush() const; bool checkPlatformSupportsGpuIdleImplicitFlush() const; + void configurePostSyncWriteOffset(); HeapDirtyState dshState; HeapDirtyState iohState; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index f77f46a35d..93356336e6 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -71,6 +71,7 @@ CommandStreamReceiverHw::CommandStreamReceiverHw(ExecutionEnvironment timestampPacketWriteEnabled = !!DebugManager.flags.EnableTimestampPacket.get(); } createScratchSpaceController(); + configurePostSyncWriteOffset(); } template diff --git a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl index 69c4f30a22..65e7abc29a 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl @@ -165,4 +165,8 @@ inline void CommandStreamReceiverHw::programStallingNoPostSyncCommand MemorySynchronizationCommands::addPipeControl(cmdStream, args); } +template +inline void CommandStreamReceiverHw::configurePostSyncWriteOffset() { +} + } // namespace NEO diff --git a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl index f0eb060106..554db51e43 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl @@ -155,7 +155,7 @@ template inline void CommandStreamReceiverHw::programActivePartitionConfig(LinearStream &csr) { if (this->staticWorkPartitioningEnabled) { uint64_t workPartitionAddress = getWorkPartitionAllocationGpuAddress(); - ImplicitScalingDispatch::dispatchRegisterConfiguration(csr, workPartitionAddress, CommonConstants::partitionAddressOffset); + ImplicitScalingDispatch::dispatchRegisterConfiguration(csr, workPartitionAddress, this->postSyncWriteOffset); } this->activePartitionsConfig = this->activePartitions; } @@ -223,4 +223,9 @@ inline void CommandStreamReceiverHw::programStallingNoPostSyncCommand } } +template +inline void CommandStreamReceiverHw::configurePostSyncWriteOffset() { + this->postSyncWriteOffset = ImplicitScalingDispatch::getPostSyncOffset(); +} + } // namespace NEO diff --git a/shared/source/command_stream/tbx_command_stream_receiver_hw.inl b/shared/source/command_stream/tbx_command_stream_receiver_hw.inl index 15cd546a6c..fdad957a92 100644 --- a/shared/source/command_stream/tbx_command_stream_receiver_hw.inl +++ b/shared/source/command_stream/tbx_command_stream_receiver_hw.inl @@ -482,7 +482,7 @@ void TbxCommandStreamReceiverHw::flushSubmissionsAndDownloadAllocatio while (*pollAddress < this->latestFlushedTaskCount) { downloadAllocation(*this->getTagAllocation()); } - pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset); + pollAddress = ptrOffset(pollAddress, this->postSyncWriteOffset); } for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) { @@ -553,7 +553,7 @@ void TbxCommandStreamReceiverHw::downloadAllocations() { while (*pollAddress < this->latestFlushedTaskCount) { downloadAllocation(*this->getTagAllocation()); } - pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset); + pollAddress = ptrOffset(pollAddress, this->postSyncWriteOffset); } for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) { downloadAllocation(*graphicsAllocation); diff --git a/shared/source/direct_submission/direct_submission_bdw_and_later.inl b/shared/source/direct_submission/direct_submission_bdw_and_later.inl index 6885dd9239..28100ac00d 100644 --- a/shared/source/direct_submission/direct_submission_bdw_and_later.inl +++ b/shared/source/direct_submission/direct_submission_bdw_and_later.inl @@ -18,4 +18,8 @@ inline size_t DirectSubmissionHw::getSizePartitionRegiste return 0; } +template +inline void DirectSubmissionHw::setPostSyncOffset() { +} + } // namespace NEO diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h index efa949dfb6..f0b36b9ab4 100644 --- a/shared/source/direct_submission/direct_submission_hw.h +++ b/shared/source/direct_submission/direct_submission_hw.h @@ -121,6 +121,7 @@ class DirectSubmissionHw { MOCKABLE_VIRTUAL void performDiagnosticMode(); void dispatchDiagnosticModeSection(); size_t getDiagnosticModeSection(); + void setPostSyncOffset(); enum RingBufferUse : uint32_t { FirstBuffer, @@ -151,6 +152,7 @@ class DirectSubmissionHw { uint32_t workloadMode = 0; uint32_t workloadModeOneExpectedValue = 0u; uint32_t activeTiles = 1u; + uint32_t postSyncOffset = 0u; bool ringStart = false; bool disableCpuCacheFlush = true; diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index b961b7703c..38fe3463c5 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -48,6 +48,7 @@ DirectSubmissionHw::DirectSubmissionHw(Device &device, hwInfo = &device.getHardwareInfo(); createDiagnostic(); + setPostSyncOffset(); } template diff --git a/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl b/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl index 5d176c9c08..be74f94185 100644 --- a/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl +++ b/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl @@ -14,7 +14,7 @@ template inline void DirectSubmissionHw::dispatchPartitionRegisterConfiguration() { ImplicitScalingDispatch::dispatchRegisterConfiguration(ringCommandStream, this->workPartitionAllocation->getGpuAddress(), - CommonConstants::partitionAddressOffset); + this->postSyncOffset); } template @@ -22,4 +22,9 @@ inline size_t DirectSubmissionHw::getSizePartitionRegiste return ImplicitScalingDispatch::getRegisterConfigurationSize(); } +template +inline void DirectSubmissionHw::setPostSyncOffset() { + this->postSyncOffset = ImplicitScalingDispatch::getPostSyncOffset(); +} + } // namespace NEO diff --git a/shared/source/direct_submission/linux/drm_direct_submission.inl b/shared/source/direct_submission/linux/drm_direct_submission.inl index 61a09bf144..9d6635ee71 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.inl +++ b/shared/source/direct_submission/linux/drm_direct_submission.inl @@ -182,7 +182,7 @@ void DrmDirectSubmission::wait(uint32_t taskCountToWait) for (uint32_t i = 0; i < this->activeTiles; i++) { while (!WaitUtils::waitFunction(pollAddress, taskCountToWait)) { } - pollAddress = ptrOffset(pollAddress, CommonConstants::partitionAddressOffset); + pollAddress = ptrOffset(pollAddress, this->postSyncOffset); } } diff --git a/shared/source/helpers/constants.h b/shared/source/helpers/constants.h index cd7303a8a7..9a20d65f00 100644 --- a/shared/source/helpers/constants.h +++ b/shared/source/helpers/constants.h @@ -90,6 +90,4 @@ constexpr uint32_t invalidStepping = std::numeric_limits::max(); constexpr uint32_t maximalSimdSize = 32; constexpr uint32_t maximalSizeOfAtomicType = 8; constexpr uint32_t engineGroupCount = static_cast(NEO::EngineGroupType::MaxEngineGroups); -constexpr uint32_t partitionAddressOffsetDwords = 2u; -constexpr uint32_t partitionAddressOffset = sizeof(uint32_t) * partitionAddressOffsetDwords; } // namespace CommonConstants diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index 37c198a235..9367cd0863 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -355,6 +355,7 @@ class HwHelperHw : public HwHelper { size_t getTimestampPacketAllocatorAlignment() const override; size_t getSingleTimestampPacketSize() const override; + static size_t getSingleTimestampPacketSizeHw(); void applyAdditionalCompressionSettings(Gmm &gmm, bool isNotCompressed) const override; diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index adb66ada76..9994c8eb01 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -518,6 +518,11 @@ size_t HwHelperHw::getTimestampPacketAllocatorAlignment() const { template size_t HwHelperHw::getSingleTimestampPacketSize() const { + return HwHelperHw::getSingleTimestampPacketSizeHw(); +} + +template +size_t HwHelperHw::getSingleTimestampPacketSizeHw() { if (DebugManager.flags.OverrideTimestampPacketSize.get() != -1) { if (DebugManager.flags.OverrideTimestampPacketSize.get() == 4) { return TimestampPackets::getSinglePacketSize(); diff --git a/shared/source/memory_manager/allocations_list.cpp b/shared/source/memory_manager/allocations_list.cpp index 83177d1518..f2dc5de1aa 100644 --- a/shared/source/memory_manager/allocations_list.cpp +++ b/shared/source/memory_manager/allocations_list.cpp @@ -18,6 +18,7 @@ struct ReusableAllocationRequirements { GraphicsAllocation::AllocationType allocationType; uint32_t contextId; uint32_t activeTileCount; + uint32_t tagOffset; }; AllocationsList::AllocationsList(AllocationUsage allocationUsage) @@ -34,6 +35,7 @@ std::unique_ptr AllocationsList::detachAllocation(size_t req req.contextId = (commandStreamReceiver == nullptr) ? UINT32_MAX : commandStreamReceiver->getOsContext().getContextId(); req.requiredPtr = requiredPtr; req.activeTileCount = (commandStreamReceiver == nullptr) ? 1u : commandStreamReceiver->getActivePartitions(); + req.tagOffset = (commandStreamReceiver == nullptr) ? 0u : commandStreamReceiver->getPostSyncWriteOffset(); GraphicsAllocation *a = nullptr; GraphicsAllocation *retAlloc = processLocked(a, static_cast(&req)); return std::unique_ptr(retAlloc); @@ -79,7 +81,7 @@ bool AllocationsList::checkTagAddressReady(ReusableAllocationRequirements *requi if (*tagAddress < taskCount) { return false; } - tagAddress = ptrOffset(tagAddress, CommonConstants::partitionAddressOffset); + tagAddress = ptrOffset(tagAddress, requirements->tagOffset); } return true; diff --git a/shared/source/os_interface/linux/drm_command_stream_xehp_and_later.inl b/shared/source/os_interface/linux/drm_command_stream_xehp_and_later.inl index 9e0771b270..1f54edc677 100644 --- a/shared/source/os_interface/linux/drm_command_stream_xehp_and_later.inl +++ b/shared/source/os_interface/linux/drm_command_stream_xehp_and_later.inl @@ -64,12 +64,12 @@ int DrmCommandStreamReceiver::waitUserFence(uint32_t waitValue) { UNRECOVERABLE_IF(ctxIds.size() != this->activePartitions); for (uint32_t i = 0; i < this->activePartitions; i++) { ret |= this->drm->waitUserFence(ctxIds[i], tagAddress, waitValue, Drm::ValueWidth::U32, kmdWaitTimeout, 0u); - tagAddress += CommonConstants::partitionAddressOffset; + tagAddress += this->postSyncWriteOffset; } } else { for (uint32_t i = 0; i < this->activePartitions; i++) { ret |= this->drm->waitUserFence(0u, tagAddress, waitValue, Drm::ValueWidth::U32, kmdWaitTimeout, 0u); - tagAddress += CommonConstants::partitionAddressOffset; + tagAddress += this->postSyncWriteOffset; } } diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index bb116fa0e8..7cf88eed94 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -99,6 +99,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::CommandStreamReceiver::newResources; using BaseClass::CommandStreamReceiver::osContext; using BaseClass::CommandStreamReceiver::perfCounterAllocator; + using BaseClass::CommandStreamReceiver::postSyncWriteOffset; using BaseClass::CommandStreamReceiver::profilingTimeStampAllocator; using BaseClass::CommandStreamReceiver::requiredPrivateScratchSize; using BaseClass::CommandStreamReceiver::requiredScratchSize; diff --git a/shared/test/common/mocks/linux/mock_drm_command_stream_receiver.h b/shared/test/common/mocks/linux/mock_drm_command_stream_receiver.h index fd864b2f3c..4cce3b696e 100644 --- a/shared/test/common/mocks/linux/mock_drm_command_stream_receiver.h +++ b/shared/test/common/mocks/linux/mock_drm_command_stream_receiver.h @@ -23,6 +23,7 @@ class TestedDrmCommandStreamReceiver : public DrmCommandStreamReceiver instructionHeapReserveredData; int *flushBatchedSubmissionsCallCounter = nullptr; @@ -170,6 +171,7 @@ class MockCsrHw2 : public CommandStreamReceiverHw { using CommandStreamReceiver::mediaVfeStateDirty; using CommandStreamReceiver::nTo1SubmissionModelEnabled; using CommandStreamReceiver::pageTableManagerInitialized; + using CommandStreamReceiver::postSyncWriteOffset; using CommandStreamReceiver::requiredScratchSize; using CommandStreamReceiver::requiredThreadArbitrationPolicy; using CommandStreamReceiver::tagAddress; diff --git a/shared/test/common/mocks/mock_direct_submission_hw.h b/shared/test/common/mocks/mock_direct_submission_hw.h index 0a286e2c50..5078af6f1c 100644 --- a/shared/test/common/mocks/mock_direct_submission_hw.h +++ b/shared/test/common/mocks/mock_direct_submission_hw.h @@ -50,6 +50,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw using BaseClass::partitionConfigSet; using BaseClass::partitionedMode; using BaseClass::performDiagnosticMode; + using BaseClass::postSyncOffset; using BaseClass::ringBuffer; using BaseClass::ringBuffer2; using BaseClass::ringCommandStream; diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 6d20ad2f50..427c157147 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -827,6 +827,7 @@ TEST(CommandStreamReceiverSimpleTest, givenCommandStreamReceiverWhenInitializeTa MockExecutionEnvironment executionEnvironment(defaultHwInfo.get()); DeviceBitfield devices(0b11); auto csr = std::make_unique(executionEnvironment, 0, devices); + csr->postSyncWriteOffset = 32u; executionEnvironment.memoryManager.reset(new OsAgnosticMemoryManager(executionEnvironment)); EXPECT_EQ(nullptr, csr->getTagAllocation()); csr->initializeTagAllocation(); @@ -836,7 +837,7 @@ TEST(CommandStreamReceiverSimpleTest, givenCommandStreamReceiverWhenInitializeTa auto tagAddress = csr->getTagAddress(); for (uint32_t i = 0; i < 2; i++) { EXPECT_EQ(*tagAddress, initialHardwareTag); - tagAddress = ptrOffset(tagAddress, 8); + tagAddress = ptrOffset(tagAddress, csr->getPostSyncWriteOffset()); } } @@ -844,19 +845,20 @@ TEST(CommandStreamReceiverSimpleTest, givenCommandStreamReceiverWhenInitializeTa MockExecutionEnvironment executionEnvironment(defaultHwInfo.get(), true, 10u); DeviceBitfield devices(0b1111); auto csr = std::make_unique(executionEnvironment, 0, devices); - executionEnvironment.memoryManager.reset(new OsAgnosticMemoryManager(executionEnvironment)); + csr->postSyncWriteOffset = 32u; + executionEnvironment.memoryManager.reset(new OsAgnosticMemoryManager(executionEnvironment)); EXPECT_EQ(nullptr, csr->getTagAllocation()); csr->initializeTagAllocation(); - EXPECT_NE(nullptr, csr->getTagAllocation()); EXPECT_EQ(GraphicsAllocation::AllocationType::TAG_BUFFER, csr->getTagAllocation()->getAllocationType()); EXPECT_EQ(csr->getTagAllocation()->getUnderlyingBuffer(), csr->getTagAddress()); + auto tagAddress = csr->getTagAddress(); for (uint32_t i = 0; i < 4; i++) { EXPECT_EQ(*tagAddress, initialHardwareTag); - tagAddress = ptrOffset(tagAddress, 8); + tagAddress = ptrOffset(tagAddress, csr->getPostSyncWriteOffset()); } auto tagsMultiAllocation = csr->getTagsMultiAllocation(); @@ -1079,8 +1081,11 @@ TEST(CommandStreamReceiverSimpleTest, givenMultipleActivePartitionsWhenWaitingFo temporaryAllocation->updateTaskCount(0u, 0u); csr.getInternalAllocationStorage()->storeAllocationWithTaskCount(std::move(temporaryAllocation), TEMPORARY_ALLOCATION, 2u); + csr.postSyncWriteOffset = 32u; csr.mockTagAddress[0] = 0u; - csr.mockTagAddress[2] = 0u; + auto nextPartitionTagAddress = ptrOffset(&csr.mockTagAddress[0], csr.getPostSyncWriteOffset()); + *nextPartitionTagAddress = 0u; + csr.taskCount = 3u; csr.activePartitions = 2; @@ -1090,7 +1095,7 @@ TEST(CommandStreamReceiverSimpleTest, givenMultipleActivePartitionsWhenWaitingFo CpuIntrinsicsTests::pauseAddress = &csr.mockTagAddress[0]; CpuIntrinsicsTests::pauseValue = 3u; - CpuIntrinsicsTests::pauseOffset = 8; + CpuIntrinsicsTests::pauseOffset = csr.getPostSyncWriteOffset(); CpuIntrinsicsTests::pauseCounter = 0; csr.waitForTaskCountAndCleanTemporaryAllocationList(3u); @@ -1613,4 +1618,4 @@ TEST(CreateWorkPartitionAllocationTest, givenEnabledBlitterWhenInitializingWorkP auto retVal = commandStreamReceiver->createWorkPartitionAllocation(device); EXPECT_TRUE(retVal); EXPECT_EQ(0u, memoryManager->copyMemoryToAllocationBanksCalled); -} \ No newline at end of file +} diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index 0f0d45d015..c89e85b739 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_stream/submissions_aggregator.h" #include "shared/source/debug_settings/debug_settings_manager.h" #include "shared/source/direct_submission/dispatchers/render_dispatcher.h" @@ -495,6 +496,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, DirectSubmissionDispatchBufferTest, EXPECT_EQ(1u, directSubmission.handleResidencyCount); EXPECT_EQ(4u, directSubmission.makeResourcesResidentVectorSize); + uint32_t expectedOffset = NEO::ImplicitScalingDispatch::getPostSyncOffset(); + EXPECT_EQ(expectedOffset, directSubmission.postSyncOffset); + HardwareParse hwParse; hwParse.parseCommands(directSubmission.ringCommandStream, 0); hwParse.findHardwareCommands(); @@ -504,8 +508,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, DirectSubmissionDispatchBufferTest, for (auto &it : hwParse.lriList) { auto loadRegisterImm = reinterpret_cast(it); if (loadRegisterImm->getRegisterOffset() == 0x23B4u) { - - EXPECT_EQ(8u, loadRegisterImm->getDataDword()); + EXPECT_EQ(expectedOffset, loadRegisterImm->getDataDword()); partitionRegisterFound = true; } } @@ -556,13 +559,14 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, DirectSubmissionDispatchBufferTest, hwParse.parseCommands(directSubmission.ringCommandStream, 0); hwParse.findHardwareCommands(); + uint32_t expectedOffset = NEO::ImplicitScalingDispatch::getPostSyncOffset(); + ASSERT_NE(hwParse.lriList.end(), hwParse.lriList.begin()); bool partitionRegisterFound = false; for (auto &it : hwParse.lriList) { auto loadRegisterImm = reinterpret_cast(it); if (loadRegisterImm->getRegisterOffset() == 0x23B4u) { - - EXPECT_EQ(8u, loadRegisterImm->getDataDword()); + EXPECT_EQ(expectedOffset, loadRegisterImm->getDataDword()); partitionRegisterFound = true; } } diff --git a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp index a6b3dfaf58..3f68388e3d 100644 --- a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp +++ b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp @@ -69,6 +69,7 @@ struct MockDrmDirectSubmission : public DrmDirectSubmission; MockDrmDirectSubmission directSubmission(*device.get(), *osContext.get()); + uint32_t offset = directSubmission.postSyncOffset; + EXPECT_NE(0u, offset); bool ret = directSubmission.allocateResources(); EXPECT_TRUE(ret); directSubmission.activeTiles = 2; auto pollAddress = directSubmission.tagAddress; *pollAddress = 10; - pollAddress = ptrOffset(pollAddress, 8); + pollAddress = ptrOffset(pollAddress, offset); *pollAddress = 10; CpuIntrinsicsTests::pauseCounter = 0; diff --git a/shared/test/unit_test/memory_manager/deferrable_allocation_deletion_tests.cpp b/shared/test/unit_test/memory_manager/deferrable_allocation_deletion_tests.cpp index df6105b7c8..868be201ff 100644 --- a/shared/test/unit_test/memory_manager/deferrable_allocation_deletion_tests.cpp +++ b/shared/test/unit_test/memory_manager/deferrable_allocation_deletion_tests.cpp @@ -171,9 +171,11 @@ TEST_F(DeferrableAllocationDeletionTest, givenAllocationUsedByUnregisteredEngine EXPECT_EQ(1u, memoryManager->freeGraphicsMemoryCalled); } -TEST_F(DeferrableAllocationDeletionTest, givenMultiTileWhenTaskCompletedOnSingleTileThenDoNotFreeGraphicsAllocation) { - device->getDefaultEngine().commandStreamReceiver->setActivePartitions(2u); - auto hwTagNextTile = ptrOffset(hwTag, 8); +HWTEST_F(DeferrableAllocationDeletionTest, givenMultiTileWhenTaskCompletedOnSingleTileThenDoNotFreeGraphicsAllocation) { + auto csr = reinterpret_cast *>(device->getDefaultEngine().commandStreamReceiver); + csr->setActivePartitions(2u); + csr->postSyncWriteOffset = 32; + auto hwTagNextTile = ptrOffset(hwTag, 32); auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize}); allocation->updateTaskCount(1u, defaultOsContextId);