diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index a91a60181d..7a52888c60 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -331,6 +331,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionInsertExtraMiMemFenceCommands, - DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionInsertSfenceInstructionPriorToSubmission, -1, "-1: default, 0 - disable, 1 - Insert _mm_sfence before unlocking semaphore only, 2 - insert before and after semaphore") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionMaxRingBuffers, -1, "-1: default, >0: max ring buffer count, During switch ring buffer, if there is no available ring, wait for completion instead of allocating new one if DirectSubmissionMaxRingBuffers is reached") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisablePrefetcher, -1, "-1: default, 0 - disable, 1 - enable. If enabled, disable prefetcher is being dispatched") +DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionRelaxedOrdering, -1, "-1: default, 0 - disable, 1 - enable. If enabled, tasks sent to direct submission ring may be dispatched out of order") DECLARE_DEBUG_VARIABLE(bool, DirectSubmissionPrintBuffers, false, "Print address of submitted command buffers") /*FEATURE FLAGS*/ diff --git a/shared/source/direct_submission/CMakeLists.txt b/shared/source/direct_submission/CMakeLists.txt index 918175238e..c6b7101583 100644 --- a/shared/source/direct_submission/CMakeLists.txt +++ b/shared/source/direct_submission/CMakeLists.txt @@ -17,6 +17,7 @@ set(NEO_CORE_DIRECT_SUBMISSION ${CMAKE_CURRENT_SOURCE_DIR}/direct_submission_hw_diagnostic_mode.cpp ${CMAKE_CURRENT_SOURCE_DIR}/direct_submission_hw_diagnostic_mode.h ${CMAKE_CURRENT_SOURCE_DIR}/direct_submission_properties.h + ${CMAKE_CURRENT_SOURCE_DIR}/relaxed_ordering_helper.h ) if(SUPPORT_XEHP_AND_LATER) diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h index 241f29e957..bdcb513f07 100644 --- a/shared/source/direct_submission/direct_submission_hw.h +++ b/shared/source/direct_submission/direct_submission_hw.h @@ -120,6 +120,9 @@ class DirectSubmissionHw { void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress); size_t getSizeSwitchRingBufferSection(); + void dispatchTaskStoreSection(uint64_t taskStartSectionVa); + MOCKABLE_VIRTUAL void preinitializeTaskStoreSection(); + void setReturnAddress(void *returnCmd, uint64_t returnAddress); void *dispatchWorkloadSection(BatchBuffer &batchBuffer); @@ -160,6 +163,7 @@ class DirectSubmissionHw { GraphicsAllocation *ringBuffer = nullptr; }; std::vector ringBuffers; + std::unique_ptr preinitializedTaskStoreSection; uint32_t currentRingBuffer = 0u; uint32_t previousRingBuffer = 0u; uint32_t maxRingBufferCount = std::numeric_limits::max(); @@ -181,6 +185,7 @@ class DirectSubmissionHw { GraphicsAllocation *completionFenceAllocation = nullptr; GraphicsAllocation *semaphores = nullptr; GraphicsAllocation *workPartitionAllocation = nullptr; + GraphicsAllocation *deferredTasksListAllocation = nullptr; void *semaphorePtr = nullptr; volatile RingSemaphoreData *semaphoreData = nullptr; volatile void *workloadModeOneStoreAddress = nullptr; @@ -205,5 +210,7 @@ class DirectSubmissionHw { bool completionFenceSupported = false; bool isDisablePrefetcherRequired = false; bool dcFlushRequired = false; + bool relaxedOrderingEnabled = false; + bool relaxedOrderingInitialized = false; }; } // namespace 
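
For orientation: the header changes above add four members to DirectSubmissionHw. A commented restatement, gathered into a standalone struct purely for readability (RelaxedOrderingState is an illustrative name, the void pointer stands in for NEO's GraphicsAllocation*, and the unique_ptr element type is inferred from the make_unique/memcpy_s usage in the .inl changes below rather than quoted from the header):

    #include <cstdint>
    #include <memory>

    // Review sketch of the new DirectSubmissionHw state (roles taken from the .inl changes below).
    struct RelaxedOrderingState {
        std::unique_ptr<uint8_t[]> preinitializedTaskStoreSection; // CPU-side command blob: built once, patched and copied into the ring per dispatch
        void *deferredTasksListAllocation = nullptr;               // stand-in for GraphicsAllocation*: GPU buffer receiving deferred task start VAs
        bool relaxedOrderingEnabled = false;                       // set in the constructor when DirectSubmissionRelaxedOrdering == 1
        bool relaxedOrderingInitialized = false;                   // makes preinitializeTaskStoreSection() a one-time step
    };
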
NEO diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index cf59f1f7af..33da17e053 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -12,6 +12,7 @@ #include "shared/source/device/device.h" #include "shared/source/direct_submission/direct_submission_hw.h" #include "shared/source/direct_submission/direct_submission_hw_diagnostic_mode.h" +#include "shared/source/direct_submission/relaxed_ordering_helper.h" #include "shared/source/helpers/flush_stamp.h" #include "shared/source/helpers/logical_state_helper.h" #include "shared/source/helpers/ptr_math.h" @@ -76,6 +77,7 @@ DirectSubmissionHw::DirectSubmissionHw(const DirectSubmis setPostSyncOffset(); dcFlushRequired = MemorySynchronizationCommands::getDcFlushEnable(true, *hwInfo); + relaxedOrderingEnabled = (DebugManager.flags.DirectSubmissionRelaxedOrdering.get() == 1); } template @@ -118,6 +120,18 @@ bool DirectSubmissionHw::allocateResources() { allocations.push_back(completionFenceAllocation); } + if (this->relaxedOrderingEnabled) { + const AllocationProperties allocationProperties(rootDeviceIndex, + true, MemoryConstants::pageSize64k, + AllocationType::DEFERRED_TASKS_LIST, + isMultiOsContextCapable, false, osContext.getDeviceBitfield()); + + deferredTasksListAllocation = memoryManager->allocateGraphicsMemoryWithProperties(allocationProperties); + UNRECOVERABLE_IF(deferredTasksListAllocation == nullptr); + + allocations.push_back(deferredTasksListAllocation); + } + if (DebugManager.flags.DirectSubmissionPrintBuffers.get()) { for (uint32_t ringBufferIndex = 0; ringBufferIndex < RingBufferUse::initialRingBufferCount; ringBufferIndex++) { const auto ringBuffer = this->ringBuffers[ringBufferIndex].ringBuffer; @@ -214,6 +228,10 @@ bool DirectSubmissionHw::initialize(bool submitOnInit, bo this->systemMemoryFenceAddressSet = true; } + if (this->relaxedOrderingEnabled) { + preinitializeTaskStoreSection(); + this->relaxedOrderingInitialized = true; + } if (workloadMode == 1) { dispatchDiagnosticModeSection(); startBufferSize += getDiagnosticModeSection(); @@ -257,6 +275,11 @@ bool DirectSubmissionHw::startRingBuffer() { this->systemMemoryFenceAddressSet = true; } + if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) { + preinitializeTaskStoreSection(); + this->relaxedOrderingInitialized = true; + } + currentQueueWorkCount++; dispatchSemaphoreSection(currentQueueWorkCount); @@ -388,7 +411,7 @@ inline size_t DirectSubmissionHw::getSizeDispatch() { } else if (workloadMode == 1) { size += getDiagnosticModeSection(); } - //mode 2 does not dispatch any commands + // mode 2 does not dispatch any commands if (!disableCacheFlush) { size += Dispatcher::getSizeCacheFlush(*hwInfo); @@ -435,7 +458,11 @@ void *DirectSubmissionHw::dispatchWorkloadSection(BatchBu DirectSubmissionDiagnostics::diagnosticModeOneDispatch(diagnostic.get()); dispatchDiagnosticModeSection(); } - //mode 2 does not dispatch any commands + // mode 2 does not dispatch any commands + + if (this->relaxedOrderingEnabled) { + dispatchTaskStoreSection(0); + } if (!disableCacheFlush) { Dispatcher::dispatchCacheFlush(ringCommandStream, *hwInfo, gpuVaForMiFlush); @@ -452,9 +479,63 @@ void *DirectSubmissionHw::dispatchWorkloadSection(BatchBu return currentPosition; } +template +void DirectSubmissionHw::preinitializeTaskStoreSection() { + preinitializedTaskStoreSection = 
std::make_unique(RelaxedOrderingHelper::getSizeTaskStoreSection()); + + LinearStream stream(preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection()); + + EncodeMiPredicate::encode(stream, MiPredicateType::Disable); + + uint64_t deferredWalkerListGpuVa = deferredTasksListAllocation->getGpuAddress(); + LriHelper::program(&stream, CS_GPR_R6, static_cast(deferredWalkerListGpuVa & 0xFFFF'FFFFULL), true); + LriHelper::program(&stream, CS_GPR_R6 + 4, static_cast(deferredWalkerListGpuVa >> 32), true); + + // Task start VA + LriHelper::program(&stream, CS_GPR_R7, 0, true); + LriHelper::program(&stream, CS_GPR_R7 + 4, 0, true); + + // Shift by 8 = multiply by 256. Address must by 64b aligned (shift by 6), but SHL accepts only 1, 2, 4, 8, 16 and 32 + LriHelper::program(&stream, CS_GPR_R8, 8, true); + LriHelper::program(&stream, CS_GPR_R8 + 4, 0, true); + + EncodeAluHelper aluHelper; + aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_1); + aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_8); + aluHelper.setNextAlu(AluRegisters::OPCODE_SHL); + aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_8, AluRegisters::R_ACCU); + aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_8); + aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_6); + aluHelper.setNextAlu(AluRegisters::OPCODE_ADD); + aluHelper.setNextAlu(AluRegisters::OPCODE_STOREIND, AluRegisters::R_ACCU, AluRegisters::R_7); + aluHelper.setNextAlu(AluRegisters::OPCODE_FENCE_WR); + + aluHelper.copyToCmdStream(stream); + + EncodeMathMMIO::encodeIncrement(stream, AluRegisters::R_1); + + UNRECOVERABLE_IF(stream.getUsed() != RelaxedOrderingHelper::getSizeTaskStoreSection()); +} + +template +void DirectSubmissionHw::dispatchTaskStoreSection(uint64_t taskStartSectionVa) { + using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM; + + constexpr size_t patchOffset = EncodeMiPredicate::getCmdSize() + (2 * sizeof(MI_LOAD_REGISTER_IMM)); + + auto lri = reinterpret_cast(ptrOffset(preinitializedTaskStoreSection.get(), patchOffset)); + + lri->setDataDword(static_cast(taskStartSectionVa & 0xFFFF'FFFFULL)); + lri++; + lri->setDataDword(static_cast(taskStartSectionVa >> 32)); + + auto dst = ringCommandStream.getSpace(RelaxedOrderingHelper::getSizeTaskStoreSection()); + memcpy_s(dst, RelaxedOrderingHelper::getSizeTaskStoreSection(), preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection()); +} + template bool DirectSubmissionHw::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) { - //for now workloads requiring cache coherency are not supported + // for now workloads requiring cache coherency are not supported UNRECOVERABLE_IF(batchBuffer.requiresCoherency); if (batchBuffer.ringBufferRestartRequest) { @@ -466,6 +547,9 @@ bool DirectSubmissionHw::dispatchCommandBuffer(BatchBuffe size_t dispatchSize = getSizeDispatch(); size_t cycleSize = getSizeSwitchRingBufferSection(); size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd(); + if (this->relaxedOrderingEnabled) { + requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection(); + } getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0)); @@ -588,6 +672,8 @@ void DirectSubmissionHw::deallocateResources() { memoryManager->freeGraphicsMemory(semaphores); semaphores = nullptr; } + + 
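
For review, the CPU-side meaning of the ALU program assembled in preinitializeTaskStoreSection() above: R6/R6+4 hold the deferred-task-list base GPU VA, R7/R7+4 the task start VA (zero at preinitialization, patched later by dispatchTaskStoreSection), R8 the shift amount 8, and R1 acts as a running task counter that the trailing encodeIncrement bumps. A hedged sketch of the equivalent host logic (taskStoreSemantics and its parameter names are illustrative, not from the patch; the GPU VA is treated as a host pointer purely for illustration):

    #include <cstdint>

    // What the MI_MATH sequence computes on the GPU, expressed as host code.
    void taskStoreSemantics(uint64_t listBaseGpuVa /*R6*/, uint64_t taskStartVa /*R7*/,
                            uint64_t &taskCounter /*R1*/) {
        uint64_t slotOffset = taskCounter << 8;                                // SHL: 256-byte stride per entry
        auto slot = reinterpret_cast<uint64_t *>(listBaseGpuVa + slotOffset);  // ADD: base + offset (VA used as host pointer for illustration)
        *slot = taskStartVa;                                                   // STOREIND, then FENCE_WR publishes the write
        ++taskCounter;                                                         // EncodeMathMMIO encodeIncrement on R_1
    }

With the 64 KB deferred-tasks allocation requested in allocateResources() and this 256-byte stride, the list as allocated has room for 64 * 1024 / 256 = 256 entries.
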
memoryManager->freeGraphicsMemory(deferredTasksListAllocation); } template diff --git a/shared/source/direct_submission/relaxed_ordering_helper.h b/shared/source/direct_submission/relaxed_ordering_helper.h new file mode 100644 index 0000000000..e7f6e1a227 --- /dev/null +++ b/shared/source/direct_submission/relaxed_ordering_helper.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include "shared/source/command_container/command_encoder.h" +#include "shared/source/command_container/encode_alu_helper.h" + +namespace NEO { +namespace RelaxedOrderingHelper { + +template +constexpr size_t getSizeTaskStoreSection() { + return ((6 * sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM)) + + EncodeAluHelper::getCmdsSize() + + EncodeMathMMIO::getCmdSizeForIncrementOrDecrement() + + EncodeMiPredicate::getCmdSize()); +} + +} // namespace RelaxedOrderingHelper +} // namespace NEO \ No newline at end of file diff --git a/shared/source/helpers/app_resource_helper.cpp b/shared/source/helpers/app_resource_helper.cpp index 1c3ac9970a..80c9148c30 100644 --- a/shared/source/helpers/app_resource_helper.cpp +++ b/shared/source/helpers/app_resource_helper.cpp @@ -114,6 +114,8 @@ const char *AppResourceHelper::getResourceTagStr(AllocationType type) { return "GPUTSDBF"; case AllocationType::SW_TAG_BUFFER: return "SWTAGBF"; + case AllocationType::DEFERRED_TASKS_LIST: + return "TSKLIST"; default: return "NOTFOUND"; } diff --git a/shared/source/memory_manager/allocation_type.h b/shared/source/memory_manager/allocation_type.h index ae8ca6e316..273c069d68 100644 --- a/shared/source/memory_manager/allocation_type.h +++ b/shared/source/memory_manager/allocation_type.h @@ -55,6 +55,7 @@ enum class AllocationType { WORK_PARTITION_SURFACE, GPU_TIMESTAMP_DEVICE_BUFFER, SW_TAG_BUFFER, + DEFERRED_TASKS_LIST, COUNT }; } // namespace NEO diff --git a/shared/source/memory_manager/definitions/storage_info.cpp b/shared/source/memory_manager/definitions/storage_info.cpp index 0f12455c3a..29f2555563 100644 --- a/shared/source/memory_manager/definitions/storage_info.cpp +++ b/shared/source/memory_manager/definitions/storage_info.cpp @@ -86,6 +86,7 @@ StorageInfo MemoryManager::createStorageInfoFromProperties(const AllocationPrope break; case AllocationType::SCRATCH_SURFACE: case AllocationType::PREEMPTION: + case AllocationType::DEFERRED_TASKS_LIST: if (properties.flags.multiOsContextCapable) { storageInfo.cloningOfPageTables = false; storageInfo.memoryBanks = allTilesValue; diff --git a/shared/source/memory_manager/memory_manager.cpp b/shared/source/memory_manager/memory_manager.cpp index 92278eaa66..bfb7a95819 100644 --- a/shared/source/memory_manager/memory_manager.cpp +++ b/shared/source/memory_manager/memory_manager.cpp @@ -386,6 +386,7 @@ bool MemoryManager::getAllocationData(AllocationData &allocationData, const Allo } switch (properties.allocationType) { + case AllocationType::DEFERRED_TASKS_LIST: case AllocationType::COMMAND_BUFFER: case AllocationType::IMAGE: case AllocationType::INDIRECT_OBJECT_HEAP: diff --git a/shared/source/utilities/logger.cpp b/shared/source/utilities/logger.cpp index 89a575e7d3..7b825b0dad 100644 --- a/shared/source/utilities/logger.cpp +++ b/shared/source/utilities/logger.cpp @@ -230,6 +230,8 @@ const char *getAllocationTypeString(GraphicsAllocation const *graphicsAllocation return "UNIFIED_SHARED_MEMORY"; case AllocationType::SW_TAG_BUFFER: return "SW_TAG_BUFFER"; + case AllocationType::DEFERRED_TASKS_LIST: + return 
"DEFERRED_TASKS_LIST"; default: return "ILLEGAL_VALUE"; } diff --git a/shared/test/common/mocks/mock_direct_submission_hw.h b/shared/test/common/mocks/mock_direct_submission_hw.h index 611a17a41f..31712758aa 100644 --- a/shared/test/common/mocks/mock_direct_submission_hw.h +++ b/shared/test/common/mocks/mock_direct_submission_hw.h @@ -23,6 +23,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw using BaseClass::currentRingBuffer; using BaseClass::dcFlushRequired; using BaseClass::deallocateResources; + using BaseClass::deferredTasksListAllocation; using BaseClass::diagnostic; using BaseClass::DirectSubmissionHw; using BaseClass::disableCacheFlush; @@ -53,6 +54,8 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw using BaseClass::partitionedMode; using BaseClass::performDiagnosticMode; using BaseClass::postSyncOffset; + using BaseClass::preinitializedTaskStoreSection; + using BaseClass::relaxedOrderingInitialized; using BaseClass::reserved; using BaseClass::ringBuffers; using BaseClass::ringCommandStream; @@ -84,6 +87,11 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw return allocateOsResourcesReturn; } + void preinitializeTaskStoreSection() override { + preinitializeTaskStoreSectionCalled++; + BaseClass::preinitializeTaskStoreSection(); + } + bool makeResourcesResident(DirectSubmissionAllocations &allocations) override { makeResourcesResidentVectorSize = static_cast(allocations.size()); if (callBaseResident) { @@ -139,6 +147,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw uint32_t submitCount = 0u; uint32_t handleResidencyCount = 0u; uint32_t disabledDiagnosticCalled = 0u; + uint32_t preinitializeTaskStoreSectionCalled = 0; uint32_t makeResourcesResidentVectorSize = 0u; bool allocateOsResourcesReturn = true; bool submitReturn = true; diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 4848d2ac7f..53ad81b7ed 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -486,4 +486,5 @@ AdjustThreadGroupDispatchSize = -1 ForceNonblockingExecbufferCalls = -1 UseHighAlignmentForHeapExtended = -1 ForceAutoGrfCompilationMode = -1 -ForceComputeWalkerPostSyncFlush = -1 \ No newline at end of file +ForceComputeWalkerPostSyncFlush = -1 +DirectSubmissionRelaxedOrdering = -1 \ No newline at end of file diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index 8281426eb0..9cc3e5c736 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -11,6 +11,7 @@ #include "shared/source/direct_submission/direct_submission_hw.h" #include "shared/source/direct_submission/dispatchers/render_dispatcher.h" #include "shared/source/helpers/flush_stamp.h" +#include "shared/source/helpers/register_offsets.h" #include "shared/source/utilities/cpuintrinsics.h" #include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" @@ -901,4 +902,227 @@ HWTEST_F(DirectSubmissionDispatchBufferTest, givenDebugFlagSetWhenStoppingRingbu EXPECT_EQ(initialCounterValue + expectedCount, CpuIntrinsicsTests::sfenceCounter); } +} + +struct DirectSubmissionRelaxedOrderingTests : public DirectSubmissionDispatchBufferTest { + void SetUp() override { + DebugManager.flags.DirectSubmissionRelaxedOrdering.set(1); + 
DirectSubmissionDispatchBufferTest::SetUp(); + } + + DebugManagerStateRestore restore; +}; + +HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenAllocatingResourcesThenCreateDeferredTasksAllocation) { + using Dispatcher = RenderDispatcher; + + auto mockMemoryOperations = new MockMemoryOperations(); + mockMemoryOperations->captureGfxAllocationsForMakeResident = true; + + pDevice->getRootDeviceEnvironmentRef().memoryOperationsInterface.reset(mockMemoryOperations); + + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + directSubmission.callBaseResident = true; + + directSubmission.initialize(false, false); + + EXPECT_EQ(AllocationType::DEFERRED_TASKS_LIST, directSubmission.deferredTasksListAllocation->getAllocationType()); + EXPECT_NE(nullptr, directSubmission.deferredTasksListAllocation); + EXPECT_EQ(directSubmission.deferredTasksListAllocation, mockMemoryOperations->gfxAllocationsForMakeResident.back()); +} + +HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenInitializingThenPreinitializeTaskStoreSection) { + using Dispatcher = RenderDispatcher; + + { + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + directSubmission.initialize(false, false); + + EXPECT_EQ(0u, directSubmission.preinitializeTaskStoreSectionCalled); + EXPECT_FALSE(directSubmission.relaxedOrderingInitialized); + EXPECT_EQ(nullptr, directSubmission.preinitializedTaskStoreSection.get()); + } + + { + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + directSubmission.initialize(true, false); + + EXPECT_EQ(1u, directSubmission.preinitializeTaskStoreSectionCalled); + EXPECT_TRUE(directSubmission.relaxedOrderingInitialized); + EXPECT_NE(nullptr, directSubmission.preinitializedTaskStoreSection.get()); + + directSubmission.startRingBuffer(); + + EXPECT_EQ(1u, directSubmission.preinitializeTaskStoreSectionCalled); + } + + { + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + directSubmission.initialize(false, false); + EXPECT_EQ(0u, directSubmission.preinitializeTaskStoreSectionCalled); + + directSubmission.startRingBuffer(); + + EXPECT_EQ(1u, directSubmission.preinitializeTaskStoreSectionCalled); + EXPECT_TRUE(directSubmission.relaxedOrderingInitialized); + EXPECT_NE(nullptr, directSubmission.preinitializedTaskStoreSection.get()); + + directSubmission.startRingBuffer(); + EXPECT_EQ(1u, directSubmission.preinitializeTaskStoreSectionCalled); + } +} + +HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenDispatchingWorkThenDispatchTaskStoreSection) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE; + using MI_MATH = typename FamilyType::MI_MATH; + using Dispatcher = RenderDispatcher; + + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + + directSubmission.initialize(true, false); + auto offset = directSubmission.ringCommandStream.getUsed() + directSubmission.getSizeStartSection(); + + FlushStampTracker flushStamp(true); + directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); + + auto taskStoreSection = ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset); + + if constexpr (FamilyType::isUsingMiSetPredicate) { + using MI_SET_PREDICATE = typename FamilyType::MI_SET_PREDICATE; + using PREDICATE_ENABLE = typename MI_SET_PREDICATE::PREDICATE_ENABLE; + + auto miSetPredicate = 
reinterpret_cast(taskStoreSection); + EXPECT_EQ(PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE, miSetPredicate->getPredicateEnable()); + + taskStoreSection = ptrOffset(taskStoreSection, sizeof(MI_SET_PREDICATE)); + } + + uint64_t deferredTasksVa = directSubmission.deferredTasksListAllocation->getGpuAddress(); + + auto lriCmd = reinterpret_cast(taskStoreSection); + + EXPECT_EQ(CS_GPR_R6, lriCmd->getRegisterOffset()); + EXPECT_EQ(static_cast(deferredTasksVa & 0xFFFF'FFFFULL), lriCmd->getDataDword()); + + lriCmd++; + EXPECT_EQ(CS_GPR_R6 + 4, lriCmd->getRegisterOffset()); + EXPECT_EQ(static_cast(deferredTasksVa >> 32), lriCmd->getDataDword()); + + lriCmd++; + EXPECT_EQ(CS_GPR_R7, lriCmd->getRegisterOffset()); + EXPECT_EQ(0u, lriCmd->getDataDword()); + + lriCmd++; + EXPECT_EQ(CS_GPR_R7 + 4, lriCmd->getRegisterOffset()); + EXPECT_EQ(0u, lriCmd->getDataDword()); + + lriCmd++; + EXPECT_EQ(CS_GPR_R8, lriCmd->getRegisterOffset()); + EXPECT_EQ(8u, lriCmd->getDataDword()); + + lriCmd++; + EXPECT_EQ(CS_GPR_R8 + 4, lriCmd->getRegisterOffset()); + EXPECT_EQ(0u, lriCmd->getDataDword()); + + auto miMathCmd = reinterpret_cast(++lriCmd); + EXPECT_EQ(8u, miMathCmd->DW0.BitField.DwordLength); + + auto miAluCmd = reinterpret_cast(++miMathCmd); + EXPECT_EQ(static_cast(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(static_cast(AluRegisters::R_SRCA), miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(static_cast(AluRegisters::R_1), miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(static_cast(AluRegisters::R_SRCB), miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(static_cast(AluRegisters::R_8), miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_SHL), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_STORE), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(static_cast(AluRegisters::R_8), miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(static_cast(AluRegisters::R_ACCU), miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(static_cast(AluRegisters::R_SRCA), miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(static_cast(AluRegisters::R_8), miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(static_cast(AluRegisters::R_SRCB), miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(static_cast(AluRegisters::R_6), miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_ADD), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_STOREIND), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(static_cast(AluRegisters::R_ACCU), miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(static_cast(AluRegisters::R_7), miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_FENCE_WR), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand2); + + // increment + lriCmd = reinterpret_cast(++miAluCmd); + EXPECT_EQ(lriCmd->getRegisterOffset(), CS_GPR_R7); + 
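
One note for the LRI checks above: dispatchWorkloadSection currently calls dispatchTaskStoreSection(0), which is why the R7 payload verified here is zero. The part worth internalizing is the patch-then-copy pattern: per dispatch, only the two R7 data dwords of the preinitialized blob change before the whole blob is copied into the ring. A standalone sketch of that pattern (patchTaskVaAndCopy and its offset parameters are illustrative, not the runtime code):

    #include <cstdint>
    #include <cstring>

    // Stamp a 64-bit VA into a pre-built command blob at two known payload offsets
    // (the data dwords of the R7 and R7+4 MI_LOAD_REGISTER_IMM), then copy the blob
    // into space already claimed in the ring buffer.
    void patchTaskVaAndCopy(uint8_t *blob, size_t blobSize,
                            size_t lowDwordOffset, size_t highDwordOffset,
                            uint64_t taskVa, uint8_t *ringDst) {
        const uint32_t low = static_cast<uint32_t>(taskVa & 0xFFFF'FFFFull);
        const uint32_t high = static_cast<uint32_t>(taskVa >> 32);
        std::memcpy(blob + lowDwordOffset, &low, sizeof(low));
        std::memcpy(blob + highDwordOffset, &high, sizeof(high));
        std::memcpy(ringDst, blob, blobSize); // the same blob is reused for every submission
    }
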
EXPECT_EQ(lriCmd->getDataDword(), 1u); + + lriCmd++; + EXPECT_EQ(CS_GPR_R7 + 4, lriCmd->getRegisterOffset()); + EXPECT_EQ(0u, lriCmd->getDataDword()); + + miMathCmd = reinterpret_cast(++lriCmd); + EXPECT_EQ(3u, miMathCmd->DW0.BitField.DwordLength); + + miAluCmd = reinterpret_cast(++miMathCmd); + EXPECT_EQ(static_cast(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(static_cast(AluRegisters::R_SRCA), miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(static_cast(AluRegisters::R_1), miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_LOAD), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(static_cast(AluRegisters::R_SRCB), miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(static_cast(AluRegisters::R_7), miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_ADD), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(0u, miAluCmd->DW0.BitField.Operand2); + + miAluCmd++; + EXPECT_EQ(static_cast(AluRegisters::OPCODE_STORE), miAluCmd->DW0.BitField.ALUOpcode); + EXPECT_EQ(static_cast(AluRegisters::R_1), miAluCmd->DW0.BitField.Operand1); + EXPECT_EQ(static_cast(AluRegisters::R_ACCU), miAluCmd->DW0.BitField.Operand2); +} + +HWTEST_F(DirectSubmissionRelaxedOrderingTests, givenNotEnoughSpaceForTaskStoreSectionWhenDispatchingThenSwitchRingBuffers) { + using Dispatcher = RenderDispatcher; + + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + + directSubmission.initialize(true, false); + directSubmission.ringCommandStream.getUsed(); + + auto sizeToConsume = directSubmission.ringCommandStream.getAvailableSpace() - + (directSubmission.getSizeDispatch() + directSubmission.getSizeEnd() + directSubmission.getSizeSwitchRingBufferSection()); + + directSubmission.ringCommandStream.getSpace(sizeToConsume); + + auto oldAllocation = directSubmission.ringCommandStream.getGraphicsAllocation(); + + FlushStampTracker flushStamp(true); + directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); + + EXPECT_NE(oldAllocation, directSubmission.ringCommandStream.getGraphicsAllocation()); } \ No newline at end of file diff --git a/shared/test/unit_test/helpers/app_resource_tests.cpp b/shared/test/unit_test/helpers/app_resource_tests.cpp index d92b34a333..c13e45744b 100644 --- a/shared/test/unit_test/helpers/app_resource_tests.cpp +++ b/shared/test/unit_test/helpers/app_resource_tests.cpp @@ -104,7 +104,8 @@ AllocationTypeTagTestCase allocationTypeTagValues[static_cast(AllocationTyp {AllocationType::DEBUG_MODULE_AREA, "DBMDLARE"}, {AllocationType::UNIFIED_SHARED_MEMORY, "USHRDMEM"}, {AllocationType::GPU_TIMESTAMP_DEVICE_BUFFER, "GPUTSDBF"}, - {AllocationType::SW_TAG_BUFFER, "SWTAGBF"}}; + {AllocationType::SW_TAG_BUFFER, "SWTAGBF"}, + {AllocationType::DEFERRED_TASKS_LIST, "TSKLIST"}}; class AllocationTypeTagString : public ::testing::TestWithParam {}; TEST_P(AllocationTypeTagString, givenGraphicsAllocationTypeWhenCopyTagToStorageInfoThenCorrectTagIsReturned) { diff --git a/shared/test/unit_test/memory_manager/memory_manager_allocate_in_preferred_pool_tests.cpp b/shared/test/unit_test/memory_manager/memory_manager_allocate_in_preferred_pool_tests.cpp index 9ecef19f6d..52502b60d9 100644 --- a/shared/test/unit_test/memory_manager/memory_manager_allocate_in_preferred_pool_tests.cpp +++ b/shared/test/unit_test/memory_manager/memory_manager_allocate_in_preferred_pool_tests.cpp @@ -582,6 +582,14 @@ TEST(MemoryManagerTest, 
givenPreemptionTypeWhenGetAllocationDataIsCalledThen48Bi EXPECT_TRUE(allocData.flags.resource48Bit); } +TEST(MemoryManagerTest, givenDeferredTasksListTypeWhenGetAllocationDataIsCalledThen48BitResourceIsTrue) { + AllocationData allocData; + MockMemoryManager mockMemoryManager; + AllocationProperties properties{mockRootDeviceIndex, 1, AllocationType::DEFERRED_TASKS_LIST, mockDeviceBitfield}; + mockMemoryManager.getAllocationData(allocData, properties, nullptr, mockMemoryManager.createStorageInfoFromProperties(properties)); + EXPECT_TRUE(allocData.flags.resource48Bit); +} + TEST(MemoryManagerTest, givenSharedContextImageTypeWhenGetAllocationDataIsCalledThenSystemMemoryIsRequested) { AllocationData allocData; MockMemoryManager mockMemoryManager; @@ -1094,6 +1102,7 @@ static const AllocationType allocationHaveToBeForcedTo48Bit[] = { AllocationType::TIMESTAMP_PACKET_TAG_BUFFER, AllocationType::RING_BUFFER, AllocationType::SEMAPHORE_BUFFER, + AllocationType::DEFERRED_TASKS_LIST, }; static const AllocationType allocationHaveNotToBeForcedTo48Bit[] = { diff --git a/shared/test/unit_test/memory_manager/storage_info_tests.cpp b/shared/test/unit_test/memory_manager/storage_info_tests.cpp index b0c83297a8..9d7c3b7855 100644 --- a/shared/test/unit_test/memory_manager/storage_info_tests.cpp +++ b/shared/test/unit_test/memory_manager/storage_info_tests.cpp @@ -212,6 +212,23 @@ TEST_F(MultiDeviceStorageInfoTest, givenSingleTileCsrWhenCreatingStorageInfoForP EXPECT_EQ(singleTileMask, storageInfo.pageTablesVisibility); } +TEST_F(MultiDeviceStorageInfoTest, givenMultiTileCsrWhenCreatingStorageInfoForDeferredTasksListAllocationThenAllMemoryBankAreOnAndPageTableClonningIsNotRequired) { + AllocationProperties properties{mockRootDeviceIndex, false, 0u, AllocationType::DEFERRED_TASKS_LIST, true, false, singleTileMask}; + auto storageInfo = memoryManager->createStorageInfoFromProperties(properties); + EXPECT_FALSE(storageInfo.cloningOfPageTables); + EXPECT_TRUE(storageInfo.tileInstanced); + EXPECT_EQ(allTilesMask, storageInfo.memoryBanks); + EXPECT_EQ(allTilesMask, storageInfo.pageTablesVisibility); +} + +TEST_F(MultiDeviceStorageInfoTest, givenSingleTileCsrWhenCreatingStorageInfoForDeferredTasksListAllocationThenSingleMemoryBankIsOnAndPageTableClonningIsRequired) { + AllocationProperties properties{mockRootDeviceIndex, false, 0u, AllocationType::DEFERRED_TASKS_LIST, false, false, singleTileMask}; + auto storageInfo = memoryManager->createStorageInfoFromProperties(properties); + EXPECT_TRUE(storageInfo.cloningOfPageTables); + EXPECT_EQ(singleTileMask, storageInfo.memoryBanks); + EXPECT_EQ(singleTileMask, storageInfo.pageTablesVisibility); +} + TEST_F(MultiDeviceStorageInfoTest, whenCreatingStorageInfoForWorkPartitionSurfaceThenAllMemoryBankAreOnAndPageTableClonningIsNotRequired) { AllocationProperties properties{mockRootDeviceIndex, false, 0u, AllocationType::WORK_PARTITION_SURFACE, true, false, singleTileMask}; auto storageInfo = memoryManager->createStorageInfoFromProperties(properties);
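
The ring-switch test above leans on the size bookkeeping added to dispatchCommandBuffer: with relaxed ordering enabled, each submission must also reserve room for one task-store section, so leaving only getSizeDispatch() + getSizeEnd() + getSizeSwitchRingBufferSection() bytes free is no longer enough and the ring buffer allocation changes. A condensed restatement of that arithmetic (needsRingSwitch is an illustrative free function; the real comparison lives inside dispatchCommandBuffer):

    #include <cstddef>

    // Mirrors the minimal-space requirement the test sets up: when the remaining ring
    // space cannot hold dispatch + switch-section + end plus (with relaxed ordering)
    // one task-store section, a new ring buffer has to be used.
    bool needsRingSwitch(size_t availableSpace, size_t dispatchSize, size_t switchSectionSize,
                         size_t endSize, size_t taskStoreSectionSize, bool relaxedOrderingEnabled) {
        size_t requiredMinimalSize = dispatchSize + switchSectionSize + endSize;
        if (relaxedOrderingEnabled) {
            requiredMinimalSize += taskStoreSectionSize; // extra per-submission reservation added by this patch
        }
        return availableSpace < requiredMinimalSize;
    }
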