diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 292a92768f..da8c939945 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -1041,17 +1041,8 @@ NEO::SubmissionStatus CommandQueueHw::prepareAndSubmitBatchBuffer startAddress = 0; } - bool indirect = false; - if (csr->directSubmissionRelaxedOrderingEnabled()) { - // Indirect BB_START operates only on GPR_0 - NEO::EncodeSetMMIO::encodeREG(innerCommandStream, CS_GPR_R0, CS_GPR_R3); - NEO::EncodeSetMMIO::encodeREG(innerCommandStream, CS_GPR_R0 + 4, CS_GPR_R3 + 4); - - indirect = true; - } - endingCmd = innerCommandStream.getSpace(0); - NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&innerCommandStream, startAddress, false, indirect, false); + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&innerCommandStream, startAddress, false, false, false); } else { auto buffer = innerCommandStream.getSpaceForCmd(); *(MI_BATCH_BUFFER_END *)buffer = GfxFamily::cmdInitBatchBufferEnd; diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp index fdf7989956..c67202df44 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp @@ -437,54 +437,6 @@ HWTEST2_F(CommandQueueCreate, givenSwTagsEnabledWhenPrepareAndSubmitBatchBufferT commandQueue->destroy(); } -HWTEST2_F(CommandQueueCreate, givenCsrWithRelaxedOrderingWhenSubmittingThenProgramIndirectBbStart, IsAtLeastXeHpcCore) { - using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; - using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG; - - DebugManagerStateRestore restorer; - DebugManager.flags.DirectSubmissionRelaxedOrdering.set(1); - - const ze_command_queue_desc_t desc = {}; - auto commandQueue = new MockCommandQueueHw(device, neoDevice->getDefaultEngine().commandStreamReceiver, &desc); - commandQueue->initialize(false, false); - - ze_result_t returnValue; - auto commandList = std::unique_ptr(whiteboxCast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue))); - ASSERT_NE(nullptr, commandList); - auto &commandStream = commandQueue->commandStream; - - auto ultCsr = static_cast *>(neoDevice->getDefaultEngine().commandStreamReceiver); - - auto directSubmission = new MockDirectSubmissionHw>(*ultCsr); - ultCsr->directSubmission.reset(directSubmission); - - auto estimatedSize = 4096u; - NEO::LinearStream linearStream(commandStream.getSpace(estimatedSize), estimatedSize); - memset(commandStream.getCpuBase(), 0, estimatedSize); - typename MockCommandQueueHw::CommandListExecutionContext ctx{}; - ctx.isDirectSubmissionEnabled = true; - - commandQueue->prepareAndSubmitBatchBuffer(ctx, linearStream); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, linearStream.getCpuBase(), linearStream.getUsed())); - - auto itor = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itor); - - auto lrrCmd = genCmdCast(*itor); - EXPECT_EQ(lrrCmd->getSourceRegisterAddress(), CS_GPR_R3); - EXPECT_EQ(lrrCmd->getDestinationRegisterAddress(), CS_GPR_R0); - lrrCmd++; - EXPECT_EQ(lrrCmd->getSourceRegisterAddress(), CS_GPR_R3 + 4); - EXPECT_EQ(lrrCmd->getDestinationRegisterAddress(), CS_GPR_R0 + 4); - - auto bbStartCmd = reinterpret_cast(++lrrCmd); - EXPECT_EQ(1u, bbStartCmd->getIndirectAddressEnable()); - - commandQueue->destroy(); -} - template struct MockCommandQueueHwEstimateSizeTest : public MockCommandQueueHw { diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index 6be8aa1dd4..86d94c784f 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -52,7 +52,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t getCmdsSizeForHardwareContext() const override; static void addBatchBufferEnd(LinearStream &commandStream, void **patchLocation); - void programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled, bool sipWaAllowed); + void programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled, bool hasRelaxedOrderingDependencies, bool sipWaAllowed); void addBatchBufferStart(MI_BATCH_BUFFER_START *commandBufferMemory, uint64_t startAddress, bool secondary); size_t getRequiredStateBaseAddressSize(const Device &device) const; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 4ed0e9e8e8..4286f632bb 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -101,7 +101,8 @@ inline void CommandStreamReceiverHw::addBatchBufferEnd(LinearStream & } template -inline void CommandStreamReceiverHw::programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled, bool sipWaAllowed) { +inline void CommandStreamReceiverHw::programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled, + bool hasRelaxedOrderingDependencies, bool sipWaAllowed) { if (directSubmissionEnabled) { uint64_t startAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(); if (DebugManager.flags.BatchBufferStartPrepatchingWaEnabled.get() == 0) { @@ -109,7 +110,7 @@ inline void CommandStreamReceiverHw::programEndingCmd(LinearStream &c } bool indirect = false; - if (DebugManager.flags.DirectSubmissionRelaxedOrdering.get() == 1) { + if (DebugManager.flags.DirectSubmissionRelaxedOrdering.get() == 1 && hasRelaxedOrderingDependencies) { NEO::EncodeSetMMIO::encodeREG(commandStream, CS_GPR_R0, CS_GPR_R3); NEO::EncodeSetMMIO::encodeREG(commandStream, CS_GPR_R0 + 4, CS_GPR_R3 + 4); @@ -571,7 +572,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( GraphicsAllocation *chainedBatchBuffer = nullptr; bool directSubmissionEnabled = isDirectSubmissionEnabled(); if (submitTask) { - programEndingCmd(commandStreamTask, &bbEndLocation, directSubmissionEnabled, true); + programEndingCmd(commandStreamTask, &bbEndLocation, directSubmissionEnabled, dispatchFlags.hasRelaxedOrderingDependencies, true); EncodeNoop::emitNoop(commandStreamTask, bbEndPaddingSize); EncodeNoop::alignToCacheLine(commandStreamTask); @@ -602,7 +603,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( this->programEpilogue(commandStreamCSR, device, &bbEndLocation, dispatchFlags); } else if (submitCSR) { - programEndingCmd(commandStreamCSR, &bbEndLocation, directSubmissionEnabled, true); + programEndingCmd(commandStreamCSR, &bbEndLocation, directSubmissionEnabled, dispatchFlags.hasRelaxedOrderingDependencies, true); EncodeNoop::emitNoop(commandStreamCSR, bbEndPaddingSize); EncodeNoop::alignToCacheLine(commandStreamCSR); DEBUG_BREAK_IF(commandStreamCSR.getUsed() > commandStreamCSR.getMaxAvailableSpace()); @@ -1167,7 +1168,7 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert } void *endingCmdPtr = nullptr; - programEndingCmd(commandStream, &endingCmdPtr, blitterDirectSubmission, false); + programEndingCmd(commandStream, &endingCmdPtr, blitterDirectSubmission, false, false); EncodeNoop::alignToCacheLine(commandStream); @@ -1276,7 +1277,7 @@ SubmissionStatus CommandStreamReceiverHw::flushSmallTask(LinearStream using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END; void *endingCmdPtr = nullptr; - programEndingCmd(commandStreamTask, &endingCmdPtr, isAnyDirectSubmissionEnabled(), false); + programEndingCmd(commandStreamTask, &endingCmdPtr, isAnyDirectSubmissionEnabled(), false, false); auto bytesToPad = EncodeBatchBufferStartOrEnd::getBatchBufferStartSize() - EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); @@ -1357,7 +1358,7 @@ inline void CommandStreamReceiverHw::programEpilogue(LinearStream &cs addBatchBufferStart(reinterpret_cast(*batchBufferEndLocation), gpuAddress, false); this->programEpliogueCommands(csr, dispatchFlags); - programEndingCmd(csr, batchBufferEndLocation, isDirectSubmissionEnabled(), !EngineHelpers::isBcs(osContext->getEngineType())); + programEndingCmd(csr, batchBufferEndLocation, isDirectSubmissionEnabled(), false, !EngineHelpers::isBcs(osContext->getEngineType())); EncodeNoop::alignToCacheLine(csr); } } diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h index 022af122bf..59dde6c92a 100644 --- a/shared/source/direct_submission/direct_submission_hw.h +++ b/shared/source/direct_submission/direct_submission_hw.h @@ -119,7 +119,7 @@ class DirectSubmissionHw { MOCKABLE_VIRTUAL void dispatchRelaxedOrderingSchedulerSection(uint32_t value); - void dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr, bool hasRelaxedOrderingDependencies); + void dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr); void dispatchStartSection(uint64_t gpuStartAddress); size_t getSizeStartSection(); diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index 5c29283081..561f2c2a11 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -661,7 +661,7 @@ void *DirectSubmissionHw::dispatchWorkloadSection(BatchBu void *returnCmd = batchBuffer.endCmdPtr; LinearStream relaxedOrderingReturnPtrCmdStream; - if (this->relaxedOrderingEnabled) { + if (this->relaxedOrderingEnabled && batchBuffer.hasRelaxedOrderingDependencies) { // preallocate and patch after start section auto relaxedOrderingReturnPtrCmds = ringCommandStream.getSpace(RelaxedOrderingHelper::getSizeReturnPtrRegs()); relaxedOrderingReturnPtrCmdStream.replaceBuffer(relaxedOrderingReturnPtrCmds, RelaxedOrderingHelper::getSizeReturnPtrRegs()); @@ -671,8 +671,8 @@ void *DirectSubmissionHw::dispatchWorkloadSection(BatchBu uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition(); - if (this->relaxedOrderingEnabled) { - dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer, batchBuffer.hasRelaxedOrderingDependencies); + if (this->relaxedOrderingEnabled && batchBuffer.hasRelaxedOrderingDependencies) { + dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer); } else { setReturnAddress(returnCmd, returnGpuPointer); } @@ -723,15 +723,13 @@ size_t DirectSubmissionHw::getSizeDispatchRelaxedOrdering } template -void DirectSubmissionHw::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr, bool hasRelaxedOrderingDependencies) { +void DirectSubmissionHw::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr) { LriHelper::program(&cmdStream, CS_GPR_R4, static_cast(returnPtr & 0xFFFF'FFFFULL), true); LriHelper::program(&cmdStream, CS_GPR_R4 + 4, static_cast(returnPtr >> 32), true); uint64_t returnPtrAfterTaskStoreSection = returnPtr; - if (hasRelaxedOrderingDependencies) { - returnPtrAfterTaskStoreSection += RelaxedOrderingHelper::getSizeTaskStoreSection(); - } + returnPtrAfterTaskStoreSection += RelaxedOrderingHelper::getSizeTaskStoreSection(); LriHelper::program(&cmdStream, CS_GPR_R3, static_cast(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true); LriHelper::program(&cmdStream, CS_GPR_R3 + 4, static_cast(returnPtrAfterTaskStoreSection >> 32), true); diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp index cd4c360a25..e48684e2e0 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp @@ -677,7 +677,7 @@ HWTEST_F(DirectSubmissionTest, givenDirectSubmissionAvailableWhenProgrammingEndi uint8_t buffer[128]; mockCsr->commandStream.replaceBuffer(&buffer[0], 128u); mockCsr->commandStream.replaceGraphicsAllocation(&mockAllocation); - mockCsr->programEndingCmd(mockCsr->commandStream, &location, ret, true); + mockCsr->programEndingCmd(mockCsr->commandStream, &location, ret, false, true); EXPECT_EQ(sizeof(MI_BATCH_BUFFER_START), mockCsr->commandStream.getUsed()); DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); @@ -718,7 +718,7 @@ HWTEST_F(DirectSubmissionTest, givenDebugFlagSetWhenProgrammingEndingCommandThen auto currectBbStartCmd = reinterpret_cast(cmdStream.getSpace(0)); uint64_t expectedGpuVa = cmdStream.getGraphicsAllocation()->getGpuAddress() + cmdStream.getUsed(); - mockCsr->programEndingCmd(cmdStream, &location, ret, true); + mockCsr->programEndingCmd(cmdStream, &location, ret, false, true); EncodeNoop::alignToCacheLine(cmdStream); if (value == 0) { diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index 30a8a89a50..c41f6fe83e 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -2095,7 +2095,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenProgrammingEndingCmdsThenSet auto endingPtr = commandStream.getSpace(0); - ultCsr->programEndingCmd(commandStream, &endingPtr, true, false); + ultCsr->programEndingCmd(commandStream, &endingPtr, true, true, false); auto lrrCmd = reinterpret_cast(commandStream.getCpuBase()); EXPECT_EQ(lrrCmd->getSourceRegisterAddress(), CS_GPR_R3); @@ -2109,6 +2109,28 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenProgrammingEndingCmdsThenSet EXPECT_EQ(1u, bbStartCmd->getIndirectAddressEnable()); } +HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithoutRelaxedOrderingDependencieswhenProgrammingEndingCmdsThenDontSetReturnRegisters, IsAtLeastXeHpcCore) { + using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG; + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + auto ultCsr = static_cast *>(pDevice->getDefaultEngine().commandStreamReceiver); + + auto directSubmission = new MockDirectSubmissionHw>(*ultCsr); + directSubmission->initialize(true, false); + + ultCsr->directSubmission.reset(directSubmission); + + auto &commandStream = ultCsr->getCS(0x100); + + auto endingPtr = commandStream.getSpace(0); + + ultCsr->programEndingCmd(commandStream, &endingPtr, true, false, false); + + auto bbStartCmd = genCmdCast(commandStream.getCpuBase()); + ASSERT_NE(nullptr, bbStartCmd); + EXPECT_EQ(0u, bbStartCmd->getIndirectAddressEnable()); +} + HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenDispatchingWorkloadSectionThenProgramReturnPtrs, IsAtLeastXeHpcCore) { using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; @@ -2136,6 +2158,21 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenDispatchingWorkloadSectionTh EXPECT_EQ(0, memcmp(&originalBbStart, batchBuffer.endCmdPtr, sizeof(MI_BATCH_BUFFER_START))); } +HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithoutRelaxedOrderingDependencieswhenDispatchingWorkloadSectionThenDontProgramReturnPtrs, IsAtLeastXeHpcCore) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + + MockDirectSubmissionHw> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + directSubmission.initialize(true, false); + + auto offset = directSubmission.ringCommandStream.getUsed(); + + batchBuffer.hasRelaxedOrderingDependencies = false; + directSubmission.dispatchWorkloadSection(batchBuffer); + + auto lriCmd = genCmdCast(ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset)); + EXPECT_EQ(nullptr, lriCmd); +} + HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithStallingCmdsAndDependenciesWhenDispatchingNextCmdBufferThenProgramSchedulerIfNeeded, IsAtLeastXeHpcCore) { { MockDirectSubmissionHw> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); @@ -2523,7 +2560,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenSchedulerRequiredWhenDispat size_t offset = directSubmission.ringCommandStream.getUsed(); - directSubmission.dispatchRelaxedOrderingReturnPtrRegs(directSubmission.ringCommandStream, returnPtr, true); + directSubmission.dispatchRelaxedOrderingReturnPtrRegs(directSubmission.ringCommandStream, returnPtr); auto lriCmd = reinterpret_cast(ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset)); EXPECT_TRUE(verifyLri(lriCmd, CS_GPR_R4, static_cast(returnPtr & 0xFFFF'FFFFULL))); @@ -2531,15 +2568,4 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenSchedulerRequiredWhenDispat EXPECT_TRUE(verifyLri(++lriCmd, CS_GPR_R3, static_cast(returnPtr2 & 0xFFFF'FFFFULL))); EXPECT_TRUE(verifyLri(++lriCmd, CS_GPR_R3 + 4, static_cast(returnPtr2 >> 32))); - - offset = directSubmission.ringCommandStream.getUsed(); - - directSubmission.dispatchRelaxedOrderingReturnPtrRegs(directSubmission.ringCommandStream, returnPtr, false); - - lriCmd = reinterpret_cast(ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset)); - EXPECT_TRUE(verifyLri(lriCmd, CS_GPR_R4, static_cast(returnPtr & 0xFFFF'FFFFULL))); - EXPECT_TRUE(verifyLri(++lriCmd, CS_GPR_R4 + 4, static_cast(returnPtr >> 32))); - - EXPECT_TRUE(verifyLri(++lriCmd, CS_GPR_R3, static_cast(returnPtr & 0xFFFF'FFFFULL))); - EXPECT_TRUE(verifyLri(++lriCmd, CS_GPR_R3 + 4, static_cast(returnPtr >> 32))); } \ No newline at end of file