diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 8205da8d75..419f94642b 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -147,6 +147,10 @@ struct CommandList : _ze_command_list_handle_t { return commandListPerThreadScratchSize; } + void setCommandListPerThreadScratchSize(uint32_t size) { + commandListPerThreadScratchSize = size; + } + NEO::PreemptionMode getCommandListPreemptionMode() const { return commandListPreemptionMode; } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index a4730ca004..01de7f4fa3 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -69,6 +69,11 @@ ze_result_t CommandListCoreFamily::reset() { removeHostPtrAllocations(); commandContainer.reset(); containsStatelessUncachedResource = false; + indirectAllocationsAllowed = false; + unifiedMemoryControls.indirectHostAllocationsAllowed = false; + unifiedMemoryControls.indirectSharedAllocationsAllowed = false; + commandListPreemptionMode = device->getDevicePreemptionMode(); + commandListPerThreadScratchSize = 0u; if (!isCopyOnly()) { if (!NEO::ApiSpecificConfig::getBindlessConfiguration()) { diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index 5598ed8ab7..e4c6cc71ed 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -57,8 +57,10 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z UNRECOVERABLE_IF(kernel == nullptr); appendEventForProfiling(hEvent, true); const auto functionImmutableData = kernel->getImmutableData(); - commandListPerThreadScratchSize = std::max(commandListPerThreadScratchSize, - kernel->getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0]); + auto perThreadScratchSize = std::max(this->getCommandListPerThreadScratchSize(), + kernel->getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0]); + + this->setCommandListPerThreadScratchSize(perThreadScratchSize); auto kernelPreemptionMode = obtainFunctionPreemptionMode(kernel); commandListPreemptionMode = std::min(commandListPreemptionMode, kernelPreemptionMode); diff --git a/level_zero/core/source/cmdqueue/cmdqueue.h b/level_zero/core/source/cmdqueue/cmdqueue.h index beca58232a..24c2ff5aa3 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.h +++ b/level_zero/core/source/cmdqueue/cmdqueue.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -54,7 +54,6 @@ struct CommandQueue : _ze_command_queue_handle_t { } protected: - std::atomic commandQueuePerThreadScratchSize; NEO::PreemptionMode commandQueuePreemptionMode = NEO::PreemptionMode::Initial; bool commandQueueDebugCmdsProgrammed = false; bool isCopyOnlyCommandQueue = false; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index 4d7925991f..f047e40355 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -37,7 +37,7 @@ struct CommandQueueHw : public CommandQueueImp { void programStateBaseAddress(uint64_t gsba, bool useLocalMemoryForIndirectHeap, NEO::LinearStream &commandStream); size_t estimateStateBaseAddressCmdSize(); - MOCKABLE_VIRTUAL void programFrontEnd(uint64_t scratchAddress, NEO::LinearStream &commandStream); + MOCKABLE_VIRTUAL void programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream); size_t estimateFrontEndCmdSize(); size_t estimatePipelineSelect(); @@ -46,7 +46,8 @@ struct CommandQueueHw : public CommandQueueImp { MOCKABLE_VIRTUAL void handleScratchSpace(NEO::ResidencyContainer &residency, NEO::HeapContainer &heapContainer, NEO::ScratchSpaceController *scratchController, - bool &gsbaState, bool &frontEndState); + bool &gsbaState, bool &frontEndState, + uint32_t perThreadScratchSpaceSize); }; } // namespace L0 diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 9f8d96a7df..c6281e062f 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -17,7 +17,6 @@ #include "shared/source/device/device.h" #include "shared/source/helpers/hw_helper.h" #include "shared/source/helpers/hw_info.h" -#include "shared/source/helpers/interlocked_max.h" #include "shared/source/helpers/preamble.h" #include "shared/source/memory_manager/memory_manager.h" #include "shared/source/memory_manager/residency_container.h" @@ -118,6 +117,7 @@ ze_result_t CommandQueueHw::executeCommandLists( device->activateMetricGroups(); size_t totalCmdBuffers = 0; + uint32_t perThreadScratchSpaceSize = 0; for (auto i = 0u; i < numCommandLists; i++) { auto commandList = CommandList::fromHandle(phCommandLists[i]); @@ -140,7 +140,10 @@ ze_result_t CommandQueueHw::executeCommandLists( statePreemption = commandListPreemption; } - interlockedMax(commandQueuePerThreadScratchSize, commandList->getCommandListPerThreadScratchSize()); + if (perThreadScratchSpaceSize < commandList->getCommandListPerThreadScratchSize()) { + perThreadScratchSpaceSize = commandList->getCommandListPerThreadScratchSize(); + } + if (commandList->getCommandListPerThreadScratchSize() != 0) { if (commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE) != nullptr) { heapContainer.push_back(commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE)->getGraphicsAllocation()); @@ -176,10 +179,11 @@ ze_result_t CommandQueueHw::executeCommandLists( handleScratchSpace(residencyContainer, heapContainer, scratchSpaceController, - gsbaStateDirty, frontEndStateDirty); + gsbaStateDirty, frontEndStateDirty, + perThreadScratchSpaceSize); gsbaStateDirty |= !gsbaInit; - frontEndStateDirty |= !frontEndInit; + frontEndStateDirty |= csr->getMediaVFEStateDirty(); if (!isCopyOnlyCommandQueue) { if (!gpgpuEnabled) { @@ -225,7 +229,7 @@ ze_result_t CommandQueueHw::executeCommandLists( } if (frontEndStateDirty) { - programFrontEnd(scratchSpaceController->getScratchPatchAddress(), child); + programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), child); } if (gsbaStateDirty) { auto indirectHeap = CommandList::fromHandle(phCommandLists[0])->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT); @@ -372,18 +376,18 @@ ze_result_t CommandQueueHw::executeCommandLists( } template -void CommandQueueHw::programFrontEnd(uint64_t scratchAddress, NEO::LinearStream &commandStream) { +void CommandQueueHw::programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream) { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; UNRECOVERABLE_IF(csr == nullptr); NEO::PreambleHelper::programVFEState(&commandStream, device->getHwInfo(), - commandQueuePerThreadScratchSize, + perThreadScratchSpaceSize, scratchAddress, device->getMaxNumHwThreads(), csr->getOsContext().getEngineType(), NEO::AdditionalKernelExecInfo::NotApplicable, NEO::KernelExecutionType::NotApplicable); - frontEndInit = true; + csr->setMediaVFEStateDirty(false); } template diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl index fbaae85813..6f43936e49 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl @@ -110,10 +110,11 @@ template void CommandQueueHw::handleScratchSpace(NEO::ResidencyContainer &residency, NEO::HeapContainer &heapContainer, NEO::ScratchSpaceController *scratchController, - bool &gsbaState, bool &frontEndState) { + bool &gsbaState, bool &frontEndState, + uint32_t perThreadScratchSpaceSize) { - if (commandQueuePerThreadScratchSize > 0) { - scratchController->setRequiredScratchSpace(nullptr, 0u, commandQueuePerThreadScratchSize, 0u, csr->peekTaskCount(), + if (perThreadScratchSpaceSize > 0) { + scratchController->setRequiredScratchSpace(nullptr, 0u, perThreadScratchSpaceSize, 0u, csr->peekTaskCount(), csr->getOsContext(), gsbaState, frontEndState); auto scratchAllocation = scratchController->getScratchSpaceAllocation(); residency.push_back(scratchAllocation); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_imp.h b/level_zero/core/source/cmdqueue/cmdqueue_imp.h index ff0bd80b41..940d33280b 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_imp.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_imp.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -62,7 +62,6 @@ struct CommandQueueImp : public CommandQueue { CommandQueueImp() = delete; CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc) : device(device), csr(csr), desc(*desc) { - std::atomic_init(&commandQueuePerThreadScratchSize, 0u); } ze_result_t destroy() override; @@ -95,7 +94,6 @@ struct CommandQueueImp : public CommandQueue { std::atomic taskCount{0}; std::vector printfFunctionContainer; bool gsbaInit = false; - bool frontEndInit = false; bool gpgpuEnabled = false; CommandBufferManager buffers; NEO::ResidencyContainer residencyContainer; diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h index 5a9b797215..c03ae86bd6 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -25,7 +25,6 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp { using BaseClass::device; using BaseClass::printfFunctionContainer; using BaseClass::synchronizeByPollingForTaskCount; - using CommandQueue::commandQueuePerThreadScratchSize; using CommandQueue::internalUsage; WhiteBox(Device *device, NEO::CommandStreamReceiver *csr, @@ -78,7 +77,6 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { using BaseClass = ::L0::CommandQueueHw; using BaseClass::commandStream; using BaseClass::printfFunctionContainer; - using L0::CommandQueue::commandQueuePerThreadScratchSize; using L0::CommandQueue::internalUsage; MockCommandQueueHw(L0::Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc) : L0::CommandQueueHw(device, csr, desc) { diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp index 0152ccd97d..537e267b75 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp @@ -735,11 +735,12 @@ class MockCommandQueue : public L0::CommandQueueHw { void handleScratchSpace(NEO::ResidencyContainer &residency, NEO::HeapContainer &heapContainer, NEO::ScratchSpaceController *scratchController, - bool &gsbaState, bool &frontEndState) override { + bool &gsbaState, bool &frontEndState, + uint32_t perThreadScratchSpaceSize) override { this->mockHeapContainer = heapContainer; } - void programFrontEnd(uint64_t scratchAddress, NEO::LinearStream &commandStream) override { + void programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream) override { return; } }; @@ -755,7 +756,7 @@ HWTEST2_F(CommandQueueDestroy, givenCommandQueueAndCommandListWithSshAndScratchW commandQueue->initialize(false, false); auto commandList = new CommandListCoreFamily(); commandList->initialize(device, NEO::EngineGroupType::Compute); - commandList->commandListPerThreadScratchSize = 100u; + commandList->setCommandListPerThreadScratchSize(100u); auto commandListHandle = commandList->toHandle(); void *alloc = alignedMalloc(0x100, 0x100); @@ -784,7 +785,7 @@ HWTEST2_F(CommandQueueDestroy, givenCommandQueueAndCommandListWithWhenBindlessEn commandQueue->initialize(false, false); auto commandList = new CommandListCoreFamily(); commandList->initialize(device, NEO::EngineGroupType::Compute); - commandList->commandListPerThreadScratchSize = 100u; + commandList->setCommandListPerThreadScratchSize(100u); auto commandListHandle = commandList->toHandle(); commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false); @@ -803,7 +804,7 @@ HWTEST2_F(ExecuteCommandListTests, givenExecuteCommandListWhenItReturnsThenConta commandQueue->initialize(false, false); auto commandList = new CommandListCoreFamily(); commandList->initialize(device, NEO::EngineGroupType::Compute); - commandList->commandListPerThreadScratchSize = 100u; + commandList->setCommandListPerThreadScratchSize(100u); auto commandListHandle = commandList->toHandle(); void *alloc = alignedMalloc(0x100, 0x100); @@ -823,6 +824,344 @@ HWTEST2_F(ExecuteCommandListTests, givenExecuteCommandListWhenItReturnsThenConta alignedFree(alloc); } +HWTEST2_F(ExecuteCommandListTests, givenCommandQueueHavingTwoB2BCommandListsThenMVSDirtyFlagIsSetOnlyOnce, CommandQueueExecuteTestSupport) { + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); + ze_result_t returnValue; + auto commandQueue = CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue); + auto commandList0 = new CommandListCoreFamily(); + commandList0->initialize(device, NEO::EngineGroupType::Compute); + commandList0->setCommandListPerThreadScratchSize(0u); + auto commandList1 = new CommandListCoreFamily(); + commandList1->initialize(device, NEO::EngineGroupType::Compute); + commandList1->setCommandListPerThreadScratchSize(0u); + auto commandListHandle0 = commandList0->toHandle(); + auto commandListHandle1 = commandList1->toHandle(); + + EXPECT_EQ(true, csr->getMediaVFEStateDirty()); + commandQueue->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(false, csr->getMediaVFEStateDirty()); + commandQueue->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(false, csr->getMediaVFEStateDirty()); + + commandQueue->destroy(); + commandList0->destroy(); + commandList1->destroy(); +} + +using CommandQueueExecuteSupport = IsWithinProducts; +HWTEST2_F(ExecuteCommandListTests, givenCommandQueueHavingTwoB2BCommandListsThenMVSIsProgrammedOnlyOnce, CommandQueueExecuteSupport) { + using MEDIA_VFE_STATE = typename FamilyType::MEDIA_VFE_STATE; + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); + ze_result_t returnValue; + auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + auto commandList0 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + commandList0->setCommandListPerThreadScratchSize(0u); + auto commandList1 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + commandList1->setCommandListPerThreadScratchSize(0u); + auto commandListHandle0 = commandList0->toHandle(); + auto commandListHandle1 = commandList1->toHandle(); + + ASSERT_NE(nullptr, commandQueue->commandStream); + + commandQueue->executeCommandLists(1, &commandListHandle0, nullptr, false); + commandQueue->executeCommandLists(1, &commandListHandle1, nullptr, false); + + auto usedSpaceAfter = commandQueue->commandStream->getUsed(); + + GenCmdList cmdList1; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList1, ptrOffset(commandQueue->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + auto mediaVfeStates = findAll(cmdList1.begin(), cmdList1.end()); + // We should have only 1 state added + ASSERT_EQ(1u, mediaVfeStates.size()); + + commandQueue->destroy(); +} + +HWTEST2_F(ExecuteCommandListTests, givenTwoCommandQueuesHavingTwoB2BCommandListsWithPTSSsetForFirstCmdListThenMVSIsProgrammedOnlyOnce, CommandQueueExecuteSupport) { + using MEDIA_VFE_STATE = typename FamilyType::MEDIA_VFE_STATE; + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); + ze_result_t returnValue; + auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + auto commandList0 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + auto commandList1 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + commandList0->setCommandListPerThreadScratchSize(512u); + commandList1->setCommandListPerThreadScratchSize(0u); + auto commandListHandle0 = commandList0->toHandle(); + auto commandListHandle1 = commandList1->toHandle(); + + commandQueue->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + commandQueue->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + auto usedSpaceAfter = commandQueue->commandStream->getUsed(); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + auto mediaVfeStates = findAll(cmdList.begin(), cmdList.end()); + // We should have only 1 state added + ASSERT_EQ(1u, mediaVfeStates.size()); + + commandList0->reset(); + commandList0->setCommandListPerThreadScratchSize(0u); + commandList1->reset(); + commandList1->setCommandListPerThreadScratchSize(0u); + + auto commandQueue1 = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + + commandQueue1->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + commandQueue1->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + usedSpaceAfter = commandQueue1->commandStream->getUsed(); + + GenCmdList cmdList1; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList1, ptrOffset(commandQueue1->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + mediaVfeStates = findAll(cmdList1.begin(), cmdList1.end()); + // We should have no state added + ASSERT_EQ(0u, mediaVfeStates.size()); + + commandQueue->destroy(); + commandQueue1->destroy(); +} + +HWTEST2_F(ExecuteCommandListTests, givenTwoCommandQueuesHavingTwoB2BCommandListsAndWithPTSSsetForSecondCmdListThenMVSIsProgrammedTwice, CommandQueueExecuteSupport) { + using MEDIA_VFE_STATE = typename FamilyType::MEDIA_VFE_STATE; + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); + ze_result_t returnValue; + auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + auto commandList0 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + auto commandList1 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + commandList0->setCommandListPerThreadScratchSize(0u); + commandList1->setCommandListPerThreadScratchSize(512u); + auto commandListHandle0 = commandList0->toHandle(); + auto commandListHandle1 = commandList1->toHandle(); + + commandQueue->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(0u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + commandQueue->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + auto usedSpaceAfter = commandQueue->commandStream->getUsed(); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + auto mediaVfeStates = findAll(cmdList.begin(), cmdList.end()); + // We should have 2 states added + ASSERT_EQ(2u, mediaVfeStates.size()); + + commandList0->reset(); + commandList0->setCommandListPerThreadScratchSize(512u); + commandList1->reset(); + commandList1->setCommandListPerThreadScratchSize(0u); + + auto commandQueue1 = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + + commandQueue1->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + commandQueue1->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + usedSpaceAfter = commandQueue1->commandStream->getUsed(); + + GenCmdList cmdList1; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList1, ptrOffset(commandQueue1->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + mediaVfeStates = findAll(cmdList1.begin(), cmdList1.end()); + // We should have no state added + ASSERT_EQ(0u, mediaVfeStates.size()); + + commandQueue->destroy(); + commandQueue1->destroy(); +} + +HWTEST2_F(ExecuteCommandListTests, givenTwoCommandQueuesHavingTwoB2BCommandListsAndWithPTSSGrowingThenMVSIsProgrammedTwice, CommandQueueExecuteSupport) { + using MEDIA_VFE_STATE = typename FamilyType::MEDIA_VFE_STATE; + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); + ze_result_t returnValue; + auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + auto commandList0 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + auto commandList1 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + commandList0->setCommandListPerThreadScratchSize(512u); + commandList1->setCommandListPerThreadScratchSize(512u); + auto commandListHandle0 = commandList0->toHandle(); + auto commandListHandle1 = commandList1->toHandle(); + + commandQueue->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + commandQueue->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + auto usedSpaceAfter = commandQueue->commandStream->getUsed(); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + auto mediaVfeStates = findAll(cmdList.begin(), cmdList.end()); + // We should have only 1 state added + ASSERT_EQ(1u, mediaVfeStates.size()); + + commandList0->reset(); + commandList0->setCommandListPerThreadScratchSize(1024u); + commandList1->reset(); + commandList1->setCommandListPerThreadScratchSize(1024u); + + auto commandQueue1 = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + + commandQueue1->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(1024u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + commandQueue1->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(1024u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + usedSpaceAfter = commandQueue1->commandStream->getUsed(); + + GenCmdList cmdList1; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList1, ptrOffset(commandQueue1->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + mediaVfeStates = findAll(cmdList1.begin(), cmdList1.end()); + // We should have only 1 state added + ASSERT_EQ(1u, mediaVfeStates.size()); + + commandQueue->destroy(); + commandQueue1->destroy(); +} + +HWTEST2_F(ExecuteCommandListTests, givenTwoCommandQueuesHavingTwoB2BCommandListsAndWithPTSSUniquePerCmdListThenMVSIsProgrammedOncePerSubmission, CommandQueueExecuteSupport) { + using MEDIA_VFE_STATE = typename FamilyType::MEDIA_VFE_STATE; + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); + ze_result_t returnValue; + auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + auto commandList0 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + auto commandList1 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue))); + commandList0->setCommandListPerThreadScratchSize(0u); + commandList1->setCommandListPerThreadScratchSize(512u); + auto commandListHandle0 = commandList0->toHandle(); + auto commandListHandle1 = commandList1->toHandle(); + + commandQueue->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(0u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + commandQueue->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + auto usedSpaceAfter = commandQueue->commandStream->getUsed(); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + auto mediaVfeStates = findAll(cmdList.begin(), cmdList.end()); + // We should have 2 states added + ASSERT_EQ(2u, mediaVfeStates.size()); + + commandList0->reset(); + commandList0->setCommandListPerThreadScratchSize(1024u); + commandList1->reset(); + commandList1->setCommandListPerThreadScratchSize(2048u); + + auto commandQueue1 = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + commandQueue1->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(1024u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + commandQueue1->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(2048u, csr->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + usedSpaceAfter = commandQueue1->commandStream->getUsed(); + + GenCmdList cmdList1; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList1, ptrOffset(commandQueue1->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + mediaVfeStates = findAll(cmdList1.begin(), cmdList1.end()); + // We should have 2 states added + ASSERT_EQ(2u, mediaVfeStates.size()); + + commandQueue->destroy(); + commandQueue1->destroy(); +} + using CommandQueueSynchronizeTest = Test; HWTEST_F(CommandQueueSynchronizeTest, givenCallToSynchronizeThenCorrectEnableTimeoutAndTimeoutValuesAreUsed) { diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueuecommandlist.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueuecommandlist.cpp index 3e41a6609b..7a1bb61a52 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueuecommandlist.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueuecommandlist.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -195,5 +195,68 @@ HWTEST_F(CommandQueueExecuteCommandLists, whenExecutingCommandListsThenEndingPip commandQueue->destroy(); } +using CommandQueueExecuteSupport = IsWithinProducts; +HWTEST2_F(CommandQueueExecuteCommandLists, givenCommandQueueHaving2CommandListsThenMVSIsProgrammedWithMaxPTSS, CommandQueueExecuteSupport) { + using MEDIA_VFE_STATE = typename FamilyType::MEDIA_VFE_STATE; + using PARSE = typename FamilyType::PARSE; + ze_command_queue_desc_t desc = {}; + ze_result_t returnValue; + auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, + device, + neoDevice->getDefaultEngine().commandStreamReceiver, + &desc, + false, + false, + returnValue)); + + CommandList::fromHandle(commandLists[0])->setCommandListPerThreadScratchSize(512u); + CommandList::fromHandle(commandLists[1])->setCommandListPerThreadScratchSize(1024u); + + ASSERT_NE(nullptr, commandQueue->commandStream); + auto usedSpaceBefore = commandQueue->commandStream->getUsed(); + + auto result = commandQueue->executeCommandLists(numCommandLists, commandLists, nullptr, true); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(1024u, neoDevice->getDefaultEngine().commandStreamReceiver->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + auto usedSpaceAfter = commandQueue->commandStream->getUsed(); + ASSERT_GT(usedSpaceAfter, usedSpaceBefore); + + GenCmdList cmdList; + ASSERT_TRUE(PARSE::parseCommandBuffer(cmdList, + ptrOffset(commandQueue->commandStream->getCpuBase(), 0), + usedSpaceAfter)); + + auto mediaVfeStates = findAll(cmdList.begin(), cmdList.end()); + // We should have only 1 state added + ASSERT_EQ(1u, mediaVfeStates.size()); + + CommandList::fromHandle(commandLists[0])->reset(); + CommandList::fromHandle(commandLists[1])->reset(); + CommandList::fromHandle(commandLists[0])->setCommandListPerThreadScratchSize(2048u); + CommandList::fromHandle(commandLists[1])->setCommandListPerThreadScratchSize(1024u); + + ASSERT_NE(nullptr, commandQueue->commandStream); + usedSpaceBefore = commandQueue->commandStream->getUsed(); + + result = commandQueue->executeCommandLists(numCommandLists, commandLists, nullptr, true); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(2048u, neoDevice->getDefaultEngine().commandStreamReceiver->getScratchSpaceController()->getPerThreadScratchSpaceSize()); + + usedSpaceAfter = commandQueue->commandStream->getUsed(); + ASSERT_GT(usedSpaceAfter, usedSpaceBefore); + + GenCmdList cmdList1; + ASSERT_TRUE(PARSE::parseCommandBuffer(cmdList1, + ptrOffset(commandQueue->commandStream->getCpuBase(), 0), + usedSpaceAfter)); + + mediaVfeStates = findAll(cmdList1.begin(), cmdList1.end()); + // We should have 2 states added + ASSERT_EQ(2u, mediaVfeStates.size()); + + commandQueue->destroy(); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/debugger/test_l0_debugger.cpp b/level_zero/core/test/unit_tests/sources/debugger/test_l0_debugger.cpp index 4a6bec0fe5..a1d29d75fd 100644 --- a/level_zero/core/test/unit_tests/sources/debugger/test_l0_debugger.cpp +++ b/level_zero/core/test/unit_tests/sources/debugger/test_l0_debugger.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -178,12 +178,12 @@ HWTEST2_F(L0DebuggerTest, givenDebuggingEnabledAndRequiredGsbaWhenCommandListIsE GTEST_SKIP(); } - commandQueue->commandQueuePerThreadScratchSize = 4096; - auto usedSpaceBefore = commandQueue->commandStream->getUsed(); ze_command_list_handle_t commandLists[] = { CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, returnValue)->toHandle()}; + CommandList::fromHandle(commandLists[0])->setCommandListPerThreadScratchSize(4096); + uint32_t numCommandLists = sizeof(commandLists) / sizeof(commandLists[0]); auto result = commandQueue->executeCommandLists(numCommandLists, commandLists, nullptr, true); diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 936a114ae2..ed364f7077 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -131,6 +131,7 @@ class CommandStreamReceiver { void overrideDispatchPolicy(DispatchMode overrideValue) { this->dispatchMode = overrideValue; } void setMediaVFEStateDirty(bool dirty) { mediaVfeStateDirty = dirty; } + bool getMediaVFEStateDirty() { return mediaVfeStateDirty; } void setRequiredScratchSizes(uint32_t newRequiredScratchSize, uint32_t newRequiredPrivateScratchSize); GraphicsAllocation *getScratchAllocation(); diff --git a/shared/source/command_stream/scratch_space_controller.h b/shared/source/command_stream/scratch_space_controller.h index 1145dbd0a9..8893772186 100644 --- a/shared/source/command_stream/scratch_space_controller.h +++ b/shared/source/command_stream/scratch_space_controller.h @@ -49,6 +49,9 @@ class ScratchSpaceController { virtual uint64_t calculateNewGSH() = 0; virtual uint64_t getScratchPatchAddress() = 0; + inline uint32_t getPerThreadScratchSpaceSize() { + return static_cast(scratchSizeBytes / computeUnitsUsedForScratch); + } virtual void reserveHeap(IndirectHeap::Type heapType, IndirectHeap *&indirectHeap) = 0; virtual void programHeaps(HeapContainer &heapContainer, diff --git a/shared/source/command_stream/scratch_space_controller_base.cpp b/shared/source/command_stream/scratch_space_controller_base.cpp index e58ea588f0..b9b2c1ef31 100644 --- a/shared/source/command_stream/scratch_space_controller_base.cpp +++ b/shared/source/command_stream/scratch_space_controller_base.cpp @@ -31,7 +31,7 @@ void ScratchSpaceControllerBase::setRequiredScratchSpace(void *sshBaseAddress, bool &stateBaseAddressDirty, bool &vfeStateDirty) { size_t requiredScratchSizeInBytes = requiredPerThreadScratchSize * computeUnitsUsedForScratch; - if (requiredScratchSizeInBytes && (!scratchAllocation || scratchSizeBytes < requiredScratchSizeInBytes)) { + if (requiredScratchSizeInBytes && (scratchSizeBytes < requiredScratchSizeInBytes)) { if (scratchAllocation) { scratchAllocation->updateTaskCount(currentTaskCount, osContext.getContextId()); csrAllocationStorage.storeAllocation(std::unique_ptr(scratchAllocation), TEMPORARY_ALLOCATION);