refactor: change queue interfaces to provide different scratch controller

Related-To: NEO-10381

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2024-03-27 20:58:13 +00:00
committed by Compute-Runtime-Automation
parent 96abe38c6d
commit 94cf31033c
9 changed files with 104 additions and 22 deletions

View File

@@ -52,6 +52,7 @@ struct CommandQueueHw : public CommandQueueImp {
MOCKABLE_VIRTUAL void handleScratchSpace(NEO::HeapContainer &heapContainer,
NEO::ScratchSpaceController *scratchController,
NEO::GraphicsAllocation *globalStatelessAllocation,
bool &gsbaState, bool &frontEndState,
uint32_t perThreadScratchSpaceSlot0Size,
uint32_t perThreadScratchSpaceSlot1Size);
@@ -68,6 +69,8 @@ struct CommandQueueHw : public CommandQueueImp {
uint32_t numCommandLists,
NEO::PreemptionMode contextPreemptionMode,
Device *device,
NEO::ScratchSpaceController *scratchSpaceController,
NEO::GraphicsAllocation *globalStatelessAllocation,
bool debugEnabled,
bool programActivePartitionConfig,
bool performMigration,
@@ -83,6 +86,8 @@ struct CommandQueueHw : public CommandQueueImp {
CommandList *firstCommandList = nullptr;
CommandList *lastCommandList = nullptr;
void *currentPatchForChainedBbStart = nullptr;
NEO::ScratchSpaceController *scratchSpaceController = nullptr;
NEO::GraphicsAllocation *globalStatelessAllocation = nullptr;
NEO::PreemptionMode preemptionMode{};
NEO::PreemptionMode statePreemption{};
@@ -109,9 +114,10 @@ struct CommandQueueHw : public CommandQueueImp {
bool hasIndirectAccess{};
bool rtDispatchRequired = false;
bool globalInit = false;
bool lockScratchController = false;
};
ze_result_t executeCommandListsRegular(CommandListExecutionContext &ctx,
MOCKABLE_VIRTUAL ze_result_t executeCommandListsRegular(CommandListExecutionContext &ctx,
uint32_t numCommandLists,
ze_command_list_handle_t *commandListHandles,
ze_fence_handle_t hFence,
@@ -144,7 +150,7 @@ struct CommandQueueHw : public CommandQueueImp {
MOCKABLE_VIRTUAL ze_result_t makeAlignedChildStreamAndSetGpuBase(NEO::LinearStream &child, size_t requiredSize);
inline void getGlobalFenceAndMakeItResident();
inline void getWorkPartitionAndMakeItResident();
inline void getGlobalStatelessHeapAndMakeItResident();
inline void getGlobalStatelessHeapAndMakeItResident(CommandListExecutionContext &ctx);
inline void getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(NEO::LinearStream &commandStream);
inline void makeSbaTrackingBufferResidentIfL0DebuggerEnabled(bool isDebugEnabled);
inline void programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled(bool isDebugEnabled, NEO::LinearStream &commandStream);

View File

@@ -69,24 +69,35 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
auto ret = ZE_RESULT_SUCCESS;
auto lockCSR = this->csr->obtainUniqueOwnership();
auto neoDevice = device->getNEODevice();
if (NEO::ApiSpecificConfig::isSharedAllocPrefetchEnabled()) {
auto svmAllocMgr = device->getDriverHandle()->getSvmAllocsManager();
svmAllocMgr->prefetchSVMAllocs(*device->getNEODevice(), *csr);
svmAllocMgr->prefetchSVMAllocs(*neoDevice, *csr);
}
registerCsrClient();
auto neoDevice = device->getNEODevice();
auto scratchController = this->csr->getScratchSpaceController();
auto globalStatelessHeapAllocation = this->csr->getGlobalStatelessHeapAllocation();
bool lockScratchController = false;
if (this->heaplessModeEnabled) {
scratchController = neoDevice->getDefaultEngine().commandStreamReceiver->getScratchSpaceController();
globalStatelessHeapAllocation = neoDevice->getDefaultEngine().commandStreamReceiver->getGlobalStatelessHeapAllocation();
lockScratchController = scratchController != this->csr->getScratchSpaceController();
}
auto ctx = CommandListExecutionContext{phCommandLists,
numCommandLists,
this->isCopyOnlyCommandQueue ? NEO::PreemptionMode::Disabled : csr->getPreemptionMode(),
device,
scratchController,
globalStatelessHeapAllocation,
NEO::Debugger::isDebugEnabled(internalUsage),
csr->isProgramActivePartitionConfigRequired(),
performMigration,
csr->getSipSentFlag()};
ctx.globalInit |= ctx.isDebugEnabled && !this->commandQueueDebugCmdsProgrammed && device->getL0Debugger();
ctx.lockScratchController = lockScratchController;
this->startingCmdBuffer = &this->commandStream;
this->device->activateMetricGroups();
@@ -160,7 +171,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
this->getGlobalFenceAndMakeItResident();
this->getWorkPartitionAndMakeItResident();
this->getGlobalStatelessHeapAndMakeItResident();
this->getGlobalStatelessHeapAndMakeItResident(ctx);
this->makePreemptionAllocationResidentForModeMidThread(ctx.isDevicePreemptionModeMidThread);
this->makeSipIsaResidentIfSipKernelUsed(ctx);
this->makeDebugSurfaceResidentIfNEODebuggerActive(ctx.isNEODebuggerActive(this->device));
@@ -473,10 +484,14 @@ CommandQueueHw<gfxCoreFamily>::CommandListExecutionContext::CommandListExecution
uint32_t numCommandLists,
NEO::PreemptionMode contextPreemptionMode,
Device *device,
NEO::ScratchSpaceController *scratchSpaceController,
NEO::GraphicsAllocation *globalStatelessAllocation,
bool debugEnabled,
bool programActivePartitionConfig,
bool performMigration,
bool sipSent) : preemptionMode{contextPreemptionMode},
bool sipSent) : scratchSpaceController(scratchSpaceController),
globalStatelessAllocation(globalStatelessAllocation),
preemptionMode{contextPreemptionMode},
statePreemption{contextPreemptionMode},
isPreemptionModeInitial{contextPreemptionMode == NEO::PreemptionMode::Initial},
isDebugEnabled{debugEnabled},
@@ -689,13 +704,17 @@ void CommandQueueHw<gfxCoreFamily>::setFrontEndStateProperties(CommandListExecut
// Runs scratch-space handling for one submission and folds the outcome back
// into the execution context: ctx.gsbaStateDirty and ctx.frontEndStateDirty are
// passed by reference to handleScratchSpace, ctx.scratchGsba is refreshed from
// the scratch controller, and ctx.globalInit is raised when GSBA state is dirty.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx) {
auto scratchController = this->csr->getScratchSpaceController();
// The lock object starts out owning nothing. It takes unique ownership of the
// default engine's CSR only when ctx.lockScratchController is set, which
// executeCommandLists does when the scratch controller in use is not the one
// returned by this queue's own CSR. Ownership is released when the lock object
// goes out of scope at the end of this function.
std::unique_lock<NEO::CommandStreamReceiver::MutexType> defaultCsrLock;
if (ctx.lockScratchController) {
defaultCsrLock = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->obtainUniqueOwnership();
}
handleScratchSpace(this->heapContainer,
scratchController,
ctx.scratchSpaceController,
ctx.globalStatelessAllocation,
ctx.gsbaStateDirty, ctx.frontEndStateDirty,
ctx.perThreadScratchSpaceSlot0Size, ctx.perThreadScratchSpaceSlot1Size);
// The GSBA dirty flag is additionally OR-ed with the state reported by this
// queue's CSR, independent of which scratch controller was used above.
ctx.gsbaStateDirty |= this->csr->getGSBAStateDirty();
ctx.scratchGsba = scratchController->calculateNewGSH();
ctx.scratchGsba = ctx.scratchSpaceController->calculateNewGSH();
ctx.globalInit |= ctx.gsbaStateDirty;
}
@@ -814,10 +833,9 @@ void CommandQueueHw<gfxCoreFamily>::getWorkPartitionAndMakeItResident() {
}
// Makes the global stateless heap allocation resident on this queue's CSR.
// Does nothing when the allocation pointer is null.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::getGlobalStatelessHeapAndMakeItResident() {
const auto globalStatelessAllocation = this->csr->getGlobalStatelessHeapAllocation();
if (globalStatelessAllocation) {
this->csr->makeResident(*globalStatelessAllocation);
void CommandQueueHw<gfxCoreFamily>::getGlobalStatelessHeapAndMakeItResident(CommandListExecutionContext &ctx) {
if (ctx.globalStatelessAllocation) {
this->csr->makeResident(*ctx.globalStatelessAllocation);
}
}

View File

@@ -120,6 +120,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStateBaseAddressCmdSize() {
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::HeapContainer &heapContainer,
NEO::ScratchSpaceController *scratchController,
NEO::GraphicsAllocation *globalStatelessAllocation,
bool &gsbaState, bool &frontEndState,
uint32_t perThreadScratchSpaceSlot0Size, uint32_t perThreadScratchSpaceSlot1Size) {

View File

@@ -135,12 +135,12 @@ constexpr uint32_t maxPtssIndex = 15u;
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::HeapContainer &sshHeaps,
NEO::ScratchSpaceController *scratchController,
NEO::GraphicsAllocation *globalStatelessAllocation,
bool &gsbaState, bool &frontEndState,
uint32_t perThreadScratchSpaceSlot0Size, uint32_t perThreadScratchSpaceSlot1Size) {
if (perThreadScratchSpaceSlot0Size > 0 || perThreadScratchSpaceSlot1Size > 0) {
if (this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) {
auto globalStatelessHeapAllocation = csr->getGlobalStatelessHeapAllocation();
scratchController->setRequiredScratchSpace(globalStatelessHeapAllocation->getUnderlyingBuffer(), 0, perThreadScratchSpaceSlot0Size, perThreadScratchSpaceSlot1Size, csr->peekTaskCount(),
scratchController->setRequiredScratchSpace(globalStatelessAllocation->getUnderlyingBuffer(), 0, perThreadScratchSpaceSlot0Size, perThreadScratchSpaceSlot1Size, csr->peekTaskCount(),
csr->getOsContext(), gsbaState, frontEndState);
}
if (sshHeaps.size() > 0) {

View File

@@ -78,6 +78,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
using L0::CommandQueue::dispatchCmdListBatchBufferAsPrimary;
using L0::CommandQueue::doubleSbaWa;
using L0::CommandQueue::frontEndStateTracking;
using L0::CommandQueue::heaplessModeEnabled;
using L0::CommandQueue::internalQueueForImmediateCommandList;
using L0::CommandQueue::internalUsage;
using L0::CommandQueue::partitionCount;
@@ -114,11 +115,26 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
return BaseClass::submitBatchBuffer(offset, residencyContainer, endingCmdPtr, isCooperative);
}
// Test hook: captures the scratch-related fields of the execution context
// (global stateless allocation, scratch controller, lock flag) into the
// recorded* members, then forwards the call unchanged to the base class.
ze_result_t executeCommandListsRegular(CommandListExecutionContext &ctx,
uint32_t numCommandLists,
ze_command_list_handle_t *commandListHandles,
ze_fence_handle_t hFence,
ze_event_handle_t hSignalEvent, uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) override {
recordedGlobalStatelessAllocation = ctx.globalStatelessAllocation;
recordedScratchController = ctx.scratchSpaceController;
recordedLockScratchController = ctx.lockScratchController;
return BaseClass::executeCommandListsRegular(ctx, numCommandLists, commandListHandles, hFence, hSignalEvent, numWaitEvents, phWaitEvents);
}
NEO::GraphicsAllocation *recordedGlobalStatelessAllocation = nullptr;
NEO::ScratchSpaceController *recordedScratchController = nullptr;
uint32_t synchronizedCalled = 0;
NEO::ResidencyContainer residencyContainerSnapshot;
ze_result_t synchronizeReturnValue{ZE_RESULT_SUCCESS};
std::optional<NEO::WaitStatus> reserveLinearStreamSizeReturnValue{};
std::optional<NEO::SubmissionStatus> submitBatchBufferReturnValue{};
bool recordedLockScratchController = false;
};
struct Deleter {

View File

@@ -12,6 +12,8 @@
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/test/common/helpers/engine_descriptor_helper.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_command_stream_receiver.h"
@@ -2784,5 +2786,37 @@ HWTEST2_F(CommandListStateBaseAddressGlobalStatelessTest,
EXPECT_EQ(nullptr, ssh);
}
// Verifies that a queue running on a non-default CSR with heaplessModeEnabled
// set hands the default engine's scratch controller and global stateless heap
// allocation to executeCommandListsRegular, and requests locking of the
// scratch controller.
HWTEST2_F(CommandListStateBaseAddressGlobalStatelessTest,
givenCommandQueueUsingGlobalStatelessWhenQueueInHeaplessModeThenUsingScratchControllerAndHeapAllocationFromDefaultEngine,
IsAtLeastXeHpCore) {
auto defaultCsr = neoDevice->getDefaultEngine().commandStreamReceiver;
defaultCsr->createGlobalStatelessHeap();
// Build a second CSR, distinct from the default engine's, and give it its own
// global stateless heap so the two CSRs' resources can be told apart.
auto otherCsr = std::unique_ptr<UltCommandStreamReceiver<FamilyType>>(static_cast<UltCommandStreamReceiver<FamilyType> *>(createCommandStream(*device->getNEODevice()->getExecutionEnvironment(), 0, 1)));
otherCsr->setupContext(*neoDevice->getDefaultEngine().osContext);
otherCsr->initializeResources();
otherCsr->initializeTagAllocation();
otherCsr->createGlobalFenceAllocation();
otherCsr->createPreemptionAllocation();
otherCsr->createGlobalStatelessHeap();
ze_command_queue_desc_t desc = {};
auto otherCommandQueue = new MockCommandQueueHw<gfxCoreFamily>(device, otherCsr.get(), &desc);
otherCommandQueue->initialize(false, false, false);
// Heapless mode is forced on the mock after initialize().
otherCommandQueue->heaplessModeEnabled = true;
commandList->close();
ze_command_list_handle_t cmdListHandle = commandList->toHandle();
auto result = otherCommandQueue->executeCommandLists(1, &cmdListHandle, nullptr, true, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// The mock recorded what reached executeCommandListsRegular; it must be the
// default CSR's resources, not those of otherCsr.
EXPECT_EQ(defaultCsr->getScratchSpaceController(), otherCommandQueue->recordedScratchController);
EXPECT_EQ(defaultCsr->getGlobalStatelessHeapAllocation(), otherCommandQueue->recordedGlobalStatelessAllocation);
EXPECT_TRUE(otherCommandQueue->recordedLockScratchController);
// The queue was allocated with new above and is released through destroy().
otherCommandQueue->destroy();
}
} // namespace ult
} // namespace L0

View File

@@ -1110,6 +1110,7 @@ class MockCommandQueue : public L0::CommandQueueHw<gfxCoreFamily> {
NEO::HeapContainer mockHeapContainer;
void handleScratchSpace(NEO::HeapContainer &heapContainer,
NEO::ScratchSpaceController *scratchController,
NEO::GraphicsAllocation *globalStatelessAllocation,
bool &gsbaState, bool &frontEndState,
uint32_t perThreadScratchSpaceSlot0Size,
uint32_t perThreadScratchSpaceSlot1Size) override {

View File

@@ -879,7 +879,7 @@ HWTEST2_F(CommandQueueScratchTests, givenCommandQueueWhenHandleScratchSpaceThenP
auto scratch = static_cast<MockScratchSpaceControllerXeHPAndLater *>(scratchController.get());
scratch->scratchAllocation = &graphicsAllocation;
commandQueueHw->handleScratchSpace(heapContainer, scratchController.get(), gsbaStateDirty, frontEndStateDirty, 0x1000, 0u);
commandQueueHw->handleScratchSpace(heapContainer, scratchController.get(), nullptr, gsbaStateDirty, frontEndStateDirty, 0x1000, 0u);
EXPECT_TRUE(scratch->programHeapsCalled);
EXPECT_GT(csr.makeResidentCalledTimes, 0u);
@@ -933,7 +933,7 @@ HWTEST2_F(CommandQueueScratchTests, givenCommandQueueWhenHandleScratchSpaceAndHe
auto scratch = static_cast<MockScratchSpaceControllerXeHPAndLater *>(scratchController.get());
scratch->scratchSlot0Allocation = &graphicsAllocation;
commandQueueHw->handleScratchSpace(heapContainer, scratchController.get(), gsbaStateDirty, frontEndStateDirty, 0x1000, 0u);
commandQueueHw->handleScratchSpace(heapContainer, scratchController.get(), nullptr, gsbaStateDirty, frontEndStateDirty, 0x1000, 0u);
EXPECT_FALSE(scratch->programHeapsCalled);
scratch->scratchSlot0Allocation = nullptr;

View File

@@ -818,6 +818,8 @@ HWTEST2_F(CommandQueueIndirectAllocations, givenCtxWithIndirectAccessWhenExecuti
1,
csr->getPreemptionMode(),
device,
csr->getScratchSpaceController(),
csr->getGlobalStatelessHeapAllocation(),
false,
csr->isProgramActivePartitionConfigRequired(),
false,
@@ -844,6 +846,8 @@ HWTEST2_F(CommandQueueIndirectAllocations, givenCtxWitNohIndirectAccessWhenExecu
1,
csr->getPreemptionMode(),
device,
csr->getScratchSpaceController(),
csr->getGlobalStatelessHeapAllocation(),
false,
csr->isProgramActivePartitionConfigRequired(),
false,
@@ -871,6 +875,8 @@ HWTEST2_F(CommandQueueIndirectAllocations, givenCommandQueueWhenHandleIndirectAl
1,
csr->getPreemptionMode(),
device,
csr->getScratchSpaceController(),
csr->getGlobalStatelessHeapAllocation(),
false,
csr->isProgramActivePartitionConfigRequired(),
false,