feature: introduce heapless state init in L0

Related-To: NEO-7824
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk
2024-03-29 03:22:39 +00:00
committed by Compute-Runtime-Automation
parent 8ef6cdbabd
commit 4eae28bd64
25 changed files with 291 additions and 63 deletions

View File

@@ -346,6 +346,10 @@ struct CommandList : _ze_command_list_handle_t {
return heaplessModeEnabled;
}
bool isHeaplessStateInitEnabled() const {
return heaplessStateInitEnabled;
}
virtual bool skipInOrderNonWalkerSignalingAllowed(ze_event_handle_t signalEvent) const { return false; }
bool getCmdListBatchBufferFlag() const {
@@ -436,6 +440,7 @@ struct CommandList : _ze_command_list_handle_t {
bool copyThroughLockedPtrEnabled = false;
bool useOnlyGlobalTimestamps = false;
bool heaplessModeEnabled = false;
bool heaplessStateInitEnabled = false;
};
using CommandListAllocatorFn = CommandList *(*)(uint32_t);

View File

@@ -86,11 +86,13 @@ CommandListCoreFamily<gfxCoreFamily>::~CommandListCoreFamily() {
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::postInitComputeSetup() {
if (!this->stateBaseAddressTracking) {
if (!this->stateBaseAddressTracking && !this->heaplessStateInitEnabled) {
if (!this->isFlushTaskSubmissionEnabled) {
programStateBaseAddress(commandContainer, false);
}
}
commandContainer.setDirtyStateForAllHeaps(false);
setStreamPropertiesDefaultSettings(requiredStreamState);
@@ -235,6 +237,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::initialize(Device *device, NEO
this->useOnlyGlobalTimestamps = gfxCoreHelper.useOnlyGlobalTimestamps();
this->maxFillPaternSizeForCopyEngine = gfxCoreHelper.getMaxFillPaternSizeForCopyEngine();
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled();
this->requiredStreamState.initSupport(rootDeviceEnvironment);
this->finalStreamState.initSupport(rootDeviceEnvironment);
this->duplicatedInOrderCounterStorageEnabled = gfxCoreHelper.duplicatedInOrderCounterStorageEnabled(rootDeviceEnvironment);

View File

@@ -13,6 +13,7 @@
#include "level_zero/core/source/cmdlist/cmdlist_hw.h"
#include <atomic>
#include <functional>
namespace NEO {
struct SvmAllocationData;
@@ -219,6 +220,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
MOCKABLE_VIRTUAL void checkAssert();
ComputeFlushMethodType computeFlushMethod = nullptr;
std::function<NEO::CompletionStamp(NEO::LinearStream &, size_t, NEO::ImmediateDispatchFlags &, NEO::Device &)> flushImmediateTaskMethod;
std::atomic<bool> dependenciesPresent{false};
bool latestFlushIsHostVisible = false;
};

View File

@@ -40,6 +40,7 @@
#include "encode_surface_state_args.h"
#include <cmath>
#include <functional>
namespace L0 {
@@ -209,11 +210,10 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmedia
};
CommandListImp::storeReferenceTsToMappedEvents(true);
return this->csr->flushImmediateTask(
cmdStreamTask,
taskStartOffset,
dispatchFlags,
*(this->device->getNEODevice()));
return this->flushImmediateTaskMethod(cmdStreamTask,
taskStartOffset,
dispatchFlags,
*(this->device->getNEODevice()));
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -1363,6 +1363,12 @@ template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamilyImmediate<gfxCoreFamily>::setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
    // Selects the flush entry points used by this immediate command list.
    //
    // When L0GfxCoreHelper::useImmediateComputeFlushTask() returns true, computeFlushMethod is pointed at
    // flushImmediateRegularTask and flushImmediateTaskMethod is set to forward to
    // CommandStreamReceiver::flushImmediateTaskStateless (heapless state init enabled) or to
    // CommandStreamReceiver::flushImmediateTask (otherwise). When it returns false both members are
    // left untouched.
    if (!L0GfxCoreHelper::useImmediateComputeFlushTask(rootDeviceEnvironment)) {
        return;
    }

    this->computeFlushMethod = &CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediateRegularTask;

    // The forwarding callables capture `this` and read this->csr only when they are invoked.
    // Binding the pointer value here instead would freeze whatever csr happened to be (possibly
    // nullptr) at setup time and silently ignore a CSR assigned or replaced afterwards.
    if (this->isHeaplessStateInitEnabled()) {
        this->flushImmediateTaskMethod = [this](NEO::LinearStream &immediateStream,
                                                size_t immediateStreamStart,
                                                NEO::ImmediateDispatchFlags &flags,
                                                NEO::Device &neoDevice) {
            return this->csr->flushImmediateTaskStateless(immediateStream, immediateStreamStart, flags, neoDevice);
        };
    } else {
        this->flushImmediateTaskMethod = [this](NEO::LinearStream &immediateStream,
                                                size_t immediateStreamStart,
                                                NEO::ImmediateDispatchFlags &flags,
                                                NEO::Device &neoDevice) {
            return this->csr->flushImmediateTask(immediateStream, immediateStreamStart, flags, neoDevice);
        };
    }
}

View File

@@ -276,7 +276,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
bool uncachedMocsKernel = isKernelUncachedMocsRequired(kernelImp->getKernelRequiresUncachedMocs());
this->requiresQueueUncachedMocs |= kernelImp->getKernelRequiresQueueUncachedMocs();
updateStreamProperties(*kernel, launchParams.isCooperative, threadGroupDimensions, launchParams.isIndirect);
if (this->heaplessStateInitEnabled == false) {
updateStreamProperties(*kernel, launchParams.isCooperative, threadGroupDimensions, launchParams.isIndirect);
}
auto localMemSize = static_cast<uint32_t>(neoDevice->getDeviceInfo().localMemSize);
auto slmTotalSize = kernelImp->getSlmTotalSize();

View File

@@ -101,6 +101,7 @@ ze_result_t CommandQueueImp::initialize(bool copyOnly, bool isInternal, bool imm
this->dispatchCmdListBatchBufferAsPrimary = L0GfxCoreHelper::dispatchCmdListBatchBufferAsPrimary(rootDeviceEnvironment, !immediateCmdListQueue);
auto &compilerProductHelper = rootDeviceEnvironment.getHelper<NEO::CompilerProductHelper>();
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled();
}
return returnValue;
}

View File

@@ -93,6 +93,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
bool dispatchCmdListBatchBufferAsPrimary = false;
bool internalQueueForImmediateCommandList = false;
bool heaplessModeEnabled = false;
bool heaplessStateInitEnabled = false;
};
using CommandQueueAllocatorFn = CommandQueue *(*)(Device *device, NEO::CommandStreamReceiver *csr,

View File

@@ -23,7 +23,6 @@ template <GFXCORE_FAMILY gfxCoreFamily>
struct CommandQueueHw : public CommandQueueImp {
using CommandQueueImp::CommandQueueImp;
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
ze_result_t createFence(const ze_fence_desc_t *desc, ze_fence_handle_t *phFence) override;
ze_result_t executeCommandLists(uint32_t numCommandLists,
ze_command_list_handle_t *phCommandLists,
@@ -117,6 +116,13 @@ struct CommandQueueHw : public CommandQueueImp {
bool lockScratchController = false;
};
ze_result_t executeCommandListsRegularHeapless(CommandListExecutionContext &ctx,
uint32_t numCommandLists,
ze_command_list_handle_t *commandListHandles,
ze_fence_handle_t hFence,
ze_event_handle_t hSignalEvent, uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents);
MOCKABLE_VIRTUAL ze_result_t executeCommandListsRegular(CommandListExecutionContext &ctx,
uint32_t numCommandLists,
ze_command_list_handle_t *commandListHandles,
@@ -139,6 +145,11 @@ struct CommandQueueHw : public CommandQueueImp {
ze_fence_handle_t hFence);
MOCKABLE_VIRTUAL bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const;
inline size_t estimateLinearStreamSizeInitial(CommandListExecutionContext &ctx);
size_t estimateStreamSizeForExecuteCommandListsRegularHeapless(CommandListExecutionContext &ctx,
uint32_t numCommandLists,
ze_command_list_handle_t *commandListHandles,
bool instructionCacheFlushRequired,
bool stateCacheFlushRequired);
inline size_t estimateCommandListSecondaryStart(CommandList *commandList);
inline size_t estimateCommandListPrimaryStart(bool required);
inline size_t estimateCommandListResidencySize(CommandList *commandList);

View File

@@ -21,6 +21,7 @@
#include "shared/source/device/device.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/helpers/definitions/command_encoder_args.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/heap_base_address_model.h"
@@ -96,7 +97,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
csr->isProgramActivePartitionConfigRequired(),
performMigration,
csr->getSipSentFlag()};
ctx.globalInit |= ctx.isDebugEnabled && !this->commandQueueDebugCmdsProgrammed && device->getL0Debugger();
ctx.globalInit |= ctx.isDebugEnabled &&
!this->commandQueueDebugCmdsProgrammed &&
device->getL0Debugger();
ctx.lockScratchController = lockScratchController;
this->startingCmdBuffer = &this->commandStream;
@@ -104,6 +108,9 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
if (this->isCopyOnlyCommandQueue) {
ret = this->executeCommandListsCopyOnly(ctx, numCommandLists, phCommandLists, hFence, nullptr, 0, nullptr);
} else if (this->heaplessStateInitEnabled) {
ctx.globalInit = false;
ret = this->executeCommandListsRegularHeapless(ctx, numCommandLists, phCommandLists, hFence, nullptr, 0, nullptr);
} else {
ret = this->executeCommandListsRegular(ctx, numCommandLists, phCommandLists, hFence, nullptr, 0, nullptr);
}
@@ -115,6 +122,119 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
return ret;
}
// Executes a batch of regular command lists on the path taken when heapless state init is enabled.
//
// Builds a child command stream that contains, in order: an optional instruction cache flush, an
// optional state cache flush, one batch-buffer-start per command list, the closing return
// batch-buffer-start and the task-count post-sync; the stream is then submitted and the queue
// completion helper is run.
//
// ctx                - execution context; fields are both read and updated here.
// numCommandLists    - number of entries in commandListHandles.
// commandListHandles - handles of the command lists to execute.
// hFence             - fence handle forwarded to the setup and task-count helpers.
// hSignalEvent, numWaitEvents, phWaitEvents - not referenced anywhere in this body.
//
// Returns the error from makeAlignedChildStreamAndSetGpuBase() if the child stream cannot be set
// up, otherwise the value produced by handleSubmissionAndCompletionResults().
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
CommandListExecutionContext &ctx,
uint32_t numCommandLists,
ze_command_list_handle_t *commandListHandles,
ze_fence_handle_t hFence,
ze_event_handle_t hSignalEvent, uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) {
this->setupCmdListsAndContextParams(ctx, commandListHandles, numCommandLists, hFence);
ctx.isDirectSubmissionEnabled = this->csr->isDirectSubmissionEnabled();
bool instructionCacheFlushRequired = this->csr->isInstructionCacheFlushRequired();
auto neoDevice = this->device->getNEODevice();
// A state cache flush is requested only when a bindless heaps helper exists and it reports dirty
// state for this CSR's OS context id.
bool stateCacheFlushRequired = neoDevice->getBindlessHeapsHelper() ? neoDevice->getBindlessHeapsHelper()->getStateDirtyForContext(this->csr->getOsContext().getContextId()) : false;
// Default-constructed, so it owns no mutex yet; it is handed by reference to the residency
// handler below and released automatically when this function returns.
std::unique_lock<std::mutex> lockForIndirect;
if (ctx.hasIndirectAccess) {
handleIndirectAllocationResidency(ctx.unifiedMemoryControls, lockForIndirect, ctx.isMigrationRequested);
}
// The estimate call also accumulates ctx.spaceForResidency, which is consumed by the reserve()
// on the next line, so the two statements must stay in this order.
size_t linearStreamSizeEstimate = this->estimateStreamSizeForExecuteCommandListsRegularHeapless(ctx, numCommandLists, commandListHandles, instructionCacheFlushRequired, stateCacheFlushRequired);
this->csr->getResidencyAllocations().reserve(ctx.spaceForResidency);
NEO::LinearStream child(nullptr);
if (const auto ret = this->makeAlignedChildStreamAndSetGpuBase(child, linearStreamSizeEstimate); ret != ZE_RESULT_SUCCESS) {
return ret;
}
this->makeCsrTagAllocationResident();
if (instructionCacheFlushRequired) {
NEO::MemorySynchronizationCommands<GfxFamily>::addInstructionCacheFlush(child);
this->csr->setInstructionCacheFlushed();
}
if (stateCacheFlushRequired) {
NEO::MemorySynchronizationCommands<GfxFamily>::addStateCacheFlush(child, neoDevice->getRootDeviceEnvironment());
// The flush has been encoded, so drop the dirty marker for this context.
neoDevice->getBindlessHeapsHelper()->clearStateDirtyForContext(this->csr->getOsContext().getContextId());
}
// Per command list: remember the child stream GPU position, patch the list with the scratch patch
// address, chain it into the child stream, then run the prefetch, assert and printf bookkeeping.
for (auto i = 0u; i < numCommandLists; ++i) {
auto commandList = CommandList::fromHandle(commandListHandles[i]);
ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition();
this->patchCommands(*commandList, this->csr->getScratchSpaceController()->getScratchPatchAddress());
this->programOneCmdListBatchBufferStart(commandList, child, ctx);
this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList);
if (commandList->hasKernelWithAssert()) {
// Marks that at least one executed list contains a kernel with an assert; the previous
// value returned by exchange() is intentionally ignored.
cmdListWithAssertExecuted.exchange(true);
}
this->collectPrintfContentsFromCommandsList(commandList);
}
this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);
this->programLastCommandListReturnBbStart(child, ctx);
this->assignCsrTaskCountToFenceIfAvailable(hFence);
this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child);
auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child);
this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired);
this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false);
auto completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer();
// Submission and completion outcomes are folded into a single API result.
ze_result_t retVal = this->handleSubmissionAndCompletionResults(submitResult, completionResult);
this->csr->getResidencyAllocations().clear();
return retVal;
}
// Estimates how many bytes of the child command stream executeCommandListsRegularHeapless() needs.
//
// The estimate is the sum of:
//   - the closing command: a batch-buffer-start when direct submission is enabled (plus two
//     MI_LOAD_REGISTER_REG commands when the DirectSubmissionRelaxedOrdering debug flag equals 1),
//     otherwise a batch-buffer-end;
//   - the secondary start estimate of every command list;
//   - the barrier with post-sync operation, when ctx.isDispatchTaskCountPostSyncRequired is set;
//   - the instruction cache flush, when instructionCacheFlushRequired is set;
//   - the cache flush size used for the state cache flush, when stateCacheFlushRequired is set.
//
// Side effect: ctx.spaceForResidency is increased by the residency size estimate of every command
// list, so callers must read it only after this function has run.
//
// ctx                           - execution context; isDirectSubmissionEnabled and
//                                 isDispatchTaskCountPostSyncRequired are read, spaceForResidency is updated.
// numCommandLists               - number of entries in commandListHandles.
// commandListHandles            - handles of the command lists that will be executed.
// instructionCacheFlushRequired - whether room for an instruction cache flush is needed.
// stateCacheFlushRequired       - whether room for a state cache flush is needed.
//
// Returns the estimated size in bytes.
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateStreamSizeForExecuteCommandListsRegularHeapless(CommandListExecutionContext &ctx,
                                                                                              uint32_t numCommandLists,
                                                                                              ze_command_list_handle_t *commandListHandles,
                                                                                              bool instructionCacheFlushRequired,
                                                                                              bool stateCacheFlushRequired) {
    size_t linearStreamSizeEstimate = 0u;

    if (ctx.isDirectSubmissionEnabled) {
        linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
        if (NEO::debugManager.flags.DirectSubmissionRelaxedOrdering.get() == 1) {
            linearStreamSizeEstimate += 2 * sizeof(typename GfxFamily::MI_LOAD_REGISTER_REG);
        }
    } else {
        linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferEndSize();
    }

    for (uint32_t i = 0; i < numCommandLists; i++) {
        auto cmdList = CommandList::fromHandle(commandListHandles[i]);
        linearStreamSizeEstimate += estimateCommandListSecondaryStart(cmdList);
        // Residency space is tracked on the context, not in the returned stream size.
        ctx.spaceForResidency += estimateCommandListResidencySize(cmdList);
    }

    if (ctx.isDispatchTaskCountPostSyncRequired) {
        linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(this->device->getNEODevice()->getRootDeviceEnvironment(), false);
    }

    if (instructionCacheFlushRequired) {
        linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForInstructionCacheFlush();
    }

    if (stateCacheFlushRequired) {
        linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForFullCacheFlush();
    }

    return linearStreamSizeEstimate;
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
CommandListExecutionContext &ctx,

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2020-2023 Intel Corporation
# Copyright (C) 2020-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -16,6 +16,7 @@ set(L0_MOCKS_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/mock_cmdlist.cpp
${CMAKE_CURRENT_SOURCE_DIR}/mock_cmdqueue.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_cmdqueue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/mock_cmdqueue_handle_indirect_allocs.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_context.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_device.h
${CMAKE_CURRENT_SOURCE_DIR}/mock_device_for_spirv.h

View File

@@ -70,6 +70,8 @@ template <GFXCORE_FAMILY gfxCoreFamily>
struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
using BaseClass = ::L0::CommandQueueHw<gfxCoreFamily>;
using BaseClass::commandStream;
using BaseClass::estimateStreamSizeForExecuteCommandListsRegularHeapless;
using BaseClass::executeCommandListsRegularHeapless;
using BaseClass::prepareAndSubmitBatchBuffer;
using BaseClass::printfKernelContainer;
using BaseClass::startingCmdBuffer;
@@ -79,6 +81,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
using L0::CommandQueue::doubleSbaWa;
using L0::CommandQueue::frontEndStateTracking;
using L0::CommandQueue::heaplessModeEnabled;
using L0::CommandQueue::heaplessStateInitEnabled;
using L0::CommandQueue::internalQueueForImmediateCommandList;
using L0::CommandQueue::internalUsage;
using L0::CommandQueue::partitionCount;

View File

@@ -0,0 +1,33 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
namespace L0 {
namespace ult {
// Mock command queue that counts calls to the indirect-allocation residency hooks.
//
// handleIndirectAllocationResidency() bumps its counter and then forwards to MockCommandQueueHw.
// makeResidentAndMigrate() only bumps its counter; it does not forward to the base class.
template <GFXCORE_FAMILY gfxCoreFamily>
class MockCommandQueueHandleIndirectAllocs : public MockCommandQueueHw<gfxCoreFamily> {
  public:
    using typename MockCommandQueueHw<gfxCoreFamily>::CommandListExecutionContext;
    using MockCommandQueueHw<gfxCoreFamily>::executeCommandListsRegular;
    using MockCommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless;

    // Call counters; public so they can be read from outside the mock.
    uint32_t handleIndirectAllocationResidencyCalledTimes = 0;
    uint32_t makeResidentAndMigrateCalledTimes = 0;

    MockCommandQueueHandleIndirectAllocs(L0::Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc)
        : MockCommandQueueHw<gfxCoreFamily>(device, csr, desc) {}

    void handleIndirectAllocationResidency(UnifiedMemoryControls unifiedMemoryControls, std::unique_lock<std::mutex> &lockForIndirect, bool performMigration) override {
        this->handleIndirectAllocationResidencyCalledTimes += 1;
        MockCommandQueueHw<gfxCoreFamily>::handleIndirectAllocationResidency(unifiedMemoryControls, lockForIndirect, performMigration);
    }

    void makeResidentAndMigrate(bool performMigration, const NEO::ResidencyContainer &residencyContainer) override {
        this->makeResidentAndMigrateCalledTimes += 1;
    }
};
} // namespace ult
} // namespace L0

View File

@@ -245,16 +245,16 @@ HWTEST2_F(CommandListImmediateWithAssert, givenKernelWithAssertWhenAppendedToAsy
desc.pNext = 0;
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.isFlushTaskSubmissionEnabled = true;
cmdList.callBaseExecute = true;
cmdList.cmdListType = CommandList::CommandListType::typeImmediate;
cmdList.isSyncModeQueue = false;
cmdList.setCsr(&csr);
result = cmdList.initialize(device, NEO::EngineGroupType::renderCompute, 0u);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
cmdList.setCsr(&csr);
cmdList.getCmdContainer().setImmediateCmdListCsr(&csr);
auto commandQueue = CommandQueue::create(productFamily, device, &csr, &desc, cmdList.isCopyOnly(), false, false, result);
cmdList.cmdQImmediate = commandQueue;
@@ -271,18 +271,17 @@ HWTEST2_F(CommandListImmediateWithAssert, givenKernelWithAssertWhenAppendedToAsy
HWTEST2_F(CommandListImmediateWithAssert, givenKernelWithAssertWhenAppendedToSynchronousImmCommandListThenAssertIsChecked, IsAtLeastSkl) {
ze_result_t result;
auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
Mock<KernelImp> kernel;
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.isFlushTaskSubmissionEnabled = true;
cmdList.callBaseExecute = true;
cmdList.cmdListType = CommandList::CommandListType::typeImmediate;
cmdList.isSyncModeQueue = true;
cmdList.setCsr(&csr);
result = cmdList.initialize(device, NEO::EngineGroupType::renderCompute, 0u);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
cmdList.setCsr(&csr);
cmdList.getCmdContainer().setImmediateCmdListCsr(&csr);
ze_command_queue_desc_t desc = {};
@@ -317,10 +316,10 @@ HWTEST2_F(CommandListImmediateWithAssert, givenKernelWithAssertWhenAppendToSynch
cmdList.callBaseExecute = true;
cmdList.cmdListType = CommandList::CommandListType::typeImmediate;
cmdList.isSyncModeQueue = true;
cmdList.setCsr(&csr);
result = cmdList.initialize(device, NEO::EngineGroupType::renderCompute, 0u);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
cmdList.setCsr(&csr);
cmdList.getCmdContainer().setImmediateCmdListCsr(&csr);
ze_command_queue_desc_t desc = {};
desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;

View File

@@ -705,11 +705,13 @@ HWTEST2_F(CmdlistAppendLaunchKernelTests,
auto commandList = std::make_unique<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>>();
ASSERT_NE(nullptr, commandList);
commandList->isFlushTaskSubmissionEnabled = true;
ze_result_t ret = commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ASSERT_EQ(ZE_RESULT_SUCCESS, ret);
commandList->device = device;
commandList->cmdListType = CommandList::CommandListType::typeImmediate;
commandList->csr = device->getNEODevice()->getDefaultEngine().commandStreamReceiver;
ze_result_t ret = commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
ASSERT_EQ(ZE_RESULT_SUCCESS, ret);
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
MockCommandQueueHw<gfxCoreFamily> mockCommandQueue(device, device->getNEODevice()->getDefaultEngine().commandStreamReceiver, &desc);

View File

@@ -24,6 +24,7 @@
#include "level_zero/core/test/unit_tests/fixtures/module_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue_handle_indirect_allocs.h"
#include "level_zero/core/test/unit_tests/mocks/mock_memory_manager.h"
#include "level_zero/core/test/unit_tests/mocks/mock_module.h"
@@ -787,23 +788,6 @@ HWTEST2_F(EngineInstancedDeviceExecuteTests, givenEngineInstancedDeviceWithFabri
commandQueue->destroy();
}
// Mock command queue that counts calls to the indirect-allocation residency hooks.
//
// handleIndirectAllocationResidency() bumps its counter and then forwards to MockCommandQueueHw.
// makeResidentAndMigrate() only bumps its counter; it does not forward to the base class.
template <GFXCORE_FAMILY gfxCoreFamily>
class MockCommandQueueHandleIndirectAllocs : public MockCommandQueueHw<gfxCoreFamily> {
  public:
    using typename MockCommandQueueHw<gfxCoreFamily>::CommandListExecutionContext;
    using MockCommandQueueHw<gfxCoreFamily>::executeCommandListsRegular;

    // Call counters; public so they can be read from outside the mock.
    uint32_t handleIndirectAllocationResidencyCalledTimes = 0;
    uint32_t makeResidentAndMigrateCalledTimes = 0;

    MockCommandQueueHandleIndirectAllocs(L0::Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc)
        : MockCommandQueueHw<gfxCoreFamily>(device, csr, desc) {}

    void handleIndirectAllocationResidency(UnifiedMemoryControls unifiedMemoryControls, std::unique_lock<std::mutex> &lockForIndirect, bool performMigration) override {
        this->handleIndirectAllocationResidencyCalledTimes += 1;
        MockCommandQueueHw<gfxCoreFamily>::handleIndirectAllocationResidency(unifiedMemoryControls, lockForIndirect, performMigration);
    }

    void makeResidentAndMigrate(bool performMigration, const NEO::ResidencyContainer &residencyContainer) override {
        this->makeResidentAndMigrateCalledTimes += 1;
    }
};
HWTEST2_F(CommandQueueIndirectAllocations, givenCtxWithIndirectAccessWhenExecutingCommandListImmediateWithFlushTaskThenHandleIndirectAccessCalled, IsAtLeastSkl) {
ze_command_queue_desc_t desc = {};
auto csr = neoDevice->getDefaultEngine().commandStreamReceiver;

View File

@@ -580,6 +580,15 @@ class CommandStreamReceiverMock : public CommandStreamReceiver {
return cs;
}
// Stub override: ignores every argument and returns a value-initialized CompletionStamp.
CompletionStamp flushImmediateTaskStateless(LinearStream &immediateCommandStream,
                                            size_t immediateCommandStreamStart,
                                            ImmediateDispatchFlags &dispatchFlags,
                                            Device &device) override {
    return {};
}
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override {
CompletionStamp cs = {};

View File

@@ -18,6 +18,7 @@
#include "shared/source/gmm_helper/client_context/gmm_client_context.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/cache_policy.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_walk_order.h"
@@ -284,33 +285,37 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
}
}
if (container.isAnyHeapDirty() ||
args.requiresUncachedMocs) {
bool heaplessStateInitEnabled = rootDeviceEnvironment.getHelper<CompilerProductHelper>().isHeaplessStateInitEnabled();
PipeControlArgs syncArgs;
syncArgs.dcFlushEnable = args.dcFlushEnable;
MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);
STATE_BASE_ADDRESS sbaCmd;
auto gmmHelper = container.getDevice()->getGmmHelper();
uint32_t statelessMocsIndex =
args.requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false);
auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true);
if (heaplessStateInitEnabled == false) {
if (container.isAnyHeapDirty() ||
args.requiresUncachedMocs) {
EncodeStateBaseAddressArgs<Family> encodeStateBaseAddressArgs = {
&container, // container
sbaCmd, // sbaCmd
nullptr, // sbaProperties
statelessMocsIndex, // statelessMocsIndex
l1CachePolicy, // l1CachePolicy
l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
args.partitionCount > 1, // multiOsContextCapable
args.isRcs, // isRcs
container.doubleSbaWaRef(), // doubleSbaWa
heaplessModeEnabled, // heaplessModeEnabled
};
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
container.setDirtyStateForAllHeaps(false);
PipeControlArgs syncArgs;
syncArgs.dcFlushEnable = args.dcFlushEnable;
MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);
STATE_BASE_ADDRESS sbaCmd;
auto gmmHelper = container.getDevice()->getGmmHelper();
uint32_t statelessMocsIndex =
args.requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false);
auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true);
EncodeStateBaseAddressArgs<Family> encodeStateBaseAddressArgs = {
&container, // container
sbaCmd, // sbaCmd
nullptr, // sbaProperties
statelessMocsIndex, // statelessMocsIndex
l1CachePolicy, // l1CachePolicy
l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
args.partitionCount > 1, // multiOsContextCapable
args.isRcs, // isRcs
container.doubleSbaWaRef(), // doubleSbaWa
heaplessModeEnabled, // heaplessModeEnabled
};
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
container.setDirtyStateForAllHeaps(false);
}
}
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {

View File

@@ -105,6 +105,9 @@ class CommandStreamReceiver {
virtual CompletionStamp flushBcsTask(LinearStream &commandStream, size_t commandStreamStart, const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) = 0;
virtual CompletionStamp flushImmediateTask(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
ImmediateDispatchFlags &dispatchFlags, Device &device) = 0;
virtual CompletionStamp flushImmediateTaskStateless(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
ImmediateDispatchFlags &dispatchFlags, Device &device) = 0;
virtual SubmissionStatus sendRenderStateCacheFlush() = 0;
virtual bool flushBatchedSubmissions() = 0;

View File

@@ -70,6 +70,9 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
CompletionStamp flushImmediateTask(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
ImmediateDispatchFlags &dispatchFlags, Device &device) override;
CompletionStamp flushImmediateTaskStateless(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
ImmediateDispatchFlags &dispatchFlags, Device &device) override;
void forcePipeControl(NEO::LinearStream &commandStreamCSR);
bool flushBatchedSubmissions() override;
@@ -305,6 +308,9 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
LinearStream &immediateCommandStream,
ImmediateFlushData &flushData);
void handleImmediateFlushStatelessAllocationsResidency(size_t csrEstimatedSize,
LinearStream &csrStream);
inline void handleImmediateFlushAllocationsResidency(Device &device,
LinearStream &immediateCommandStream,
ImmediateFlushData &flushData,

View File

@@ -24,6 +24,19 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTaskStateless(
return {};
}
// Placeholder implementation: the body does nothing except trip UNRECOVERABLE_IF unconditionally,
// so this variant must never be reached at run time. None of the arguments are used.
template <typename GfxFamily>
CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushImmediateTaskStateless(LinearStream &immediateCommandStream,
                                                                                size_t immediateCommandStreamStart,
                                                                                ImmediateDispatchFlags &dispatchFlags,
                                                                                Device &device) {
    UNRECOVERABLE_IF(true);
    // Only present to satisfy the return type.
    return CompletionStamp{};
}
// Placeholder implementation: neither argument is used and the body only trips UNRECOVERABLE_IF
// unconditionally, so this variant must never be reached at run time.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushStatelessAllocationsResidency(size_t csrEstimatedSize,
LinearStream &csrStream) {
UNRECOVERABLE_IF(true);
}
template <typename GfxFamily>
SubmissionStatus CommandStreamReceiverHw<GfxFamily>::programHeaplessProlog(Device &device) {
UNRECOVERABLE_IF(true);

View File

@@ -358,9 +358,9 @@ size_t MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(bool tl
template <typename GfxFamily>
size_t MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(const RootDeviceEnvironment &rootDeviceEnvironment, bool tlbInvalidationRequired) {
size_t size = getSizeForSingleBarrier(tlbInvalidationRequired) +
getSizeForBarrierWa(rootDeviceEnvironment) +
getSizeForSingleAdditionalSynchronization(rootDeviceEnvironment);
size_t size = getSizeForSingleBarrier(tlbInvalidationRequired);
size += getSizeForBarrierWa(rootDeviceEnvironment);
size += getSizeForSingleAdditionalSynchronization(rootDeviceEnvironment);
return size;
}

View File

@@ -61,6 +61,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::getScratchSpaceController;
using BaseClass::handleAllocationsResidencyForHeaplessProlog;
using BaseClass::handleFrontEndStateTransition;
using BaseClass::handleImmediateFlushStatelessAllocationsResidency;
using BaseClass::handlePipelineSelectStateTransition;
using BaseClass::handleStateBaseAddressStateTransition;
using BaseClass::heapStorageRequiresRecyclingTag;

View File

@@ -58,3 +58,13 @@ CompletionStamp MockCommandStreamReceiver::flushImmediateTask(
CompletionStamp stamp = {taskCount, taskLevel, flushStamp->peekStamp()};
return stamp;
}
// Mock flush: ignores all arguments, advances taskCount by one and returns a stamp built from the
// updated taskCount, the current taskLevel and the current flush stamp value.
CompletionStamp MockCommandStreamReceiver::flushImmediateTaskStateless(LinearStream &immediateCommandStream,
                                                                       size_t immediateCommandStreamStart,
                                                                       ImmediateDispatchFlags &dispatchFlags,
                                                                       Device &device) {
    ++taskCount;
    return {taskCount, taskLevel, flushStamp->peekStamp()};
}

View File

@@ -135,6 +135,12 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
ImmediateDispatchFlags &dispatchFlags,
Device &device) override;
CompletionStamp flushImmediateTaskStateless(
LinearStream &immediateCommandStream,
size_t immediateCommandStreamStart,
ImmediateDispatchFlags &dispatchFlags,
Device &device) override;
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override;

View File

@@ -5092,4 +5092,6 @@ HWTEST_F(CommandStreamReceiverHwHeaplessTest, whenHeaplessCommandStreamReceiverF
EXPECT_ANY_THROW(csr->handleAllocationsResidencyForflushTaskStateless(nullptr, nullptr, nullptr));
EXPECT_ANY_THROW(csr->getRequiredCmdStreamHeaplessSize(csr->recordedDispatchFlags, *pDevice));
EXPECT_ANY_THROW(csr->getRequiredCmdStreamHeaplessSizeAligned(csr->recordedDispatchFlags, *pDevice));
EXPECT_ANY_THROW(csr->flushImmediateTaskStateless(commandStream, 0, csr->recordedImmediateDispatchFlags, *pDevice));
EXPECT_ANY_THROW(csr->handleImmediateFlushStatelessAllocationsResidency(0, commandStream));
}